In [1]:
import numpy as np
import pandas as pd
import cobra.io

In [2]:
# Reference flux values, one row per entry of the CSV (first column used as index).
reference_flux = pd.read_csv(
    'reference_fluxes_GB032gfg.csv',
    index_col=0,
)
# Metabolic model stored as COBRA JSON.
model = cobra.io.load_json_model('reduced_iJN1463.json')

In [3]:
# Flattened reference fluxes and the stoichiometric matrix of the model.
v_star = reference_flux.values.flatten()
N = cobra.util.create_stoichiometric_matrix(model)

# Flip the columns whose reference flux is negative, then make every
# reference flux non-negative, so N @ v_star is unchanged.
N[:, v_star < 0] *= -1
v_star = np.abs(v_star)

In [4]:
# Column indices of N that repeat an earlier, identical column.
# np.unique(..., return_index=True) reports the first occurrence of every
# distinct column; whatever is left over is a duplicate. np.setdiff1d returns
# a sorted integer array (also when there are no duplicates), unlike a
# set-difference whose order is arbitrary and whose empty result is float64.
np.setdiff1d(np.arange(N.shape[1]),
             np.unique(N, return_index=True, axis=1)[1])

array([545])

In [5]:
# Display the model summary (name, metabolite/reaction counts, objective, compartments).
model

0,1
Name,iJN1463
Memory address,0x07f06c4148090
Number of metabolites,665
Number of reactions,716
Number of groups,0
Objective expression,1.0*BIOMASS_KT2440_WT3 - 1.0*BIOMASS_KT2440_WT3_reverse_d86d5
Compartments,"cytosol, periplasm, extracellular space"


In [6]:
# All metabolite ids, used to test whether a cytosolic counterpart exists.
met_labels = {met.id for met in model.metabolites}

# Positions (in model.metabolites) of the metabolites returned by the
# compartment query for 'e'.
external_mets = {
    model.metabolites.index(met)
    for met in model.metabolites.query('e', 'compartment')
}

# For each of those metabolites, the position of its '_c' counterpart when
# that id exists in the model, otherwise of its '_p' counterpart.
boundary_mets = {
    model.metabolites.index(
        met.id[:-2] + '_c' if (met.id[:-2] + '_c') in met_labels else met.id[:-2] + '_p'
    )
    for met in model.metabolites.query('e', 'compartment')
}

# Positions of the metabolite ids listed in the measured-metabolites CSV.
measured_mets = {
    model.metabolites.index(model.metabolites.get_by_id(mid))
    for mid in pd.read_csv('temp_data/measured_mets.csv').values.flatten()
}

# Positions of the reactants of the biomass reaction.
biomass_mets = {
    model.metabolites.index(met)
    for met in model.reactions.BIOMASS_KT2440_WT3.reactants
}

protected_mets = set().union(external_mets, boundary_mets, measured_mets, biomass_mets)

# biomass_prods = {model.metabolites.index(m) for m in model.reactions.BIOMASS_KT2440_WT3.products}

# Positions of the exchange reactions plus the biomass reaction.
protected_rxns = {
    model.reactions.index(rxn)
    for rxn in model.exchanges + [model.reactions.BIOMASS_KT2440_WT3]
}

print(len(protected_mets))
print(len(protected_rxns))

181
36


In [7]:
# Original problem size: m metabolites (rows of N), n reactions (columns of N).
m, n = N.shape

# Bookkeeping matrices start as identities; the compression steps later delete
# columns of L and rows of R.
L, R = np.eye(m), np.eye(n)

# The reference flux must be a steady state of the starting system.
assert np.allclose(L @ N @ R @ v_star, 0)

In [8]:
# Rinv expands compressed fluxes back to the original reactions
# (the compression step checks Rinv @ R @ v_star == v_star).
# It must be an independent array: the compression step updates Rinv in place
# (Rinv[:, rxn] += ...), and a plain `Rinv = R` alias would make that update
# write into R as well.
Rinv = R.copy()

assert np.allclose(L @ N @ R @ v_star, 0)
assert np.allclose((Rinv @ R), np.eye(n))

In [9]:
# Position of the biomass reaction in model.reactions. remove_single_stream_metabolite
# compares the reaction it is about to delete against this index (44).
model.reactions.index(model.reactions.BIOMASS_KT2440_WT3)

44

In [10]:
def remove_single_stream_metabolite(met_to_remove, direction, L, N, R, Rinv,
                                    biomass_rxn_index=44):
    """Eliminate one metabolite together with one of its two reactions.

    The metabolite in row ``met_to_remove`` of ``N`` must have exactly one
    reaction with stoichiometric sign ``-direction``. The first reaction with
    sign ``direction`` is deleted and its column is folded into the remaining
    reaction, scaled so that the metabolite cancels out.

    Parameters
    ----------
    met_to_remove : int
        Row index of the metabolite in the current (compressed) ``N``.
    direction : int
        ``1`` to delete the reaction with a positive coefficient for the
        metabolite, ``-1`` to delete the one with a negative coefficient.
    L, N, R, Rinv : numpy.ndarray
        Current bookkeeping matrices. ``L`` has one row per original
        metabolite and ``R`` one column per original reaction. None of the
        inputs is modified; updated copies are returned.
    biomass_rxn_index : int, optional
        Original index of the reaction that must never be deleted
        (defaults to 44, the value previously hard-coded here).

    Returns
    -------
    tuple
        The updated ``(L, N, R, Rinv)``.

    Raises
    ------
    AssertionError
        If the reaction to delete is the biomass reaction, if the metabolite
        does not have exactly one reaction of the opposite sign, or if the
        steady-state / reconstruction checks against the notebook-level
        ``v_star`` fail.
    IndexError
        If no reaction has sign ``direction`` for the metabolite.
    """
    # Original problem size, read from the bookkeeping matrices rather than
    # from notebook globals: only columns of L and rows of R are ever deleted.
    n_original_mets = L.shape[0]
    n_original_rxns = R.shape[1]

    N_sign = np.sign(N).astype(int)

    rxn_to_remove = np.where(N_sign[met_to_remove] == direction)[0][0]
    rxn_to_remove_origindex = (R @ np.arange(n_original_rxns))[rxn_to_remove].astype(int)
    assert rxn_to_remove_origindex != biomass_rxn_index, \
        f"Biomass removed with met {(np.arange(n_original_mets) @ L)[met_to_remove]}"

    rxns_to_group = np.where(N_sign[met_to_remove] == -direction)[0]
    assert len(rxns_to_group) == 1
    rxn = rxns_to_group[0]

    # Work on copies so the caller's arrays (which may alias one another,
    # e.g. Rinv is R) are never written to.
    N = N.copy()
    Rinv = Rinv.copy()

    # Scale the deleted reaction so that the metabolite's coefficient in the
    # merged column becomes zero, and record the same combination in Rinv.
    fraction = -N[met_to_remove, rxn] / N[met_to_remove, rxn_to_remove]
    N[:, rxn] += fraction * N[:, rxn_to_remove]
    Rinv[:, rxn] += fraction * Rinv[:, rxn_to_remove]

    # Drop the deleted reaction from every reaction-indexed axis ...
    N = np.delete(N, rxn_to_remove, 1)
    Rinv = np.delete(Rinv, rxn_to_remove, 1)
    R = np.delete(R, rxn_to_remove, 0)

    # ... and the eliminated metabolite from every metabolite-indexed axis.
    N = np.delete(N, met_to_remove, 0)
    L = np.delete(L, met_to_remove, 1)

    # v_star is the notebook-level reference flux vector.
    assert np.allclose(L @ N @ R @ v_star, 0)
    assert np.allclose(Rinv @ R @ v_star, v_star)
    return L, N, R, Rinv


def get_duplicate_metabolites(N):
    """Return the row indices of ``N`` that repeat an earlier, identical row.

    The first occurrence of every distinct row is kept; all later copies are
    reported. The result is a sorted integer array (empty, but still of
    integer dtype, when there are no duplicates) so it can be used directly
    as an index.
    """
    first_occurrence = np.unique(N, return_index=True, axis=0)[1]
    return np.setdiff1d(np.arange(N.shape[0]), first_occurrence)


def get_duplicate_reactions(N):
    """Return the column indices of ``N`` that repeat an earlier, identical column.

    The first occurrence of every distinct column is kept; all later copies
    are reported. The result is a sorted integer array (empty, but still of
    integer dtype, when there are no duplicates) so it can be used directly
    as an index.
    """
    first_occurrence = np.unique(N, return_index=True, axis=1)[1]
    return np.setdiff1d(np.arange(N.shape[1]), first_occurrence)


# def remove_duplicate_reaction(rxn_to_remove, v_star, L, N, R, Rinv):
#     rxn = [i for i, row in enumerate(N.T) if (row == N[:, rxn_to_remove]).all() and i != rxn_to_remove][0]

#     rxn_to_remove_origindex = (R @ np.arange(n))[rxn_to_remove].astype(int)
#     rxn_origindex = (R @ np.arange(n))[rxn].astype(int)

#     N_test = np.delete(N, rxn_to_remove, 1)
#     R_test = np.delete(R, rxn_to_remove, 0)

#     R_test[rxn] *= (v_star[rxn_to_remove_origindex] + v_star[rxn_origindex]) / v_star[rxn_origindex]

#     assert np.allclose(L @ N_test @ R_test @ Rsign @ v_star, 0)

In [11]:
# Protected metabolite mask: True for original metabolites that may be
# eliminated, False for the protected ones.
mask = np.ones(m, dtype=bool)
mask[list(protected_mets)] = False

from tqdm import tqdm

# np.isin needs a sequence; passing the set itself would not test membership.
protected_rxn_list = list(protected_rxns)

# The context manager closes the progress bar when the loop finishes, so its
# final line is not flushed into the output of a later cell.
with tqdm() as t:
    while True:

        t.set_postfix(shape=f"{N.shape}")
        t.update(1)

        N_sign = np.sign(N)

        # True for rows of the current N whose original metabolite is unprotected.
        Lmask = mask @ L.astype(bool)

        # Unprotected metabolites with exactly one positive and one negative
        # coefficient, i.e. exactly two reactions of opposite sign.
        single_stream_mets = np.where(
            ((N_sign == 1).sum(1) == 1) &
            ((N_sign == -1).sum(1) == 1) &
            Lmask)[0]

        # Original index of every reaction column still present in N.
        rxn_origindex = (R @ np.arange(n)).astype(int)

        removed = False
        # First try deleting the positive-coefficient reaction (direction 1),
        # then the negative-coefficient one (direction -1); a candidate is
        # only usable if the reaction to delete is not protected.
        for direction in (1, -1):
            rxn_to_remove = np.array(
                [rxn_origindex[np.where(N_sign[met_to_remove] == direction)[0][0]]
                 for met_to_remove in single_stream_mets],
                dtype=int)
            candidates = single_stream_mets[~np.isin(rxn_to_remove, protected_rxn_list)]

            # Test for "any candidate left" by length: np.any() on an index
            # array is False when the only candidate is metabolite index 0.
            if len(candidates) > 0:
                L, N, R, Rinv = remove_single_stream_metabolite(
                    candidates[0], direction, L, N, R, Rinv)
                removed = True
                break

        if removed:
            continue

        duplicated_mets = get_duplicate_metabolites(N)

        if len(duplicated_mets) != 0:
            # Remove protected mets
            duplicated_mets = duplicated_mets[Lmask[duplicated_mets]]

        if len(duplicated_mets) != 0:
            N = np.delete(N, duplicated_mets[0], 0)
            L = np.delete(L, duplicated_mets[0], 1)
            assert np.allclose(L @ N @ R @ v_star, 0)
            continue

    #     duplicated_rxns = get_duplicate_reactions(N)

    #     if len(duplicated_rxns) != 0:
    #         N = np.delete(N, duplicated_rxns[0], 1)
    #         R = np.delete(R, duplicated_rxns[0], 0)
    #         Rinv = np.delete(Rinv, duplicated_rxns[0], 1)

    #         assert np.allclose(L @ N @ R @ Rsign @ v_star, 0)
    #         continue

        # Nothing left to eliminate.
        break

360it [00:43, 18.45it/s, shape=(305, 361)]

In [12]:
# Shape of the compressed stoichiometric matrix: (metabolites, reactions).
N.shape

(305, 361)

In [13]:
# First ten metabolites still present after compression; np.arange(m) @ L
# gives the original index of every remaining row.
[model.metabolites[i] for i in (np.arange(m) @ L).astype(int)[:10]]

[<Metabolite 10fthf_c at 0x7f06c4148150>,
 <Metabolite 13dpg_c at 0x7f06c4148410>,
 <Metabolite 2obut_c at 0x7f06c4148810>,
 <Metabolite 3mob_c at 0x7f06c4148750>,
 <Metabolite 34dhbz_c at 0x7f06c4148e90>,
 <Metabolite 34dhbz_e at 0x7f06c4148f90>,
 <Metabolite 25aics_c at 0x7f06c414cd90>,
 <Metabolite 3dhsk_c at 0x7f06c414cf10>,
 <Metabolite 26dap__M_c at 0x7f06c4154050>,
 <Metabolite 3pg_c at 0x7f06c4154190>]

In [14]:
# Reaction ids in model order.
rxn_labels_original = pd.Series([rxn.id for rxn in model.reactions])

# Ids of the reactions / metabolites that remain after compression, in the
# order of the columns / rows of the compressed N. R @ arange(n) and
# arange(m) @ L give the original index of each remaining entry.
rxn_labels_compressed = (
    rxn_labels_original
    .iloc[(R @ np.arange(n)).astype(int)]
    .reset_index(drop=True)
)
met_labels_compressed = (
    pd.Series([met.id for met in model.metabolites])
    .iloc[(np.arange(m) @ L).astype(int)]
    .reset_index(drop=True)
)

In [15]:
# Show the ids of the reactions that remain after compression.
rxn_labels_compressed

0        2AACLPGT160
1        2AACLPGT180
2      2AACLPPEAT160
3      2AACLPPEAT180
4         2DHGLCNtex
           ...      
356      EX_acon_T_e
357    s7p_transport
358         EX_s7p_e
359           PHETA1
360       CLPNS140pp
Length: 361, dtype: object

In [16]:
def iter_matches():
    """Yield ``(compressed_label, original_id)`` pairs.

    For every reaction of the notebook-level ``model``, one pair is produced
    per nonzero entry in the matching row of ``Rinv``; the compressed label is
    looked up in ``rxn_labels_compressed`` by column position.
    """
    for original_pos, original_rxn in enumerate(model.reactions):
        nonzero_columns = np.nonzero(Rinv[original_pos])[0]
        for compressed_pos in nonzero_columns:
            yield (rxn_labels_compressed.loc[compressed_pos], original_rxn.id)

In [17]:
# Table of (compressed reaction label, original reaction id) pairs produced by iter_matches().
df = pd.DataFrame(iter_matches(), columns=['compressed', 'original'])

In [18]:
# Number of distinct compressed reaction labels that appear in the mapping table.
len(df.compressed.unique())

361

In [19]:
import pickle

# Everything needed to reuse the compression: the bookkeeping matrices, the
# original dimensions, the label tables and the compressed reference flux.
compression_state = {
    'L': L,
    'N': N,
    'R': R,
    'Rinv': Rinv,
    'm': m,
    'n': n,
    'matches': df,
    'v_star': R @ v_star,
    'met_labels': met_labels_compressed,
    'rxn_labels': rxn_labels_compressed,
}

with open('compression_3.p', 'wb') as f:
    pickle.dump(compression_state, f)

In [20]:
# N[:, rxn] += N[:, rxn_to_remove]
# Rinv[:, rxn] += Rinv[:, rxn_to_remove]

# N = np.delete(N, rxn_to_remove, 1)
# Rinv = np.delete(Rinv, rxn_to_remove, 1)    
# R = np.delete(R, rxn_to_remove, 0)

# assert np.allclose(L @ N @ R @ Rsign @ v_star, 0)
# assert np.allclose(Rinv @ R @ Rsign @ v_star, Rsign @ v_star)

361it [01:00, 18.45it/s, shape=(305, 361)]