## This notebook handles the pre-processing needed to modify thereaction bounds in a cobra model to facilitate running Eflux2

In [4]:
import os
import sys
#sys.path.append('../src')
#sys.path.append('/Users/mahs128/Repos/CONCERTO')
#from concerto.helpers.load_model_from_git import load_model_from_git
from eflux2 import EFlux2
import cobra
import pandas as pd
import numpy as np
import gurobipy
from fba_utils import get_flux_bounds, convert_transcriptomics_to_enzyme_activity

### 1. Obtain optimized fluxes for reactions corresponding to criteria used to select the reference strain, and (reasonably) tight bounds on the fluxes for all other reactions in the model

In [5]:
os.getcwd()

'/Users/zuck016/Projects/PPI/syn_bmca/data'

In [6]:
# Load SBML model
# syn_model = cobra.io.read_sbml_model('../models/iJB785_w_sucrose_transport.xml')
syn_model = cobra.io.read_sbml_model('../models/syn_elong.xml')

In [4]:
# Get model with optimized fluxes for reactions of interest, 
# and a dataframe of flux bounds for all other reactions
reactions_of_interest = ['EX_sucr_e', 'EX_co2_e', 'BIOMASS__1']
opt_model, flux_bounds = get_flux_bounds(model=syn_model, rxns_of_interest=reactions_of_interest)

Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmp6ikyt689.lp
Reading time = 0.00 seconds
: 928 rows, 2181 columns, 9211 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmpj6qghcl4.lp
Reading time = 0.00 seconds
: 928 rows, 2181 columns, 9211 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmpgtz489o1.lp
Reading time = 0.01 seconds
: 928 rows, 2181 columns, 9211 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmpwpy6dkxh.lp
Reading time = 0.00 seconds
: 928 rows, 2181 columns, 9211 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572s

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  flux_bounds[c][r] = 0


In [4]:
flux_bounds

Unnamed: 0,minimum,maximum
EX_gln__L_e,0.000000,0.000000
EX_hco3_e,-1.990000,-1.989450
EX_mn2_e,0.000000,0.000000
EX_arg__L_e,0.000000,0.000000
ADPT,0.000006,0.000006
...,...,...
ZDS,0.000000,0.000339
RDXRr,0.000000,0.000000
MPTSS,0.000000,0.000000
NOR,0.000000,0.000339


In [5]:
check_1a = flux_bounds[(flux_bounds > 0) & (flux_bounds < 1e-10)]
check_1a.any()

minimum    False
maximum    False
dtype: bool

In [6]:
check_1b = flux_bounds[(flux_bounds > -1e-10) & (flux_bounds < 0)]
check_1b.any()

minimum    False
maximum    False
dtype: bool

In [8]:
for c in flux_bounds.columns:
    for r in flux_bounds.index:
        if (flux_bounds[c][r] > -1e-10) and (flux_bounds[c][r] < 1e-10):
            flux_bounds[c][r] = 0

flux_bounds

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  flux_bounds[c][r] = 0


Unnamed: 0,minimum,maximum
EX_gln__L_e,0.000000,0.000000
EX_hco3_e,-1.990000,-1.989450
EX_mn2_e,0.000000,0.000000
EX_arg__L_e,0.000000,0.000000
ADPT,0.000006,0.000006
...,...,...
ZDS,0.000000,0.000339
RDXRr,0.000000,0.000000
MPTSS,0.000000,0.000000
NOR,0.000000,0.000339


In [9]:
check_2a = flux_bounds[(flux_bounds > 0) & (flux_bounds < 1e-10)]
check_2a.any()

minimum    False
maximum    False
dtype: bool

In [10]:
check_2b = flux_bounds[(flux_bounds > -1e-10) & (flux_bounds < 0)]
check_2b.any()

minimum    False
maximum    False
dtype: bool

In [11]:
all_rxns = syn_model.reactions
print("# of reactions in the model:", len(all_rxns))

# of reactions in the model: 1090


In [12]:
zero_flux_from_FVA = [i for i in flux_bounds.index[(flux_bounds.minimum==0) & (flux_bounds.maximum==0)]]
print("# of 0-flux reactions from FVA:", len(zero_flux_from_FVA))
print("Percent of fluxes in the model with of 0-values from FVA:", round(100*len(zero_flux_from_FVA)/len(all_rxns),2),"%")

# of 0-flux reactions from FVA: 271
Percent of fluxes in the model with of 0-values from FVA: 24.86 %


In [13]:
neg_flux_from_FVA = [i for i in flux_bounds.index[(flux_bounds.minimum<0) & (flux_bounds.maximum<=0)]]
print("# of negative-flux reactions from FVA:", len(neg_flux_from_FVA))
print("Percent of fluxes in the model with of negative-values from FVA:", round(100*len(neg_flux_from_FVA)/len(all_rxns),2),"%")

# of negative-flux reactions from FVA: 66
Percent of fluxes in the model with of negative-values from FVA: 6.06 %


In [14]:
partneg_flux_from_FVA = [i for i in flux_bounds.index[(flux_bounds.minimum<0) & (flux_bounds.maximum>0)]]
print("# of negative-lower-bound-flux reactions from FVA:", len(partneg_flux_from_FVA))
print("Percent of fluxes in the model with of negative-lower-bound from FVA:", round(100*len(partneg_flux_from_FVA)/len(all_rxns),2),"%")

# of negative-lower-bound-flux reactions from FVA: 53
Percent of fluxes in the model with of negative-lower-bound from FVA: 4.86 %


In [15]:
pos_flux_from_FVA = [i for i in flux_bounds.index[(flux_bounds.minimum>=0) & (flux_bounds.maximum>0)]]
print("# of positive-flux reactions from FVA:", len(pos_flux_from_FVA))
print("Percent of fluxes in the model with of positive-values from FVA:", round(100*len(pos_flux_from_FVA)/len(all_rxns),2),"%")

# of positive-flux reactions from FVA: 697
Percent of fluxes in the model with of positive-values from FVA: 63.94 %


In [18]:
print("All-zero")
display(flux_bounds.loc[zero_flux_from_FVA])
print("Min of minimum: ", flux_bounds.loc[zero_flux_from_FVA].minimum.min())
print("Max of minimum: ", flux_bounds.loc[zero_flux_from_FVA].minimum.max())
print("Min of maximum: ", flux_bounds.loc[zero_flux_from_FVA].maximum.min())
print("Max of maximum: ", flux_bounds.loc[zero_flux_from_FVA].maximum.max())

print("------")

print("All negative")
display(flux_bounds.loc[neg_flux_from_FVA])
print("Min of minimum: ", flux_bounds.loc[neg_flux_from_FVA].minimum.min())
print("Max of minimum: ", flux_bounds.loc[neg_flux_from_FVA].minimum.max())
print("Min of maximum: ", flux_bounds.loc[neg_flux_from_FVA].maximum.min())
print("Max of maximum: ", flux_bounds.loc[neg_flux_from_FVA].maximum.max())

print("------")

print("part negative")
display(flux_bounds.loc[partneg_flux_from_FVA])
print("Min of minimum: ", flux_bounds.loc[partneg_flux_from_FVA].minimum.min())
print("Max of minimum: ", flux_bounds.loc[partneg_flux_from_FVA].minimum.max())
print("Min of maximum: ", flux_bounds.loc[partneg_flux_from_FVA].maximum.min())
print("Max of maximum: ", flux_bounds.loc[partneg_flux_from_FVA].maximum.max())

print("------")

print("all positive")
display(flux_bounds.loc[pos_flux_from_FVA])
print("Min of minimum: ", flux_bounds.loc[pos_flux_from_FVA].minimum.min())
print("Max of minimum: ", flux_bounds.loc[pos_flux_from_FVA].minimum.max())
print("Min of maximum: ", flux_bounds.loc[pos_flux_from_FVA].maximum.min())
print("Max of maximum: ", flux_bounds.loc[pos_flux_from_FVA].maximum.max())


# [i for i in all_rxns if i not in (zero_flux_from_FVA + neg_flux_from_FVA + partneg_flux_from_FVA + pos_flux_from_FVA)]

# len(zero_flux_from_FVA) + len(neg_flux_from_FVA) + len(partneg_flux_from_FVA) + len(pos_flux_from_FVA)

All-zero


Unnamed: 0,minimum,maximum
EX_gln__L_e,0.0,0.0
EX_mn2_e,0.0,0.0
EX_arg__L_e,0.0,0.0
AOXPBDC,0.0,0.0
CBMD,0.0,0.0
...,...,...
MOADSUx,0.0,0.0
BM_CARB,0.0,0.0
RDXRr,0.0,0.0
MPTSS,0.0,0.0


Min of minimum:  0.0
Max of minimum:  0.0
Min of maximum:  0.0
Max of maximum:  0.0
------
All negative


Unnamed: 0,minimum,maximum
EX_hco3_e,-1.990000e+00,-1.989450
H2Otu_syn,-5.000000e+02,0.000000
HSDy,-1.853885e-02,0.000000
ASPTA,-5.861985e+02,-0.096128
ACKr,-2.648258e-01,0.000000
...,...,...
NDPK6,-2.847819e-04,0.000000
TYRTA,-7.255589e-03,-0.007256
Ktu,-8.339140e-08,0.000000
EX_no2_e,-4.061676e-01,0.000000


Min of minimum:  -1000.0
Max of minimum:  -6.4147234298795134e-09
Min of maximum:  -2.316694352323797
Max of maximum:  0.0
------
part negative


Unnamed: 0,minimum,maximum
ORNTA,-0.012535,0.012759
HSTPT,-999.995373,1000.0
TALA,-999.046534,999.145724
HEMEAS_1,-1000.0,1000.0
O2tpp,-53.446334,45.808232
H2Otex,-98.095114,100.0
PPK,-1000.0,1000.0
PPK2,-1000.0,1000.0
GLYDHDA,-586.125309,1000.0
O2tex,-53.446334,45.808232


Min of minimum:  -1000.0
Max of minimum:  -0.00013755882582812753
Min of maximum:  0.0012236581351194934
Max of maximum:  1000.0
------
all positive


Unnamed: 0,minimum,maximum
ADPT,0.000006,0.000006
O2tcx,0.022902,5.025126
DNTPPA,0.000004,0.000004
BIOMASS_PIGMENTS,0.000000,0.000062
DTMPK,0.000285,0.000285
...,...,...
BM_PROTEIN,0.000000,0.027556
FACOAL140i,0.000000,999.663261
GLYCOR_nadp,0.000000,1000.000000
ZDS,0.000000,0.000339


Min of minimum:  0.0
Max of minimum:  4.557540267089984
Min of maximum:  6.4147234191125335e-09
Max of maximum:  1000.0


In [19]:
zero_neg_overlap = [i for i in zero_flux_from_FVA if i in neg_flux_from_FVA]
print("zero_neg_overlap:", zero_neg_overlap)

zero_partneg_overlap = [i for i in zero_flux_from_FVA if i in partneg_flux_from_FVA]
print("zero_partneg_overlap:", zero_partneg_overlap)

zero_pos_overlap = [i for i in zero_flux_from_FVA if i in pos_flux_from_FVA]
print("zero_pos_overlap:", zero_pos_overlap)

neg_partneg_overlap = [i for i in neg_flux_from_FVA if i in partneg_flux_from_FVA]
print("neg_partneg_overlap:", neg_partneg_overlap)

neg_pos_overlap = [i for i in neg_flux_from_FVA if i in pos_flux_from_FVA]
print("neg_pos_overlap:", neg_pos_overlap)

partneg_pos_overlap = [i for i in partneg_flux_from_FVA if i in pos_flux_from_FVA]
print("partneg_pos_overlap:", partneg_pos_overlap)

# (zero_flux_from_FVA + neg_flux_from_FVA + partneg_flux_from_FVA + pos_flux_from_FVA)
# [i for i in all_rxns if i not in (zero_flux_from_FVA + neg_flux_from_FVA + partneg_flux_from_FVA + pos_flux_from_FVA)]

zero_neg_overlap: []
zero_partneg_overlap: []
zero_pos_overlap: []
neg_partneg_overlap: []
neg_pos_overlap: []
partneg_pos_overlap: []


In [20]:
some_zero = zero_flux_from_FVA + neg_flux_from_FVA + partneg_flux_from_FVA + pos_flux_from_FVA
len(some_zero)

1087

In [21]:
missed_rxn = [r for r in flux_bounds.index if r not in some_zero]
flux_bounds.loc[missed_rxn]

Unnamed: 0,minimum,maximum


 ### 2. Next, convert the transcriptomics data to enzyme data (Take the min of subunits, take the sum of isozymes)

In [7]:
# Load transciptomics data
transcriptomics_fname = "processed_data/cleaned_transcriptomics.csv"
transcriptomics_df = pd.read_csv(transcriptomics_fname, index_col="Label")
transcriptomics_df.head()

Unnamed: 0_level_0,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SYNPCC7942_RS00005,16290,17761,15101,14567,13967,12653,7016,8855,8689
SYNPCC7942_RS00010,7185,7502,6565,10086,7546,7705,3747,6670,6960
SYNPCC7942_RS00015,24176,26304,22781,23926,21306,20412,13440,17253,17053
SYNPCC7942_RS00020,35119,35145,25895,37701,34175,30569,24835,39769,30280
SYNPCC7942_RS00025,6891,7803,6607,6347,6284,6844,3326,4146,3180


In [8]:
# Set reference strain
ref_rep = transcriptomics_df.columns[8] # 'Se_axen_d8_3'
ref_rep

'Se_axen_d8_3'

# Function to create dictionary of reactions to isozyme sets (corresponding genes from gene reaction rules)
def get_gpr_dict(model):
    """ Returns the gene reaction rule (GPR) for each reaction in the model."""
    # Parse GPR into a dict containing isozymes (separated by 'or')
    # Each isozyme has a set of subunits (separated by 'and')
    gpr_dict = dict()
    for r in model.reactions:
        if r.gene_reaction_rule:
            isozymes = set()
            for isozyme in [isozyme.strip('() ') for isozyme in r.gene_reaction_rule.split(' or ')]:
                isozymes.add(frozenset(gene.strip('() ') for gene in isozyme.split(' and ')))
            gpr_dict[r] = isozymes
    
    return gpr_dict



gpr_dict = get_gpr_dict(syn_model)
display(gpr_dict)

# Check which genes (listed in observed transcriptomics data) are missing from the cobra model
# list_of_gprs_in_model = [str(syn_model.reactions.get_by_id(r.id).gpr).split(" and ") for r in syn_model.reactions]
list_of_gprs_in_model = [g.id for g in syn_model.genes]

genes_not_in_model = [g for g in transcriptomics_df.index if g not in list_of_gprs_in_model]
genes_not_in_data = [g for g in list_of_gprs_in_model if g not in transcriptomics_df.index]
genes_in_model_and_data = [g for g in list_of_gprs_in_model if g in transcriptomics_df.index]


print('# of genes in model:', len(list_of_gprs_in_model))
print('# of genes in data:', len(transcriptomics_df.index))
print('# of genes_not_in_model:', len(genes_not_in_model))
print('# of genes_not_in_data:', len(genes_not_in_data))
print('# of genes_in_model_and_data:', len(genes_in_model_and_data))

print('# of reactions in model:', len([r.id for r in syn_model.reactions]))

genes_not_in_data

# Code shared by Jeremy in Teams chat...
from cobra import Reaction, Gene

def gene_expression_to_enzyme_activity(gpr: dict[Reaction, list[list[Gene]]], expression: dict[Gene, float]):
    """Map gene expression to enzyme activity
    inputs:
        gpr: dictionary of reactions (keys) to list of list of genes (values) for the correpsonding gene reaction rule
        expression: dictionary of gene names (keys) to values from [likely] observed transcriptomics data
    outputs:
        enzyme_activity: dictionary of reactions (keys) to corresponding isozyme activity from observed data (value)
    """
    
    enzyme_activity = {}
    for rxn in syn_model.reactions:
      # Initialize enzyme_activity for this reaction to 0-value
      # Note: 0-value preserved IF this reaction doesn't have any genes in its gene reaction rule
      # Obvious example: Exchange/transport reactions don't have corresponding genes in their reaction rule, so the 0-value is preserved
      enzyme_activity[rxn] = 0.0
      
      if rxn in gpr: # ensure rxn has a gene_reaction_rule defined
        for isozyme in gpr[rxn]:
          # Initialize isozyme_activity for this isozyme to infinity
          # Note: infinity-value is preserved IF this isozyme is not present in the observed transcriptomics data
          isozyme_activity = np.inf
          for gene in isozyme:

            if gene in expression: # temporary fix: ensure gene is included in observed data
              
              isozyme_activity = np.min([isozyme_activity, expression[gene]])
          enzyme_activity[rxn] += isozyme_activity
    return enzyme_activity

# Function to convert transciptomics data to enzyme activity
def convert_transcriptomics_to_enzyme_activity(transcriptomics_data: pd.DataFrame, gpr: dict[Reaction, list[list[Gene]]]):
    """Convert transcriptomics data to enzyme activity
    inputs:
        transcriptomics_data: dataframe of transcriptomics data
        gpr: dictionary of reactions (keys) to list of list of genes (values) for the correpsonding gene reaction rule
    outputs:
        enzyme_activity_df: dataframe of enzyme activity converted from transcriptomics data
    """

    # Initialize empty dataframe
    enzyme_activity_df = pd.DataFrame()

    # Loop through each strain to convert each column of transcriptomics data
    for this_strain in transcriptomics_data.columns:
        # Create dict of genes and corresponding float values using trancsciptomics data
        expression_dict = {g: transcriptomics_data.loc[g][this_strain] for g in transcriptomics_data.index}
        expr_dict_keys = [kz for kz in expression_dict.keys()]

        # Run the gene expression to enzyme activity converter for this_strain
        enzyme_activity_dict = gene_expression_to_enzyme_activity(gpr, expression_dict)

        # Initialize empty dataframe 
        if this_strain == transcriptomics_data.columns[0]:
            # Use enzyme_activity_dict keys as the index
            enzyme_activity_df = enzyme_activity_df.reindex(enzyme_activity_dict.keys())
            # Add reaction ID column
            enzyme_activity_df['Reaction_ID'] = [k.id for k in enzyme_activity_dict.keys()]
        
        # Add enzymze_activity to dataframe
        enzyme_activity_df[this_strain] = enzyme_activity_dict
    
    return enzyme_activity_df


In [9]:
# Run enzyme activity converter for all strains in transcriptomics_df
all_enzyme_activity_df = convert_transcriptomics_to_enzyme_activity(transcriptomics_df, syn_model)
display(all_enzyme_activity_df)

Unnamed: 0,Reaction_ID,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
EX_gln__L_e: gln__L_e -->,EX_gln__L_e,,,,,,,,,
EX_hco3_e: hco3_e <=>,EX_hco3_e,,,,,,,,,
EX_mn2_e: mn2_e <=>,EX_mn2_e,,,,,,,,,
EX_arg__L_e: arg__L_e -->,EX_arg__L_e,,,,,,,,,
ADPT: ade_c + prpp_c <=> amp_c + ppi_c,ADPT,inf,inf,inf,inf,inf,inf,inf,inf,inf
...,...,...,...,...,...,...,...,...,...,...
ZDS: h_c + nadph_c + o2_c + zcarote_c --> 2.0 h2o_c + nadp_c + norsp_c,ZDS,inf,inf,inf,inf,inf,inf,inf,inf,inf
RDXRr: nadh_c + 2.0 rdxo_c <=> h_c + nad_c + 2.0 rdxr_c,RDXRr,inf,inf,inf,inf,inf,inf,inf,inf,inf
MPTSS: atp_c + h_c + moadcoo_c --> moadamp_c + ppi_c,MPTSS,inf,inf,inf,inf,inf,inf,inf,inf,inf
NOR: h_c + nadph_c + norsp_c + o2_c --> 2.0 h2o_c + lyc_c + nadp_c,NOR,inf,inf,inf,inf,inf,inf,inf,inf,inf


In [15]:
[r.id for r in syn_model.reactions if not r.gene_reaction_rule]

['EX_gln__L_e',
 'EX_hco3_e',
 'EX_mn2_e',
 'EX_arg__L_e',
 'O2tcx',
 'AOXPBDC',
 'DNTPPA',
 'CBMD',
 'BIOMASS_PIGMENTS',
 'H2Otu_syn',
 'BIOMASS__1',
 'ATPM',
 '5DOAN',
 'Htex',
 'PQH2tum',
 'O2tu',
 'O2tpp',
 'H2Otex',
 'SK_for_c',
 'EX_mg2_e',
 'APRAUR',
 'O2tex',
 'DNMPPA',
 'G5SADs',
 'EX_ptrc_e',
 'BIOMASS_CELL_WALL',
 'RB15BPtcx',
 'GLXO3r',
 'RZ5PP',
 '3PGtcx',
 'BIOMASS_COFACTORS',
 'DM_h2_c',
 'OMCDC',
 'Htcx',
 'H2Otpp',
 'HISTP',
 'EX_ca2_e',
 'CYRDAR',
 'BIOMASS_MEM_LIPIDS',
 'HCO3tcx',
 'BIOMASS_PROTEIN',
 'EX_nh4_e',
 'BIOMASS_RNA',
 'DM_dialurate_c',
 'CO2tex',
 'USHD2',
 'DM_co_c',
 'BIOMASS_CARB',
 'EX_fe2_e',
 'PQH2tcm',
 '2PGLYCtcx',
 'EX_cu2_e',
 'DM_5drib_c',
 'SK_amylose_c',
 'EX_k_e',
 'SK_14glucan_c',
 'SK_glycogen_c',
 'EX_h2o_e',
 'EX_o2_e',
 'EX_co2_e',
 'EX_leu__L_e',
 'EX_cobalt2_e',
 'EX_no3_e',
 'EX_zn2_e',
 'EX_fe3_e',
 'EX_so4_e',
 'EX_spmd_e',
 'EX_mobd_e',
 'EX_ni2_e',
 'EX_na1_e',
 'EX_cynt_e',
 'EX_h_e',
 'FE3tex',
 'FE2tex',
 'NI2tex',
 'MNtex',
 

In [19]:
from fba_utils import prepare_data_for_bmca

metab_fname = "processed_data/metabolomics.csv"
metab_df = pd.read_csv(metab_fname, index_col="Sample")
metab_df_axenic_col_names = [c for c in metab_df.columns if "ax" in c]
rxns_wo_gpr = [r.id for r in syn_model.reactions if not r.gene_reaction_rule]
all_conditions = list(metab_df.columns)
prepared_enzyme_activity_data = prepare_data_for_bmca(all_conditions, all_enzyme_activity_df.set_index('Reaction_ID'), unmapped_variables=rxns_wo_gpr)
prepared_enzyme_activity_data.to_csv('processed_data/prepared_enzyme_activity_data.csv')


In [27]:
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#### Prepare and Export enzyme activity df for use in other processes ####
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 

# ==> Specific to S. elongatus axenic experiment cases

### Add missing columns from metabolomics data 
# Load metabolomics data & get column names
metab_fname = "processed_data/metabolomics.csv"
metab_df = pd.read_csv(metab_fname, index_col="Sample")
metab_df_axenic_col_names = [c for c in metab_df.columns if "ax" in c]

# Create new df by adding new columns for missing column names
# Note: preserve NaN entries for rows of enzyme activity data that are all NaN
full_enzyme_activity_df = pd.DataFrame(columns=metab_df_axenic_col_names)
enz_col_names = all_enzyme_activity_df.columns
for col in metab_df_axenic_col_names:
    if col not in enz_col_names:
        full_enzyme_activity_df[col] = np.inf
    else:
        full_enzyme_activity_df[col] = all_enzyme_activity_df[col].copy()
    for row in all_enzyme_activity_df.index:
        if all(all_enzyme_activity_df.loc[row].isna()):
            full_enzyme_activity_df.loc[row, col] = np.nan

full_enzyme_activity_df.index=all_enzyme_activity_df['Reaction_ID'].copy()

display(full_enzyme_activity_df['Se_axen_d4_1'].dropna().min())
display(full_enzyme_activity_df)
# all_enzyme_activity_df.to_csv("processed_data/all_enzyme_activity.csv")

75.0

Unnamed: 0_level_0,Se_axen_d1_1,Se_axen_d1_2,Se_axen_d1_3,Se_axen_d2_1,Se_axen_d2_2,Se_axen_d2_3,Se_axen_d3_1,Se_axen_d3_2,Se_axen_d3_3,Se_axen_d4_1,...,Se_axen_d6_3,Se_axen_d7_1,Se_axen_d7_2,Se_axen_d7_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3,Se_axen_d9_1,Se_axen_d9_2,Se_axen_d9_3
Reaction_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EX_gln__L_e,,,,,,,,,,,...,,inf,inf,inf,,,,inf,inf,inf
EX_hco3_e,,,,,,,,,,,...,,inf,inf,inf,,,,inf,inf,inf
EX_mn2_e,,,,,,,,,,,...,,inf,inf,inf,,,,inf,inf,inf
EX_arg__L_e,,,,,,,,,,,...,,inf,inf,inf,,,,inf,inf,inf
ADPT,,,,,,,,,,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZDS,,,,,,,,,,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
RDXRr,,,,,,,,,,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
MPTSS,,,,,,,,,,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
NOR,,,,,,,,,,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf


In [24]:
### Cleanup enzyme activity df
clean_enzyme_activity_df = all_enzyme_activity_df.copy()

# Replace current index with Reaction_ID column
clean_enzyme_activity_df = clean_enzyme_activity_df.set_index('Reaction_ID')

# Drop rows with all 0s: 
clean_enzyme_activity_df = clean_enzyme_activity_df.loc[~(clean_enzyme_activity_df==0).all(axis=1)]

# Drop rows with all infs:
clean_enzyme_activity_df.replace([np.inf, -np.inf], np.nan, inplace=True)
clean_enzyme_activity_df.dropna(how='all', inplace=True)

clean_enzyme_activity_df



Unnamed: 0_level_0,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
Reaction_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [35]:
# Write dataframe to csv
clean_enzyme_activity_df.to_csv('processed_data/cleaned_enzymze_activity_for_syn_elong_model.csv')

clean_enzyme_activity_df.iloc[0:25]

Unnamed: 0_level_0,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
Reaction_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DTMPK,1810.0,2030.0,1568.0,2020.0,1758.0,1736.0,946.0,1376.0,1598.0
ACOATA,4691.0,5294.0,3826.0,4813.0,4716.0,4054.0,1790.0,3524.0,3105.0
3HAD160,1636.0,1826.0,1332.0,1591.0,1476.0,999.0,711.0,1132.0,1436.0
RNDR3,28195.0,26841.0,22287.0,19726.0,17817.0,15050.0,12523.0,10904.0,9458.0
G5SD,20578.0,23123.0,19829.0,25519.0,20047.0,20535.0,12402.0,16955.0,19772.0
HISTDa,22946.0,25799.0,22763.0,29749.0,24686.0,26093.0,15009.0,23037.0,23684.0
TRPS3,3922.0,4500.0,3382.0,4196.0,3584.0,3638.0,2308.0,3407.0,3780.0
DMBZIDS2,11214.0,12125.0,10260.0,11519.0,10167.0,9387.0,5912.0,8199.0,7513.0
HSTPT,17080.0,17425.0,15619.0,17314.0,14256.0,13424.0,8566.0,11774.0,11645.0
ZCAROTDH2,7909.0,8136.0,6940.0,7471.0,7304.0,6960.0,3520.0,5678.0,4576.0


 ### 3. For each condition, normalize enzyme data with respect to the reference strain

In [15]:
# Create dataframe of normalized enzyme activity by scaling realtive to reference strain
enzymze_activity_df = all_enzyme_activity_df.copy()

rep_names = transcriptomics_df.columns
normalized_enzyme_activity_df = enzymze_activity_df[rep_names].div(enzymze_activity_df[ref_rep], axis=0)


# TODO: should these replacements be the original values (0 and inf), or 1s (to indicate same as reference strain)?
# ==> NEITHER: drop rows with all 0 or inf or NaN entries

# Ensure 0-valued or inf entries in enzymze_activity_df are passed into normalized_enzyme_activity_df
# - OR - 
# Ensure 0-valued or inf entries in enzymze_activity_df are passed into normalized_enzyme_activity_df
for col in rep_names:
    for row in enzymze_activity_df.index:
        if (enzymze_activity_df[col][row] == 0.0) or (np.isinf(enzymze_activity_df[col][row])):
            # this_row = enzymze_activity_df['Reaction_ID'][row]
            # normalized_enzyme_activity_df.loc[this_row, col] = enzymze_activity_df[col][row]
            normalized_enzyme_activity_df.loc[row, col] = enzymze_activity_df[col][row]

# Re-index normalized_enzyme_activity_df using reaction-IDs
normalized_enzyme_activity_df['Reaction_ID'] = enzymze_activity_df['Reaction_ID']
normalized_enzyme_activity_df = normalized_enzyme_activity_df.set_index('Reaction_ID')

normalized_enzyme_activity_df.iloc[0:25]

# TODO: add "check_data" function to check validity before passing to pymc

Unnamed: 0_level_0,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
Reaction_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EX_gln__L_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EX_hco3_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EX_mn2_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EX_arg__L_e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ADPT,inf,inf,inf,inf,inf,inf,inf,inf,inf
O2tcx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AOXPBDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DNTPPA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CBMD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BIOMASS_PIGMENTS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
missing_rxns = {r: (r in FVA_object.index) for r in normalized_enzyme_activity_df.index}
{k:v for k,v in missing_rxns.items() if not v}

{'BIOMASS__1': False, 'EX_co2_e': False, 'EX_sucr_e': False}

# ----- STOP HERE ..... for now

### 4. For each condition, multiply the reference bounds by the normalized enzyme data.

In [98]:
# Get relative scaled rates of change in metabolic abundance
normalized_rates_df = rates_df.div(rates_df[ref_rep], axis=0)

In [89]:
# TODO: make this into a function
flux_bounds = {}
for this_strain in normalized_enzyme_activity_df.columns:
    flux_bounds[this_strain] = FVA_object
    for this_rxn in normalized_enzyme_activity_df.index:
        if this_rxn in FVA_object:
            this_factor = normalized_enzyme_activity_df[this_strain][this_rxn]
            flux_bounds[this_strain].loc[this_rxn] = this_factor*FVA_object[this_strain][this_rxn]
        elif this_rxn in missing_rxns:
            # TODO: for sucrose production ('EX_sucr_e'), use normalized calcualted rate from normalized_rates_df
            if rxn=='EX_sucr_e':
                # flux_bounds[this_strain].loc[this_rxn] = pd.Series({'minimum': ....,
                #                                                     'maximum': ....})
                

test_df = flux_bounds['Se_axen_d4_1'].iloc[0:35]
test_df.columns = ['d4_1_min', 'd4_1_max']
test_df.join(flux_bounds[ref_rep].iloc[0:35])

Unnamed: 0,d4_1_min,d4_1_max,minimum,maximum
EX_gln__L_e,0.0,0.0,0.0,0.0
EX_hco3_e,-1.99,-1.99,-1.99,-1.99
EX_mn2_e,0.0,0.0,0.0,0.0
EX_arg__L_e,0.0,0.0,0.0,0.0
ADPT,1.032064e-05,1.032064e-05,1.032064e-05,1.032064e-05
O2tcx,0.02631984,0.02631984,0.02631984,0.02631984
AOXPBDC,1.411151e-06,1.411151e-06,1.411151e-06,1.411151e-06
DNTPPA,4.111581e-06,4.111581e-06,4.111581e-06,4.111581e-06
CBMD,0.0,0.0,0.0,0.0
BIOMASS_PIGMENTS,0.001061867,0.001061867,0.001061867,0.001061867


In [82]:
FVA_object.iloc[0]

minimum    0.0
maximum    0.0
Name: EX_gln__L_e, dtype: float64

### 5. Run FBA using the condition-specific bounds to compute the condition-specific fluxes