## This notebook handles the pre-processing needed to modify thereaction bounds in a cobra model to facilitate running Eflux2

In [1]:
import sys
sys.path.append('../src')
sys.path.append('/Users/mahs128/Repos/CONCERTO')
from concerto.helpers.load_model_from_git import load_model_from_git
from eflux2 import EFlux2
import cobra
import pandas as pd
import numpy as np
import gurobipy

### 1. Run flux variability analysis on the reference strain to get (reasonably) tight bounds on the fluxes.
 


In [2]:
# Load SBML model
syn_model = cobra.io.read_sbml_model('../models/iJB785_w_sucrose_transport.xml')

Set parameter TokenServer to value "leghorn.emsl.pnl.gov"


In [32]:
print(syn_model.objective)

Maximize
1.0*BIOMASS__1 - 1.0*BIOMASS__1_reverse_063c7


In [3]:
# Incorporate reference strain information into the model

# Get metabolic rates
rates_fname = "processed_data/cleaned_metabolomic_abundance_rates.csv"
rates_df = pd.read_csv(rates_fname, index_col="Sample")

# Get transcriptomics
transcriptomics_fname = "processed_data/cleaned_transcriptomics.csv"
transcriptomics_df = pd.read_csv(transcriptomics_fname, index_col="Label")

# Define reference strain
ref_rep = transcriptomics_df.columns[4] # 'Se_axen_d6_2'
ref_rep

'Se_axen_d6_2'

In [4]:
# Run optimization on cobra model to get fluxes for reactions of interest
# sucrose production ('EX_sucr_e') and CO2 uptake ('EX_co2_e') and biomass ('BIOMASS__1')
opt_df = syn_model.optimize().to_frame()
display(opt_df)
display(opt_df.loc['EX_sucr_e'])
display(opt_df.loc['EX_co2_e'])
display(opt_df.loc['BIOMASS__1'])

Unnamed: 0,fluxes,reduced_costs
EX_gln__L_e,0.00000,0.000000e+00
EX_hco3_e,-1.99000,0.000000e+00
EX_mn2_e,0.00000,0.000000e+00
EX_arg__L_e,0.00000,0.000000e+00
ADPT,0.00001,-3.108624e-14
...,...,...
MDDEP3pp,0.00000,0.000000e+00
MDDEP4pp,0.00000,0.000000e+00
MNHNAtpp,0.00000,0.000000e+00
EX_sucr_e,0.01100,0.000000e+00


fluxes           0.011
reduced_costs    0.000
Name: EX_sucr_e, dtype: float64

fluxes          -0.316958
reduced_costs    0.000000
Name: EX_co2_e, dtype: float64

fluxes           5.390187e-02
reduced_costs   -8.526513e-14
Name: BIOMASS__1, dtype: float64

In [13]:
# Specify upper/lower bounds of model for excluded reactions
for rxn in ['EX_sucr_e', 'EX_co2_e', 'BIOMASS__1']:
   syn_model.reactions.get_by_id(rxn).lower_bound = opt_df.loc[rxn, 'fluxes']

   print(rxn, ": ", syn_model.reactions.get_by_id(rxn).lower_bound, ";  ", syn_model.reactions.get_by_id(rxn).upper_bound)

EX_sucr_e :  0.011 ;   1000.0
EX_co2_e :  -0.3169581181768563 ;   1000.0
BIOMASS__1 :  0.05390186774577897 ;   2.0


In [None]:
# Create reactions list exclude reactions of interest: sucrose production ('EX_sucr_e') and CO2 uptake ('EX_co2_e') and biomass ('BIOMASS__1') 
rxn_list = [r.id for r in syn_model.reactions if (r.id not in ['EX_sucr_e', 'EX_co2_e', 'BIOMASS__1'])]

In [17]:
# Compared optimized fluxes to calculated rates of observed metabolic abundances
rates_df.loc['sucr'][ref_rep]

1327111.5

In [7]:
# Run FVA (~30-40 seconds)
FVA_object = cobra.flux_analysis.flux_variability_analysis(model=syn_model, reaction_list=rxn_list, 
                                                           fraction_of_optimum=0.85, processes=8)

Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmpmyv46l2d.lp
Reading time = 0.00 seconds
: 770 rows, 1703 columns, 7655 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmptturazn8.lp
Reading time = 0.00 seconds
: 770 rows, 1703 columns, 7655 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmpvpvtw8b6.lp
Reading time = 0.00 seconds
: 770 rows, 1703 columns, 7655 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572sdtgy2vnxhcljw0000gn/T/tmp217ttvo_.lp
Reading time = 0.00 seconds
: 770 rows, 1703 columns, 7655 nonzeros
Set parameter TokenServer to value "leghorn.emsl.pnl.gov"
Read LP format model from file /var/folders/k9/b8pxky2572s

In [8]:
FVA_object

Unnamed: 0,minimum,maximum
EX_gln__L_e,0.00000,0.000000
EX_hco3_e,-1.99000,-1.989283
EX_mn2_e,0.00000,0.000000
EX_arg__L_e,0.00000,0.000000
ADPT,0.00001,0.000010
...,...,...
MDDEP2pp,0.00000,0.000000
MDDEP3pp,0.00000,71.169417
MDDEP4pp,0.00000,47.446278
MNHNAtpp,0.00000,388.340298


 ### 2. Next, convert the transcriptomics data to enzyme data (Take the min of subunits, take the sum of isozymes)

In [5]:
# Load transciptomics data
transcriptomics_fname = "processed_data/cleaned_transcriptomics.csv"
transcriptomics_df = pd.read_csv(transcriptomics_fname, index_col="Label")
transcriptomics_df.head()

Unnamed: 0_level_0,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SYNPCC7942_RS00005,16290,17761,15101,14567,13967,12653,7016,8855,8689
SYNPCC7942_RS00010,7185,7502,6565,10086,7546,7705,3747,6670,6960
SYNPCC7942_RS00015,24176,26304,22781,23926,21306,20412,13440,17253,17053
SYNPCC7942_RS00020,35119,35145,25895,37701,34175,30569,24835,39769,30280
SYNPCC7942_RS00025,6891,7803,6607,6347,6284,6844,3326,4146,3180


In [7]:
# Set reference strain
ref_rep = transcriptomics_df.columns[4] # 'Se_axen_d6_2'
ref_rep

'Se_axen_d6_2'

In [8]:
# Normalize transcriptomics data w.r.t. reference strain
normalized_transcriptomics_df = transcriptomics_df.div(transcriptomics_df[ref_rep], axis=0)
normalized_transcriptomics_df.head()

Unnamed: 0_level_0,Se_axen_d4_1,Se_axen_d4_2,Se_axen_d4_3,Se_axen_d6_1,Se_axen_d6_2,Se_axen_d6_3,Se_axen_d8_1,Se_axen_d8_2,Se_axen_d8_3
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SYNPCC7942_RS00005,1.166321,1.27164,1.081191,1.042958,1.0,0.905921,0.502327,0.633994,0.622109
SYNPCC7942_RS00010,0.95216,0.994169,0.869997,1.336602,1.0,1.021071,0.496554,0.883912,0.922343
SYNPCC7942_RS00015,1.134704,1.234582,1.069229,1.12297,1.0,0.95804,0.630808,0.809772,0.800385
SYNPCC7942_RS00020,1.027623,1.028383,0.757718,1.103175,1.0,0.894484,0.726701,1.163687,0.886028
SYNPCC7942_RS00025,1.096595,1.241725,1.0514,1.010025,1.0,1.089115,0.529281,0.659771,0.506047


In [47]:
# Snippet of code borrowed from eflux2.py to populate the first inpur argument of the gene_expression_to_enzyme_activity() function

# Parse GPR into a dict containing isozymes (separated by 'or')
# Each isozyme has a set of subunits (separated by 'and')
gpr_dict = dict()
for r in syn_model.reactions:
    if r.gene_reaction_rule:
        temp = set()
        for x in [x.strip('() ') for x in r.gene_reaction_rule.split(' or ')]:
            temp.add(frozenset(y.strip('() ') for y in x.split(' and ')))
        gpr_dict[r] = temp

gpr_dict

{<Reaction ADPT at 0x2b3b99f90>: {frozenset({'Synpcc7942_2454'})},
 <Reaction DTMPK at 0x2b3ba3390>: {frozenset({'SYNPCC7942_RS00470'})},
 <Reaction ORNTA at 0x2b3b9b950>: {frozenset({'Synpcc7942_0034'})},
 <Reaction ACOATA at 0x2b3badd10>: {frozenset({'SYNPCC7942_RS07445'})},
 <Reaction 3HAD160 at 0x2b3ba6a50>: {frozenset({'SYNPCC7942_RS04785'})},
 <Reaction RNDR3 at 0x2b3bb9190>: {frozenset({'SYNPCC7942_RS08210'})},
 <Reaction G5SD at 0x2b3ba56d0>: {frozenset({'SYNPCC7942_RS11385'}),
  frozenset({'SYNPCC7942_RS11510'})},
 <Reaction HISTDa at 0x2b3bbbdd0>: {frozenset({'SYNPCC7942_RS07765'})},
 <Reaction TRPS3 at 0x2b3bc6590>: {frozenset({'SYNPCC7942_RS02095',
             'SYNPCC7942_RS10880'})},
 <Reaction DMBZIDS2 at 0x2b3bb9290>: {frozenset({'SYNPCC7942_RS08460'}),
  frozenset({'SYNPCC7942_RS10185'})},
 <Reaction HSTPT at 0x2b3bc6550>: {frozenset({'SYNPCC7942_RS05290'})},
 <Reaction ZCAROTDH2 at 0x2b3bc7710>: {frozenset({'SYNPCC7942_RS06385',
             'SYNPCC7942_RS07730'})},
 

In [112]:
# Check which genes (listed in observed transcriptomics data) are missing from the cobra model
# list_of_gprs_in_model = [str(syn_model.reactions.get_by_id(r.id).gpr).split(" and ") for r in syn_model.reactions]
list_of_gprs_in_model = [g.id for g in syn_model.genes]

genes_not_in_model = [g for g in transcriptomics_df.index if g not in list_of_gprs_in_model]
genes_not_in_data = [g for g in list_of_gprs_in_model if g not in transcriptomics_df.index]
genes_in_model_and_data = [g for g in list_of_gprs_in_model if g in transcriptomics_df.index]


print('# of genes in model:', len(list_of_gprs_in_model))
print('# of genes in data:', len(transcriptomics_df.index))
print('# of genes_not_in_model:', len(genes_not_in_model))
print('# of genes_not_in_data:', len(genes_not_in_data))
print('# of genes_in_model_and_data:', len(genes_in_model_and_data))

print('# of reactions in model:', len([r.id for r in syn_model.reactions]))

genes_not_in_data

# of genes in model: 786
# of genes in data: 2761
# of genes_not_in_model: 1986
# of genes_not_in_data: 11
# of genes_in_model_and_data: 775
# of reactions in model: 851


['Synpcc7942_2454',
 'Synpcc7942_0034',
 'Synpcc7942_0413',
 'Synpcc7942_1971',
 'Synpcc7942_2107',
 'Synpcc7942_2010',
 'SYNPCC7942_RS05000',
 'Synpcc7942_1478',
 'Synpcc7942_0491',
 'Synpcc7942_0668',
 'ATCC_700927']

In [147]:
# Code shared by Jeremy in Teams chat...
from cobra import Reaction, Gene

def gene_expression_to_enzyme_activity(gpr: dict[Reaction, list[list[Gene]]], expression: dict[Gene, float]):
    """Map gene expression to enzyme activity
    inputs:
        gpr: dictionary of reactions (keys) to list of list of genes (values) for the correpsonding gene reaction rule
        expression: dictionary of gene names (keys) to values from [likely] observed transcriptomics data
    outputs:
        enzyme_activity: dictionary of reactions (keys) to corresponding isozyme activity from observed data (value)
    """
    
    enzyme_activity = {}
    for rxn in syn_model.reactions:
      enzyme_activity[rxn] = 0.0

      if rxn in gpr: # temporary fix: ensure rxn has a gene_reaction_rule defined

        for isozyme in gpr[rxn]:
          isozyme_activity = np.inf
          for gene in isozyme:

            if gene in expression: # temporary fix: ensure gene is included in observed data

              # Old code
              # isozyme_activity = np.min([enzyme_activity, expression[gene]])
              # enzyme_activity[rxn] += isozyme_activity

              # Correction to old code
              isozyme_activity = np.min([isozyme_activity, expression[gene]])
          enzyme_activity[rxn] += isozyme_activity
    return enzyme_activity

In [148]:
# Create dict of genes and corresponding float values using trancsciptomics data
this_strain = ref_rep # use reference strain

expression_dict = {g: transcriptomics_df.loc[g][this_strain] for g in transcriptomics_df.index}
expr_dict_keys = [kz for kz in expression_dict.keys()]
{k: expression_dict[k] for k in expr_dict_keys[0:5]}

{'SYNPCC7942_RS00005': 13967,
 'SYNPCC7942_RS00010': 7546,
 'SYNPCC7942_RS00015': 21306,
 'SYNPCC7942_RS00020': 34175,
 'SYNPCC7942_RS00025': 6284}

In [149]:
# Run the gene expression to enzyme activity converter
enzymze_activity_dict = gene_expression_to_enzyme_activity(gpr_dict, expression_dict)
enzymze_activity_dict

{<Reaction EX_gln__L_e at 0x2b3b98410>: 0.0,
 <Reaction EX_hco3_e at 0x2b3b983d0>: 0.0,
 <Reaction EX_mn2_e at 0x2b3b98ed0>: 0.0,
 <Reaction EX_arg__L_e at 0x2b3b99810>: 0.0,
 <Reaction ADPT at 0x2b3b99f90>: inf,
 <Reaction O2tcx at 0x2b3b9a750>: 0.0,
 <Reaction AOXPBDC at 0x2b3b9b9d0>: 0.0,
 <Reaction DNTPPA at 0x2b3b9b910>: 0.0,
 <Reaction CBMD at 0x2b3ba0890>: 0.0,
 <Reaction BIOMASS_PIGMENTS at 0x2b3ba1590>: 0.0,
 <Reaction H2Otu_syn at 0x2b3ba23d0>: 0.0,
 <Reaction DTMPK at 0x2b3ba3390>: 1758.0,
 <Reaction BIOMASS__1 at 0x2b3ba5a10>: 0.0,
 <Reaction ORNTA at 0x2b3b9b950>: inf,
 <Reaction ATPM at 0x2b3ba6950>: 0.0,
 <Reaction ACOATA at 0x2b3badd10>: 4716.0,
 <Reaction 5DOAN at 0x2b3badcd0>: 0.0,
 <Reaction 3HAD160 at 0x2b3ba6a50>: 1476.0,
 <Reaction RNDR3 at 0x2b3bb9190>: 17817.0,
 <Reaction Htex at 0x2b3b9ba90>: 0.0,
 <Reaction G5SD at 0x2b3ba56d0>: 20047.0,
 <Reaction HISTDa at 0x2b3bbbdd0>: 24686.0,
 <Reaction TRPS3 at 0x2b3bc6590>: 3584.0,
 <Reaction DMBZIDS2 at 0x2b3bb9290>: 1

 ### 3. For each condition, normalize enzyme data with respect to the reference strain

### 4. For each condition, multiply the reference bounds by the normalized enzyme data.

### 5. Run FBA using the condition-specific bounds to compute the condition-specific fluxes