# Using thermodynamic analysis to guide metabolic engineering
DO 12-6-2018  
Using Python 3 and eQuilibrator API  


In [1]:
# Work from the local eQuilibrator API checkout and make its source tree and
# the SBtab package importable.
# NOTE(review): these are machine-specific absolute paths; they must be edited
# before the notebook can run anywhere else.
import os
import sys

os.chdir('/Users/satyakam/Dropbox/work/equilibrator-api-master')

for extra_path in (
    '/Users/satyakam/Dropbox/work/sbtab-0.9.64',
    '/Users/satyakam/Dropbox/work/equilibrator-api-master/src',
):
    sys.path.append(extra_path)


In [2]:
import numpy as np
from numpy import array, eye, log, zeros, matrix
from numpy.linalg import inv, solve
import pandas as pd
from equilibrator_api import Reaction, ComponentContribution, ReactionMatcher, CompoundMatcher, ParseError, Pathway
from equilibrator_api.bounds import Bounds
%matplotlib inline

## Set up translator for KEGG IDs
Note: I set these up as DataFrames because I was troubleshooting an issue with duplicate KEGG IDs. Eventually these should be converted to dictionaries to make the code more readable.

In [3]:
# Build lookup tables that translate between KEGG IDs, long names and
# human-readable abbreviations. All lookup keys are lower-cased for matching.
keggTranslatorDf = pd.read_excel('KEGG_SEED_DO.xls')
kt = keggTranslatorDf #short name for easier typing

# Each table below starts from an explicit .copy() of the selected columns so
# that the column assignments never write into (or warn about) a slice of kt.

# translate KEGG ID to long name
ktn = kt.loc[:, ['KEGG ID(S)', 'PRIMARY NAME']].copy()
ktn['KEGG ID(S)'] = ktn['KEGG ID(S)'].str.lower() # set to lower case for better matching
ktn = ktn.set_index('KEGG ID(S)')

# translate long name to KEGG ID
# the original dictionaries sometimes had trouble with duplicate KEGG IDs. If there are duplicates, make sure to choose the lowest number
ntk = kt.loc[:, ['PRIMARY NAME', 'KEGG ID(S)']].copy().sort_values(by = ['KEGG ID(S)'], ascending = True)
ntk['PRIMARY NAME'] = ntk['PRIMARY NAME'].str.lower() # set to lower case for better matching
ntk = ntk.groupby('PRIMARY NAME').first() # take the first KEGG ID in each group

# translate KEGG ID to abbreviation
kta = kt.loc[:, ['KEGG ID(S)', 'ABBREVIATION']].copy()
kta['KEGG ID(S)'] = kta['KEGG ID(S)'].str.lower() # set to lower case for better matching
kta = kta.set_index('KEGG ID(S)')

# translate abbreviation to KEGG ID
atk = kt.loc[:, ['ABBREVIATION', 'KEGG ID(S)']].copy()
atk['ABBREVIATION'] = atk['ABBREVIATION'].str.lower() # set to lower case for better matching
atk = atk.set_index('ABBREVIATION')

# plain dict version: lower-case abbreviation -> upper-case KEGG ID
# (if an abbreviation occurs more than once, the last row wins)
atkDict = dict(zip(atk.index, atk['KEGG ID(S)'].str.upper()))

## Set up model
* Choose reactions
* Set fluxes
* Set concentration bounds
* Set pH and ionic strength

In [5]:
# Switch the working directory to the model folder; the Excel files read in the
# following cells are given as paths relative to this directory.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
os.chdir('/Users/satyakam/Dropbox/work/component_contribution_ctherm')

In [16]:
# Load the 'reactions' sheet of the model workbook and preview its first rows.
allRxnDf = pd.read_excel('cth_thermo_model_DO_SD5.xlsx', sheet_name='reactions')
allRxnDf.head(5)

Unnamed: 0,Name,PlaintextFormula,AbbreviationFormula,KeggFormula,Notes,ppi-pfk,atp-pfk,mal-snt,pdc,gapn,pyk,aldh-ndp,adh-ndp,mal-gapn,tsac
0,ATPase1,ATP + H2O <=> ADP + Pi,h2o + atp <=> adp + pi,C00002 + C00001 <=> C00008 + C00009,ATP hydrolysis for cellobiose transport,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,CBP,Phosphate + Cellobiose <=> D-Glucose + Glucose...,pi + cellb <=> glc-D + g1p,C00009 + C00185 <=> C00031 + C00103,,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,BGL,Cellobiose + H2O <=> 2 D-Glucose,cellb + h2o <=>2 glc-D,C00185 + C00001 <=> 2 C00031,Beta glucosidase,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,GLK-GTP,D-Glucose + GTP <=> D-Glucose-6-phosphate + GDP,glc-D + gtp <=> g6p + gdp,C00031 + C00044 <=> C00092 + C00035,,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,GLK-ATP,D-Glucose + ATP <=> D-Glucose-6-phosphate + ADP,glc-D + atp <=> g6p + adp,C00031 + C00002 <=> C00092 + C00008,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [17]:
# Pick one flux scenario (a column of allRxnDf) and keep only the reactions
# whose flux in that scenario is not zero.
fluxSet = 'ppi-pfk'
carries_flux = allRxnDf[fluxSet] != 0
selectedRxnDf = (
    allRxnDf
    .loc[carries_flux, ['Name', 'AbbreviationFormula', 'KeggFormula', fluxSet]]
    .rename(columns={fluxSet: 'flux'})  # generic column name simplifies later processing
)
selectedRxnDf

Unnamed: 0,Name,AbbreviationFormula,KeggFormula,flux
0,ATPase1,h2o + atp <=> adp + pi,C00002 + C00001 <=> C00008 + C00009,2
1,CBP,pi + cellb <=> glc-D + g1p,C00009 + C00185 <=> C00031 + C00103,1
3,GLK-GTP,glc-D + gtp <=> g6p + gdp,C00031 + C00044 <=> C00092 + C00035,1
5,PGMT,g1p <=> g6p,C00103 <=> C00092,1
6,PGI,g6p <=> f6p,C00092 <=> C00085,2
7,PFK-PPi,ppi + f6p <=> pi + fdp + h,C00013 + C00085 <=> C00009 + C00354 + C00080,2
9,FBA,fdp <=> dhap + g3p,C00354 <=> C00111 + C00118,2
10,TPI,dhap <=> g3p,C00111 <=> C00118,2
11,GAPDH,pi + nad + g3p <=> nadh + 13dpg,C00009 + C00003 + C00118 <=> C00004 + C00236,4
14,PGK-GTP,gdp + 13dpg <=> gtp + 3pg,C00035 + C00236 <=> C00044 + C00197,4


In [18]:
# Read the default concentration bounds from the model workbook and the
# measured metabolite amounts from the quantified dataset.
Met_bound = pd.read_excel('cth_thermo_model_DO_SD5.xlsx', sheet_name='metabolite_bounds')
Alldata = pd.read_excel('./metabolite_data/lt_dataset2_quantified.xlsx')

# Keep only rows whose replicate number is not 0, restricted to the columns
# that are displayed and used below.
measurement_columns = ['Timepoint', 'KEGG_ID', 'compound', 'amount', 'Sample', 'Replicate', 'Maven filename']
has_replicate = Alldata['Replicate'] != 0
Met_data = Alldata.loc[has_replicate, measurement_columns]
Met_data

Unnamed: 0,Timepoint,KEGG_ID,compound,amount,Sample,Replicate,Maven filename
0,2,C00197,3PG,3.400159,A,1,A1_1
1,2,C00197,3PG,2.143403,A,2,A1_2
2,5,C00197,3PG,0.351406,A,1,A2_1
3,5,C00197,3PG,0.504024,A,2,A2_2
4,9,C00197,3PG,0.273058,A,1,A3_1
5,9,C00197,3PG,0.354307,A,2,A3_2
6,2,C00197,3PG,2.850280,B,1,B1_1
7,2,C00197,3PG,3.762796,B,2,B1_2
8,5,C00197,3PG,2.994842,B,1,B2_1
9,5,C00197,3PG,3.664070,B,2,B2_2


In [23]:
# Display the default concentration bounds read from the 'metabolite_bounds' sheet.
Met_bound

Unnamed: 0,KEGG_ID,Name,Concentration:Max,Concentration:Min,Type,GetFromMeasured,Unnamed: 6,max (µM),min (µM)
0,C00008,adp,0.0001,0.0001,Cofactor,no,,100.0,100.0
1,C00020,amp,0.01,0.0001,Cofactor,no,,10000.0,100.0
2,C00002,atp,0.02,8e-05,Cofactor,no,,20000.0,80.0
3,C00010,coa,0.0001,0.0001,Cofactor,maybe,,100.0,100.0
4,C00139,fdxox,0.0001,0.0001,Cofactor,,,100.0,100.0
5,C00138,fdxrd,0.01,1e-06,Cofactor,,,10000.0,1.0
6,C00035,gdp,0.0001,0.0001,Cofactor,no,,100.0,100.0
7,C00044,gtp,0.02,8e-05,Cofactor,no,,20000.0,80.0
8,C00003,nad,0.0001,0.0001,Cofactor,no,,100.0,100.0
9,C00004,nadh,0.01,1e-06,Cofactor,no,,10000.0,1.0


In [19]:
# parse the KeggFormula column to create eQuilibrator Reaction objects
# create reaction list
reactions = []
is_balanced = []  # one flag per row of selectedRxnDf, in row order
for i, row in selectedRxnDf.iterrows():
    rxn = Reaction.parse_formula(row['KeggFormula'], rid = row['Name'])
    balanced = bool(rxn.check_full_reaction_balancing())
    is_balanced.append(balanced)
    if balanced:
        reactions.append(rxn)
    else:
        print('Error: reaction {} is not balanced'.format(row['AbbreviationFormula']))

# create flux list
# Unbalanced reactions are left out of `reactions`, so their fluxes must be
# left out too; otherwise the two sequences passed to Pathway would differ in
# length and fluxes would be paired with the wrong reactions.
fluxes = selectedRxnDf['flux'].values[np.array(is_balanced, dtype = bool)]

# need to declare pH and ionic strength constants at the beginning
PH = 7.0
IS = 0.1

# calculate dG0_r_primes (one value per kept reaction, at the pH and ionic strength above)
dG0_r_primes = [r.dG0_prime(pH = PH, ionic_strength = IS) for r in reactions]

In [20]:
# set up the model
# The Pathway is built from the parsed reactions, their fluxes and the dG0'
# values; `reactions`, `fluxes` and `dG0_r_primes` are expected to be aligned
# element by element.
pp = Pathway(reactions = reactions, fluxes = fluxes, dG0_r_primes = dG0_r_primes)

In [21]:
# set up the reaction bounds
# make a dataframe with the upper and lower bounds
def bounds_to_df(compounds, bounds):
    """
    Make a dataframe with the abbreviation and concentration bounds of each compound.

    Args:
        compounds = list of KEGG IDs
        bounds = equilibrator-api bounds object (its GetUpperBound and
                 GetLowerBound methods are called once per compound)

    Returns:
        DataFrame with one row per compound and the columns
        'Compound:Identifiers:kegg.compound', 'Name', 'Concentration:Max'
        and 'Concentration:Min'.

    Raises:
        KeyError: if a KEGG ID is not present in the kta translation table.
    """
    # I had to choose this clunky column name to make it work with the bounds.py
    # from_dataframe method
    id_col = 'Compound:Identifiers:kegg.compound'

    def abbreviation(kegg_id):
        # kta is indexed by lower-case KEGG ID. Look up the abbreviation cell
        # itself rather than the whole row, and if the KEGG ID is duplicated in
        # the table fall back to the first match instead of failing.
        match = kta.loc[kegg_id.lower(), 'ABBREVIATION']
        if isinstance(match, pd.Series):
            return match.iloc[0]
        return match

    boundsDf = pd.DataFrame(compounds, columns = [id_col])
    boundsDf['Name'] = boundsDf[id_col].apply(abbreviation) # add abbreviations
    boundsDf['Concentration:Max'] = boundsDf[id_col].apply(bounds.GetUpperBound)
    boundsDf['Concentration:Min'] = boundsDf[id_col].apply(bounds.GetLowerBound)
    return boundsDf

In [None]:
# Leave-one-out MDF analysis: for every dataset (sample, timepoint, replicate),
# constrain all measured metabolites except one to their measured concentration
# and compute the MDF, cycling through which metabolite is left unconstrained.
Uf = 1 # Uncertainty factor: a measured concentration c is bounded to [c / Uf, c * Uf]

# The 12 datasets are all (sample, replicate, timepoint) combinations, visited
# in the order A/rep 1, A/rep 2, B/rep 1, B/rep 2, each at timepoints 2, 5, 9.
# Enumerating them explicitly avoids the index arithmetic that previously
# labelled the 7th dataset as sample 'A' (so A/2/rep 1 ran twice and
# B/2/rep 1 was never analysed).
datasets = [(smple, tym, rep)
            for smple in ('A', 'B')
            for rep in (1, 2)
            for tym in (2, 5, 9)]

for smple, tym, rep in datasets:
    # reset bounds to the defaults from Met_bound at the start of each dataset
    for i, cpd in Met_bound.iterrows():
        pp.bounds.SetBounds(cpd['KEGG_ID'], cpd['Concentration:Min'], cpd['Concentration:Max'])

    # collect the measured concentrations belonging to this dataset
    # NOTE(review): the division by 1e6 presumably converts µM to M — confirm units of 'amount'
    in_dataset = ((Met_data['Sample'] == smple)
                  & (Met_data['Timepoint'] == tym)
                  & (Met_data['Replicate'] == rep))
    ref_conc = {kegg_id: amount / 1e6
                for kegg_id, amount in zip(Met_data.loc[in_dataset, 'KEGG_ID'],
                                           Met_data.loc[in_dataset, 'amount'])}

    # remember the default bounds of every measured compound so that the
    # excluded compound can be put back on them in each pass below
    default_bounds = {cpd: (pp.bounds.GetLowerBound(cpd), pp.bounds.GetUpperBound(cpd))
                      for cpd in ref_conc}

    for cpd2 in ref_conc: # cpd2 is the compound excluded from the constraint list
        for cpd, conc in ref_conc.items(): # generate bounds for the MDF simulation
            if cpd != cpd2:
                pp.bounds.SetBounds(cpd, conc / Uf, conc * Uf)
            else:
                # Without this, the compound would keep the measured bounds set
                # during an earlier pass and would not really be excluded.
                pp.bounds.SetBounds(cpd, default_bounds[cpd][0], default_bounds[cpd][1])
        mdf_data = pp.calc_mdf()
        print(smple,tym,rep,mdf_data.mdf)
        if mdf_data.mdf>0 :
            mdf_data, con = pp.conc_mdf()
            print(cpd2,tym,mdf_data.mdf)
            # con is indexed in the same order as mdf_data.compound_data
            for k, c in enumerate(mdf_data.compound_data):
                print(c.compound, mdf_data.model.concentration_bounds.GetBoundTuple(c.compound),con[k])

In [None]:
# Leave-one-out MDF analysis over all measured datasets.
# NOTE(review): this cell repeats the analysis of the preceding cell; consider
# keeping only one of the two.
Uf = 1 # Uncertainty factor: a measured concentration c is bounded to [c / Uf, c * Uf]

# Visit each (sample, replicate, timepoint) combination exactly once:
# A/rep 1, A/rep 2, B/rep 1, B/rep 2, each at timepoints 2, 5 and 9.
# (The earlier `sm < 7` index test assigned the 7th dataset to sample 'A',
# duplicating A/2/rep 1 and skipping B/2/rep 1.)
datasets = [(smple, tym, rep)
            for smple in ('A', 'B')
            for rep in (1, 2)
            for tym in (2, 5, 9)]

for smple, tym, rep in datasets:
    # reset bounds to the defaults from Met_bound at the start of each dataset
    for i, cpd in Met_bound.iterrows():
        pp.bounds.SetBounds(cpd['KEGG_ID'], cpd['Concentration:Min'], cpd['Concentration:Max'])

    # measured concentrations for this dataset, keyed by KEGG ID
    # NOTE(review): the division by 1e6 presumably converts µM to M — confirm units of 'amount'
    dataset_rows = Met_data[(Met_data['Sample'] == smple)
                            & (Met_data['Timepoint'] == tym)
                            & (Met_data['Replicate'] == rep)]
    ref_conc = {}
    for kegg_id, amount in zip(dataset_rows['KEGG_ID'], dataset_rows['amount']):
        ref_conc[kegg_id] = amount / 1e6

    # default (lower, upper) bounds of the measured compounds, captured before
    # any measured value is applied
    default_bounds = {}
    for cpd in ref_conc:
        default_bounds[cpd] = (pp.bounds.GetLowerBound(cpd), pp.bounds.GetUpperBound(cpd))

    for cpd2 in ref_conc: # cpd2 is the compound excluded from the constraint list
        for cpd, conc in ref_conc.items(): # generate bounds for the MDF simulation
            if cpd == cpd2:
                # put the excluded compound back on its default bounds; otherwise
                # it keeps the measured bounds applied in a previous pass
                lower, upper = default_bounds[cpd]
                pp.bounds.SetBounds(cpd, lower, upper)
            else:
                pp.bounds.SetBounds(cpd, conc / Uf, conc * Uf)
        mdf_data = pp.calc_mdf()
        print(smple,tym,rep,mdf_data.mdf)
        if mdf_data.mdf>0 :
            mdf_data, con = pp.conc_mdf()
            print(cpd2,tym,mdf_data.mdf)
            # con is indexed in the same order as mdf_data.compound_data
            for k, c in enumerate(mdf_data.compound_data):
                print(c.compound, mdf_data.model.concentration_bounds.GetBoundTuple(c.compound),con[k])


A 9 1 -8.79541690265577
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655777
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655777
A 9 1 -8.795416902655777
A 9 1 -8.795416902655777
A 9 1 -8.795416902655777
A 9 1 -8.795416902655774
A 9 1 -8.795416902655774
A 9 1 -8.795416902655777
A 9 1 -8.795416902655777
A 9 1 -8.795416902655777
A 9 1 -8.795416902655777
A 2 1 -13.471207241309024
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13.471207241309031
A 2 1 -13