# Using thermodynamic analysis to guide metabolic engineering
DO 12-6-2018  
Using Python 3 and eQuilibrator API  


In [1]:
# Switch the working directory to the local equilibrator-api checkout and extend the
# import path with the local sbtab and equilibrator-api source folders.
# NOTE(review): every path below is an absolute, machine-specific location; the notebook
# will not run elsewhere until they are adapted.
import os
os.chdir('/Users/satyakam/Dropbox/work/equilibrator-api-master')
import sys
sys.path.append('/Users/satyakam/Dropbox/work/sbtab-0.9.64')
# Alternative import-path entries, currently disabled:
#sys.path.append('/Users/satyakam/Dropbox/work/equilibrator-api-master')
#sys.path.append('/Users/satyakam/Dropbox/work/equilibrator-api-master/src/equilibrator_api')
sys.path.append('/Users/satyakam/Dropbox/work/equilibrator-api-master/src')
#sys.path.append('/Users/satyakam/Dropbox/work')


In [2]:
import numpy as np
from numpy import array, eye, log, zeros, matrix
from numpy.linalg import inv, solve
import pandas as pd
from equilibrator_api import Reaction, ComponentContribution, ReactionMatcher, CompoundMatcher, ParseError, Pathway
from equilibrator_api.bounds import Bounds
%matplotlib inline

## Set up translator for KEGG IDs
Note: I set these up as dataframes because I was troubleshooting an issue with duplicate KEGG IDs. Eventually, I think these should be converted to dictionaries to make the code more readable.

In [3]:
# Build lookup tables that translate between KEGG IDs, long compound names and
# human-readable abbreviations, all derived from one translation spreadsheet.
keggTranslatorDf = pd.read_excel('KEGG_SEED_DO.xls')
kt = keggTranslatorDf  # short alias for easier typing


def _lowercased(frame, column):
    """Return a copy of `frame` whose string column `column` is converted to lower case."""
    return frame.assign(**{column: frame[column].str.lower()})


# KEGG ID (lower case, used as index) -> long name
ktn = _lowercased(kt.loc[:, ['KEGG ID(S)', 'PRIMARY NAME']], 'KEGG ID(S)').set_index('KEGG ID(S)')

# long name (lower case, used as index) -> KEGG ID
# Names can map to several KEGG IDs; rows are sorted by KEGG ID first so that taking the
# first entry of each name group picks the lowest ID.
ntk = (
    _lowercased(
        kt.loc[:, ['PRIMARY NAME', 'KEGG ID(S)']].sort_values(by=['KEGG ID(S)'], ascending=True),
        'PRIMARY NAME',
    )
    .groupby('PRIMARY NAME')
    .first()
)

# KEGG ID (lower case, used as index) -> abbreviation
kta = _lowercased(kt.loc[:, ['KEGG ID(S)', 'ABBREVIATION']], 'KEGG ID(S)').set_index('KEGG ID(S)')

# abbreviation (lower case, used as index) -> KEGG ID
atk = _lowercased(kt.loc[:, ['ABBREVIATION', 'KEGG ID(S)']], 'ABBREVIATION').set_index('ABBREVIATION')

# plain dict version of atk: lower-case abbreviation -> upper-case KEGG ID
atkDict = {
    abbreviation: keggId
    for abbreviation, keggId in zip(atk.index, atk['KEGG ID(S)'].str.upper())
}

## Set up model
* Choose reactions
* Set fluxes
* Set concentration bounds
* Set pH and ionic strength

In [4]:
# Move to the folder that holds the model spreadsheet and metabolite data, so the relative
# file names used by the following cells resolve.
# NOTE(review): absolute, machine-specific path.
os.chdir('/Users/satyakam/Dropbox/work/component_contribution_ctherm')

In [5]:
# Load the full reaction table (one row per reaction, one column per flux set)
# and preview its first five rows.
allRxnDf = pd.read_excel('cth_thermo_model_DO_SD2.xlsx', sheet_name='reactions')
allRxnDf.head()

Unnamed: 0,Name,PlaintextFormula,AbbreviationFormula,KeggFormula,Notes,ppi-pfk,atp-pfk,mal-snt,pdc,gapn,pyk,aldh-ndp,adh-ndp,mal-gapn,tsac,test
0,ATPM,H2O + ATP <=> ADP + Phosphate,h2o + atp <=> adp + pi,C00001 + C00002 <=> C00008 + C00009,ATP hydrolysis for cellobiose transport,1,1,1,1,1,1,1,1,1,1,1
1,CBP,Phosphate + Cellobiose <=> D-Glucose + Glucose...,pi + cellb <=> glc-D + g1p,C00009 + C00185 <=> C00031 + C00103,,1,1,1,1,1,1,1,1,1,0,1
2,BGL,Cellobiose + H2O <=> 2 D-Glucose,cellb + h2o <=>2 glc-D,C00185 + C00001 <=> 2 C00031,Beta glucosidase,0,0,0,0,0,0,0,0,0,1,0
3,GLK-GTP,D-Glucose + GTP <=> D-Glucose-6-phosphate + GDP,glc-D + gtp <=> g6p + gdp,C00031 + C00044 <=> C00092 + C00035,,1,1,1,1,1,1,1,1,1,0,1
4,GLK-ATP,D-Glucose + ATP <=> D-Glucose-6-phosphate + ADP,glc-D + atp <=> g6p + adp,C00031 + C00002 <=> C00092 + C00008,,0,0,0,0,0,0,0,0,0,2,0


In [6]:
# Pick one flux set (a column of allRxnDf) and keep only the reactions that carry
# non-zero flux in it. The chosen column is renamed to 'flux' so that later cells do
# not need to know which flux set was selected.
fluxSet = 'ppi-pfk'
selectedRxnDf = (
    allRxnDf
    .loc[allRxnDf[fluxSet] != 0, ['Name', 'AbbreviationFormula', 'KeggFormula', fluxSet]]
    .rename(columns={fluxSet: 'flux'})
)
selectedRxnDf

Unnamed: 0,Name,AbbreviationFormula,KeggFormula,flux
0,ATPM,h2o + atp <=> adp + pi,C00001 + C00002 <=> C00008 + C00009,1
1,CBP,pi + cellb <=> glc-D + g1p,C00009 + C00185 <=> C00031 + C00103,1
3,GLK-GTP,glc-D + gtp <=> g6p + gdp,C00031 + C00044 <=> C00092 + C00035,1
5,PGMT,g1p <=> g6p,C00103 <=> C00092,1
6,PGI,g6p <=> f6p,C00092 <=> C00085,2
7,PFK-PPi,ppi + f6p <=> pi + fdp + h,C00013 + C00085 <=> C00009 + C00354 + C00080,2
9,FBA,fdp <=> dhap + g3p,C00354 <=> C00111 + C00118,2
10,TPI,dhap <=> g3p,C00111 <=> C00118,2
11,GAPDH,pi + nad + g3p <=> nadh + 13dpg,C00009 + C00003 + C00118 <=> C00004 + C00236,4
14,PGK-GTP,gdp + 13dpg <=> gtp + 3pg,C00035 + C00236 <=> C00044 + C00197,4


In [7]:
# Read the quantified metabolite measurements from Excel and keep the columns used
# downstream, dropping every row whose Replicate value is 0.
Alldata = pd.read_excel('./metabolite_data/lt_dataset2_quantified.xlsx')
Met_data = Alldata.loc[
    Alldata['Replicate'].ne(0),
    ['Timepoint', 'kegg', 'compound', 'amount', 'Sample', 'Replicate'],
]

In [8]:
# Parse the KeggFormula column into eQuilibrator Reaction objects.
# Only reactions that pass the full balancing check are kept; the others are reported.
reactions = []
isBalanced = []  # one flag per row of selectedRxnDf, in row order
for _, row in selectedRxnDf.iterrows():
    rxn = Reaction.parse_formula(row['KeggFormula'], rid = row['Name'])
    balanced = bool(rxn.check_full_reaction_balancing())
    isBalanced.append(balanced)
    if balanced:
        reactions.append(rxn)
    else:
        print('Error: reaction {} is not balanced'.format(row['AbbreviationFormula']))

# Flux list, restricted to the reactions that were actually kept. Without this filter an
# unbalanced (skipped) reaction would leave `fluxes` longer than `reactions` and shift
# every later flux onto the wrong reaction.
fluxes = selectedRxnDf['flux'].values[np.array(isBalanced, dtype = bool)]

# pH and ionic strength used for the dG0' calculations
PH = 7.0
IS = 0.1

# dG0' of every kept reaction at the pH and ionic strength above
dG0_r_primes = [r.dG0_prime(pH = PH, ionic_strength = IS) for r in reactions]

In [9]:
# Set up the model: an eQuilibrator Pathway built from the parsed reactions, their
# fluxes and their dG0' values (the three sequences are passed in matching order).
pp = Pathway(reactions = reactions, fluxes = fluxes, dG0_r_primes = dG0_r_primes)

In [10]:
# set up the reaction bounds
# make a dataframe with the upper and lower bounds
def bounds_to_df(compounds, bounds):
    """
    Make a dataframe with the concentration bounds of the given compounds.

    Args:
        compounds = list of KEGG IDs
        bounds = equilibrator-api bounds object (must provide GetUpperBound and
                 GetLowerBound)

    Returns:
        DataFrame with one row per compound and the columns
        'Compound:Identifiers:kegg.compound', 'Name' (abbreviation looked up in the
        module-level `kta` table), 'Concentration:Max' and 'Concentration:Min'.

    Raises:
        KeyError if a KEGG ID is not present in `kta`.
    """
    # This clunky column name is required to work with the bounds.py from_dataframe method
    keggCol = 'Compound:Identifiers:kegg.compound'

    def _abbreviation(keggId):
        # Select the ABBREVIATION column explicitly so that a scalar is returned instead
        # of a whole row; if the KEGG ID is duplicated in kta, use the first match.
        match = kta.loc[keggId.lower(), 'ABBREVIATION']
        return match.iloc[0] if isinstance(match, pd.Series) else match

    boundsDf = pd.DataFrame(compounds, columns = [keggCol])
    boundsDf['Name'] = boundsDf[keggCol].apply(_abbreviation) # add abbreviations
    boundsDf['Concentration:Max'] = boundsDf[keggCol].apply(lambda x: bounds.GetUpperBound(x))
    boundsDf['Concentration:Min'] = boundsDf[keggCol].apply(lambda x: bounds.GetLowerBound(x))
    return boundsDf

In [17]:
# Run configuration and containers for results.
saveDirs = ["P1_ppi_ratio"]
ppi_ratio = [1,2,3,4,5,6,7,8,9,10,12,15,18,20]
pi_ratio = [1,2,3,4,5,6,7,8,9,10]
exp_data_file = "ctherm_exp_data.txt"
exp_data = []
all_data = {}

Uf = 2 # Uncertainty factor: bounds are set to [conc / Uf, conc * Uf]

# Reference concentrations keyed by KEGG ID. They are divided by 1000 below
# (NOTE(review): presumably mM -> M; confirm the units).
# The original literal listed 'C00103' (G1P) twice with the same value; the duplicate
# entry is removed here, which leaves the resulting dict unchanged.
ref_conc0 = {#'C00469': 1,   #ethanol
            'C00004': 0.08,  #NADH
            'C00024': 0.83,  #Acetyl-CoA
            'C00002': 2.70,  #ATP
            'C00008': 0.11,  #ADP
            'C00020': 0.22,  #AMP
            'C00354': 1.50,  #FBP
            'C00092': 8.19,  #G6P
            'C00074': 0.69,  #Phosphoenolpyruvate
            'C00005': 0.38,  #NADPH
            'C00022': 12.65, #Pyruvate
            'C00103': 6.66,  #G1P
            'C00044': 0.28,  #GTP
            'C00035': 0.01,  #GDP
            'C00085': 1.49,  #F6P
            'C00118': 0.10,  #G3P
            'C00197': 1.35,  #3PG
            'C00011': 1.27,  #CO2
            'C00003': 2.25,  #NAD
            'C00006': 0.26,  #NADP
            'C00010': 0.02,  #CoA
            }

ref_conc = {k: float(v) / 1000 for k,v in ref_conc0.items()}
# Measured amount of each compound at the first timepoint, filled in by the loops below
ref_t0 = {}
# Sample 'A' analysis: for each of three stages (t = 0, 1, 2, using measurement rows with
# Timepoint 2, 5 and 9 respectively) and for each compound cpd2 left out of the
# measurement-based constraints, set the concentration bounds of all reference compounds
# and compute the MDF of the pathway.
for t in range(3):
    for cpd2, conc2 in ref_conc.items(): # cpd2 is the compound to be excluded from the constraint list (conc2 is not used)
        for cpd, conc in ref_conc.items(): # iterate over the compound list to generate bounds for the MDF simulation
            pp.bounds.SetBounds(cpd, 1e-6,0.1) # reset bounds at the start of each iteration
            if t == 0 and cpd!=cpd2 : 
                # first stage: bound the compound around its reference concentration
                pp.bounds.SetBounds(cpd, conc / Uf,conc * Uf)
                # NOTE(review): 5/46 is presumably g/L divided by the molar mass of ethanol (~46 g/mol) - confirm
                pp.bounds.SetBounds('C00469', 5/46 , 5/46) # ethanol conc is fixed at the measured value
                # remember the Timepoint-2 amount of this compound; later stages scale by it
                for i, dat in Met_data.iterrows():
                    if dat['Sample']=='A' and dat['Timepoint']==2 and dat['Replicate']==1:
                        if dat['kegg'] == cpd:
                            ref_t0[cpd]= dat['amount']           
            elif t == 1 and cpd!=cpd2:
                for i, dat in Met_data.iterrows():
                    if dat['Sample']=='A' and dat['Timepoint']==5 and dat['Replicate']==1:
                        if dat['kegg'] == cpd:
                            # scale the reference concentration by amount / first-timepoint amount,
                            # widened by Uf in both directions
                            # NOTE(review): ref_t0[cpd] raises KeyError if no Timepoint-2 row was found for cpd
                            pp.bounds.SetBounds(cpd, conc * dat['amount'] / (ref_t0[cpd]*Uf),conc * dat['amount'] * Uf/(ref_t0[cpd]))
                        elif dat['kegg'] == 'C00118':
                            # NOTE(review): this branch runs for a C00118 row while cpd is a different
                            # compound, yet it sets the bounds of cpd using the C00118 amount and
                            # ref_t0[cpd]; possibly 'C00118' was meant as the target - confirm intent
                            pp.bounds.SetBounds(cpd, 0.001,conc * dat['amount'] * Uf/(ref_t0[cpd]))
                            #print(t,dat['amount'],cpd)
                    # executed once per Met_data row (it sits inside the row loop)
                    pp.bounds.SetBounds('C00469', 20/46 , 20/46)
            elif t == 2 and cpd!=cpd2:
                # same scheme as the t == 1 branch, using Timepoint-9 rows and 40/46 for ethanol
                for i, dat in Met_data.iterrows():
                    if dat['Sample']=='A' and dat['Timepoint']==9 and dat['Replicate']==1:
                        if dat['kegg'] == cpd:
                            pp.bounds.SetBounds(cpd, conc * dat['amount'] / (ref_t0[cpd]*Uf),conc * dat['amount'] * Uf/(ref_t0[cpd]))
                        elif dat['kegg'] == 'C00118':
                            # NOTE(review): same concern as in the t == 1 branch
                            pp.bounds.SetBounds(cpd, 0.001,conc * dat['amount'] * Uf/(ref_t0[cpd]))
                            #print(t,dat['amount'],cpd)
                    pp.bounds.SetBounds('C00469', 40/46 , 40/46)
            # Since t is always 0, 1 or 2, this branch is reached only when cpd == cpd2:
            # if the excluded compound is one of the listed cofactors it is pinned to its
            # reference concentration; any other excluded compound keeps the reset bounds.
            elif cpd in ["C00002","C00008","C00020","C00044","C00035","C00003","C00004","C00005","C00006"]:
                pp.bounds.SetBounds(cpd, conc ,conc)
                
        # all bounds for this (t, cpd2) combination are in place; compute the MDF
        mdf_data = pp.calc_mdf()
        #print(mdf_data.model.concentration_bounds)
        # report only feasible (MDF > 0) results for the later stages
        if mdf_data.mdf>0 and t >0:
            # NOTE(review): conc_mdf() is used here as returning (result, per-compound values);
            # its exact semantics are not visible in this notebook - confirm
            mdf_data, con = pp.conc_mdf()
            print(cpd2,t,mdf_data.mdf)
            # print each compound with its bound tuple and the matching entry of con
            k=0
            for c in mdf_data.compound_data:
                print(c.compound, mdf_data.model.concentration_bounds.GetBoundTuple(c.compound),con[k])
                k=k+1
# The following loop handles Sample 'B' (the case with no externally added ethanol).
# For each of three stages (t = 0, 1, 2, using measurement rows with Timepoint 2, 5 and 9
# respectively) it sets the bounds of every reference compound and prints the MDF.
# NOTE(review): ref_t0 is not cleared before this loop, so entries written earlier remain
# for compounds without a Sample 'B' Timepoint-2 row; the ethanol ('C00469') bound is also
# never set here and keeps whatever value it had before - confirm both are intended.
for t in range(3):
    for cpd, conc in ref_conc.items():
        pp.bounds.SetBounds(cpd, 0.000001,0.01) # reset bounds at the start of each iteration
        if t == 0:
            # first stage: bound the compound around its reference concentration
            pp.bounds.SetBounds(cpd, conc / Uf,conc * Uf)
            # remember the Timepoint-2 amount of this compound; later stages scale by it
            for i, dat in Met_data.iterrows():
                if dat['Sample']=='B' and dat['Timepoint']==2 and dat['Replicate']==1:
                    if dat['kegg'] == cpd:
                        ref_t0[cpd]= dat['amount']           
        elif t == 1:
            for i, dat in Met_data.iterrows():
                if dat['Sample']=='B' and dat['Timepoint']==5 and dat['Replicate']==1:
                    if dat['kegg'] == cpd:
                        # scale the reference concentration by amount / first-timepoint amount,
                        # widened by Uf in both directions
                        pp.bounds.SetBounds(cpd, conc * dat['amount'] / (ref_t0[cpd]*Uf),conc * dat['amount'] * Uf/(ref_t0[cpd]))
                        #print(t,dat['amount'],cpd)
        elif t == 2:
            # same scheme as the t == 1 branch, using Timepoint-9 rows
            for i, dat in Met_data.iterrows():
                if dat['Sample']=='B' and dat['Timepoint']==9 and dat['Replicate']==1:
                    if dat['kegg'] == cpd:
                        pp.bounds.SetBounds(cpd, conc * dat['amount'] / (ref_t0[cpd]*Uf),conc * dat['amount'] * Uf/(ref_t0[cpd]))
                        #print(t,dat['amount'],cpd)
        # NOTE(review): unreachable - t is always 0, 1 or 2, so one of the branches above
        # always matches and the cofactors are never pinned in this loop.
        elif cpd in ["C00002","C00008","C00020","C00044","C00035","C00003","C00004","C00005","C00006"]:
            pp.bounds.SetBounds(cpd, conc ,conc)
    # all bounds for this stage are in place; compute and print the MDF
    mdf_data = pp.calc_mdf()
    print(mdf_data.mdf)

C00354 1 0.0821949816753218
C00001 (1.0, 1.0) (1.0, 1.0)
C00002 (0.0037377392190599446, 0.014950956876239778) (0.003737739219059945, 0.0050373872989990165)
C00003 (0.00044992760225239653, 0.0017997104090095861) (0.00044992760225239653, 0.00099303271450602)
C00004 (2.813011243421816e-05, 0.00011252044973687263) (5.098125712772349e-05, 0.00011252044973687293)
C00008 (0.00018542214674709644, 0.0007416885869883857) (0.00018542214674709654, 0.0007416885869883856)
C00009 (0.01, 0.01) (0.010000000000000004, 0.010000000000000004)
C00010 (6.179377823661742e-06, 2.471751129464697e-05) (6.179377823661744e-06, 9.832232534815706e-06)
C00011 (1e-06, 0.1) (1.0000000000000004e-06, 0.10000000000000002)
C00013 (0.001, 0.001) (0.0009999999999999994, 0.0009999999999999994)
C00020 (0.0001845217379274285, 0.000738086951709714) (0.0005476602021508252, 0.0007380869517097137)
C00022 (0.010531568392518387, 0.04212627357007355) (0.010531568392518391, 0.014193496589725782)
C00024 (7.407573247803231e-06, 2.9630292

In [None]:
import seaborn as sns
import pandas as pd

sns.set()

# Heatmap of the "reaction prices" entry of PFK-PPi.
# Outer key (heatmap column): value from the ratio list; inner key (heatmap row): 0..9.
# NOTE(review): all_data is created empty and never filled in the visible cells, so this
# lookup raises KeyError on a fresh run; the expected nesting is
# all_data[ratio][1..10][9]["reaction prices"]["PFK-PPi"].
fng_data = {
    ratio: {
        idx: all_data[ratio][idx + 1][9]["reaction prices"]["PFK-PPi"]
        for idx in range(10)
    }
    for ratio in [1,2,3,4,5,6,7,8,9,10,12,15,18,20]
}
df = pd.DataFrame.from_dict(fng_data)
# Intended axis labels per the original notes: x = 'PPi conc in mM', y = 'timepoints'
sns.heatmap(df)
#ylabel('timepoints')

In [None]:
import pandas as pd

# Heatmap of the "mdf" entries, with every negative MDF replaced by -10 so that all
# infeasible cases share one colour.
# NOTE(review): relies on all_data[ratio][1..10][8]["mdf"]; all_data is never filled in
# the visible cells.
fng_data = {}
for ratio in [1,2,3,4,5,6,7,8,9,10,12,15,18,20]:
    clamped = {}
    for idx in range(10):
        value = all_data[ratio][idx + 1][8]["mdf"]
        clamped[idx] = -10 if value < 0 else value
    fng_data[ratio] = clamped
df = pd.DataFrame.from_dict(fng_data)
# Intended axis labels per the original notes: x = 'PPi conc in mM', y = 'timepoints'
sns.heatmap(df)

In [None]:
import json

# Persist the collected results as indented JSON in the current working directory.
with open('mdf_ppi_pi.json', 'w') as out_file:
    json.dump(all_data, out_file, indent=2)

In [None]:
import matplotlib.pyplot as plt

# Collect the "mdf" entry for every ratio (outer key) and index 0..9 (inner key).
# NOTE(review): this reads all_data[r][t]["mdf"], a shallower nesting than the heatmap
# cells use - confirm which layout all_data really has. The name mdf_data also replaces
# the MDF result object of the same name created by the simulation cell.
mdf_data = {
    r: {t: all_data[r][t]["mdf"] for t in range(10)}
    for r in [1,2,3,4,5,6,7,8,9,10,12,15,18,20]
}

# MDF at index 9 for each ratio, plotted in ascending order of the ratio
y = {r: float(per_index[9]) for r, per_index in mdf_data.items()}
xs, ys = zip(*sorted(y.items()))
plt.plot(xs, ys)
plt.xlabel('PPI conc in mM')
plt.ylabel('MDF at timepoint 9')
plt.title('MDF vs PPI conc.')

In [None]:
# Heatmap of the complete "reaction prices" mapping for the fixed keys 8 and 10,
# with one heatmap column per index 0..9.
# NOTE(review): depends on all_data[8][10][t]["reaction prices"]; all_data is never
# filled in the visible cells.
fng_data = {t: all_data[8][10][t]["reaction prices"] for t in range(10)}
df = pd.DataFrame.from_dict(fng_data)
sns.heatmap(df)

In [None]:
# MDF over the indices 0..9 for the mdf_data entry stored under key 6.
# NOTE(review): the y-axis label says 10 mM while the data comes from key 6 - confirm
# which one is intended.
y = {t: float(mdf_data[6][t]) for t in range(10)}
xs, ys = zip(*sorted(y.items()))
plt.plot(xs, ys)
plt.xlabel('timepoints')
plt.ylabel('MDF at PPi conc of 10mM')
plt.title('MDF vs time points')