# Using thermodynamic analysis to guide metabolic engineering
DO 12-6-2018  
Using Python 3 and eQuilibrator API  

DO 12-17-2018  
Looking at different ethanol ratios

DO 12-19-2018  
Using measured metabolite data to further constrain the system


In [18]:
import numpy as np
from numpy import array, eye, log, zeros
import pandas as pd
from equilibrator_api import Reaction, ComponentContribution, ReactionMatcher, CompoundMatcher, ParseError, Pathway
from equilibrator_api.bounds import Bounds, DEFAULT_BOUNDS
import seaborn as sns
from matplotlib.ticker import FormatStrFormatter
import copy # for copying pp objects
%matplotlib inline

## Set up translator for KEGG IDs
Note: I set these up as dataframes because I was troubleshooting an issue with duplicate KEGG IDs. Eventually I think these should be set up as dictionaries, to make the code more readable

In [3]:
# set up the reaction bounds
# make a dataframe with the upper and lower bounds
def bounds_to_df(compounds, bounds):
    """
    Make a dataframe with concentrations
    Args:
        compounds = list of KEGG IDs
        bounds = equilibrator-api bounds object
    """
    boundsDf = pd.DataFrame(compounds, columns = ['Compound:Identifiers:kegg.compound']) # I had to choose this clunky column name to make it work with bounds.py 
                                                                                                    # from_dataframe method
    boundsDf['Name'] = boundsDf['Compound:Identifiers:kegg.compound'].apply(lambda x: kta.loc[x.lower()]) # add abbreviations
    boundsDf['Concentration:Max'] = boundsDf['Compound:Identifiers:kegg.compound'].apply(lambda x: bounds.GetUpperBound(x))
    boundsDf['Concentration:Min'] = boundsDf['Compound:Identifiers:kegg.compound'].apply(lambda x: bounds.GetLowerBound(x))
    return boundsDf

In [4]:
# make a dictionary to translate KEGG IDs into human-readable abbreviations
keggTranslatorDf = pd.read_excel('KEGG_SEED_DO.xls')
kt = keggTranslatorDf #short name for easier typing

# translate KEGG ID to abbreviation
kta = kt.loc[:,['KEGG ID(S)', 'ABBREVIATION']]
kta['KEGG ID(S)'] = kta['KEGG ID(S)'].str.lower() # set to lower case for better matching
kta.set_index('KEGG ID(S)', inplace = True)

atkDict = dict(zip(kt['ABBREVIATION'].values, kt['KEGG ID(S)'].values))
ktaDict = dict(zip(kt['KEGG ID(S)'].values, kt['ABBREVIATION'].values))

## Set up model
* Choose reactions
* Set fluxes
* Set pH and ionic strength

In [5]:
# need to declare pH and ionic strength constants at the beginning
# they are used in the make_pathway function
PH = 7.0
IS = 0.1

# load stoichiometric model from Excel file
allRxnDf = pd.read_excel('cth_thermo_model_DO_SD2.xlsx', # using Satya's version
                         sheet_name = 'reactions')
# make list of flux sets from excel file column names
# all of the column names from 'ppi-pfk' to 'mal-gapn' inclusive
fluxSets = allRxnDf.loc[:,'ppi-pfk':].columns.values 

# load bounds information from the model
boundsDf = pd.read_excel('cth_thermo_model_DO_SD2.xlsx', # using Satya's version
                         sheet_name = 'metabolite_bounds')
allRxnDf[:5]

Unnamed: 0,Name,PlaintextFormula,AbbreviationFormula,KeggFormula,Notes,ppi-pfk,atp-pfk,mal-snt,pdc,gapn,pyk,aldh-ndp,adh-ndp,mal-gapn,tsac
0,CBtx,H2O + ATP <=> ADP + Phosphate,h2o + atp <=> adp + pi,C00001 + C00002 <=> C00008 + C00009,ATP hydrolysis for cellobiose transport,1,1,1,1,1,1,1,1,1,1
1,CBP,Phosphate + Cellobiose <=> D-Glucose + Glucose...,pi + cellb <=> glc-D + g1p,C00009 + C00185 <=> C00031 + C00103,,1,1,1,1,1,1,1,1,1,0
2,BGL,Cellobiose + H2O <=> 2 D-Glucose,cellb + h2o <=>2 glc-D,C00185 + C00001 <=> 2 C00031,Beta glucosidase,0,0,0,0,0,0,0,0,0,1
3,GLK-GTP,D-Glucose + GTP <=> D-Glucose-6-phosphate + GDP,glc-D + gtp <=> g6p + gdp,C00031 + C00044 <=> C00092 + C00035,,1,1,1,1,1,1,1,1,1,0
4,GLK-ATP,D-Glucose + ATP <=> D-Glucose-6-phosphate + ADP,glc-D + atp <=> g6p + adp,C00031 + C00002 <=> C00092 + C00008,,0,0,0,0,0,0,0,0,0,2


In [8]:
def make_pathway(fluxSet, allRxnDf):
    """
    Given a fluxSet name, return the parsed pathway object
    
    Args:
       fluxSet: is the name of a column in the Excel model file
       allRxnDf: a dataframe with the model reactions
    """
    # choose a flux set, and drop all of the zero-flux reactions
    selectedRxnDf = allRxnDf.loc[allRxnDf[fluxSet] != 0, ['Name', 'AbbreviationFormula', 'KeggFormula', fluxSet]]
    selectedRxnDf.rename(columns = {fluxSet:'flux'}, inplace = True) # rename the flux columns to 'flux' to simplify subsequent
                                                                     # processing
    #selectedRxnDf
    # parse the KeggFormula column to create eQuilibrator Reaction objects
    # create reaction list
    reactions = []
    for i, row in selectedRxnDf.iterrows():
        rxn = Reaction.parse_formula(row['KeggFormula'], rid = row['Name'])
        if (rxn.check_full_reaction_balancing()):
            reactions.append(rxn)
        else:
            print('Error: reaction {} is not balanced'.format(row['AbbreviationFormula']))

    # create flux list
    fluxes = selectedRxnDf['flux'].values

    # calculate dGO_r_primes
    dG0_r_primes = []
    for r in reactions:
        result = r.dG0_prime(pH = PH, ionic_strength = IS)
        #print(result)
        dG0_r_primes.append(result)

    # set up the model
    pp = Pathway(reactions = reactions, fluxes = fluxes, dG0_r_primes = dG0_r_primes, 
                 pH = PH, ionic_strength = IS, bounds = None)
    return pp

In [51]:
pp1 = make_pathway('ppi-pfk', allRxnDf)
pp1.bounds.SetBounds('COOOO1', 2,7 )
pp2 = make_pathway('atp-pfk', allRxnDf)
pp2.bounds.SetBounds('COOOO1', 3,23 )
print(pp1.bounds.GetBoundTuple('COOOO1'))
print(pp2.bounds.GetBoundTuple('COOOO1'))
print(DEFAULT_BOUNDS.GetBoundTuple('COOOO1'))

print(id(pp1.bounds))
print(id(pp2.bounds))
print(id(DEFAULT_BOUNDS))


(3, 23)
(3, 23)
(3, 23)
1982057204144
1982057204144
1982057204144


In [41]:
print(id(pp3.bounds))
print(id(pp4.bounds))
print(id(pp5.bounds))
print(id(pp6.bounds))
print(id(pp7.bounds))
print(id(pp10.bounds))
print(id(pp11.bounds))

b2 = Bounds.from_csv_filename('data/cofactors.csv')
b3 = Bounds.from_csv_filename('data/cofactors.csv')
print(id(b2))
print(id(b3))

1982057204144
1980234321872
1982057204144
1980225131016
1982057204144
1982057204144
1980226951936
1980227291008
1980227200056


In [39]:
pp11.bounds.SetBounds(atkDict['etoh'], 2,7 )
print(pp10.bounds.GetBoundTuple(atkDict['etoh']))
print(pp11.bounds.GetBoundTuple(atkDict['etoh']))
print(b2.GetBoundTuple(atkDict['etoh']))
 

(1, 10)
(2, 7)
(1e-06, 0.01)


In [38]:
b2.

<equilibrator_api.bounds.Bounds at 0x1cd0f29e748>

## Get measured metabolite data and use it to set bounds
* There are several datasets to choose from. For now, I'm going to focus on dataset 2, which we think is the highest quality.
* Note that the originaly analysis Josh did was based on dataset 1, which has more timepoints, but the sampling was done aerobically instead of anaerobically

In [None]:
# get processed metabolite data
metDf = pd.read_excel(r'metabolite_data/lt_dataset2_quantified.xlsx')

# Translate metabolite names to KEGG IDs using KNOWNS csv file
# The plaintext names of the compounds are slightly different from whats in the KEGG_SEED file
# Fortunately, the file also has KEGG IDs
stdsDf = pd.read_csv('metabolite_data/KNOWNS-Dan121218v2.csv')
stdsDf.set_index('compound', inplace = True)

# map KEGG IDs to the metDf dataframe
metDf['KEGG_ID'] = metDf['compound'].apply(lambda x: stdsDf.loc[x, 'id'])

# Fix ethanol data so that it's similar to other metabolites
etohGtoM = 46.07
etDf = metDf.loc[:, ('ID', 'Sample', 'Ethanol', 'Timepoint', 'Time (h)','Replicate', ' EtOH concentraion (g/L)')] # watch out for space in front of EtOH column name
etDf = etDf.drop_duplicates()
etDf['KEGG_ID'] = 'C00469' # KEGG ID for ethanol
etDf['compound'] = 'ethanol'
etDf['amount_int'] = etDf[' EtOH concentraion (g/L)']*(1/etohGtoM)*1e6 # ethanol concentration in the cells is the same as the concentration in the bulk solution, units of uM

# append ethanol data to existing metabolite dataframe
metDf2 = metDf.append(etDf, sort = False)
metDf2[:5]

In [None]:
# set bounds
# start with a pp object
# for each timepoint, make a new pp object, with a different set of bounds
def set_model_bounds(pp, metaboliteDf, metsToExclude = None):
    """
    Given a parsed pathway (pp) object and a dataframe with metabolite values,
    Generate a dataframe of new pp objects, each one with bounds corresponding to a timepoint
    from the metabolite data
    
    Args:
        pp: a parsed pathway object from equilibrator-api
        metaboliteDf: a dataframe with the following columns
           'Sample'
           'Time (h)'
           'KEGG_ID'
           'amount_int' - intracellular metabolite concentration in uM
        metsToExclude: a list of KEGG IDs to skip over when setting bounds
                       for example, metabolites whose concentration data we
                       don't trust
    """
    ppOriginal = pp

In [None]:
g = metDf2.groupby(['Sample', 'Replicate', 'Timepoint'])
for name, group in g:
    # loop through each metabolite in the group
    for row in group.iterrows():
        

In [None]:
for key in g.groups.keys():
    print(key)

In [None]:
g.get_group(('A', 1, 2))

In [None]:
def set_model_bounds(pp, setType = 'Cofactor'):
    """
    Given a pathway object, set bounds
    setType can be either 'Cofactor' or 'All'
    assume that boundsDf has already been created from the model file
    """
    newBoundsDf = boundsDf[boundsDf['Type'] == setType]
    for idx, metabolite in newBoundsDf.iterrows():
        pp.bounds.SetBounds(metabolite['KEGG_ID'], metabolite['Concentration:Min'], metabolite['Concentration:Max'])
        
    return pp
    

## Set up plotting

In [None]:
# function for plotting the results
def make_plots(fluxSet,  mdf_values, et_conc, compDf, rxnDf, netRxns, savePdf = False):
    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec

    fig = plt.figure( figsize=(15,15))
    fig.suptitle('Analysis of flux set: {}'.format(fluxSet), fontsize=16, )
    plt.figtext(x=0, y=0.95, s=netRxns[fluxSet], fontsize=10, wrap = True)
    gs = GridSpec(4,2, figure=fig)

    ax1 = fig.add_subplot(gs[0, 0]) # for MDF plot
    ax2 = fig.add_subplot(gs[1:, 0]) # for shadow price plot
    ax3 = fig.add_subplot(gs[:2, 1]) # metabolite concentrations
    ax4 = fig.add_subplot(gs[2:, 1]) # metabolite shadow prices

    # MDF plot
    ax1.plot(et_conc, mdf_values, '-o')
    ax1.set_title('MDF vs ethanol concentration')
    ax1.set_ylabel('MDF')
    ax1.set_xlabel('ethanol concentration (M)')
    ax1.set_xscale('log')
    ax1.axhline(y=0, color='r', linestyle='--')
    
    # reaction shadow price plot
    df = rxnDf.loc[:, 'shadow_price']
    sns.heatmap(df, linewidths = 0.05, ax = ax2, cmap='coolwarm', fmt = '.1f', annot = True, 
                annot_kws={'size':8, 'rotation':90}, vmin=-1, vmax=1)
    ax2.set_title('Reaction Shadow Price')
    ax2.set_ylabel('Reaction')
    ax2.set_xlabel('Ethanol concentration (M)')
    
    # concentration plot
    df = compDf.loc[:, 'concentration'].apply(np.log10)
    sns.heatmap(df, ax = ax3, cmap='coolwarm', linewidths = 0.05, fmt = '.1f', annot = True, annot_kws={'size':8},
               vmin=-6, vmax=-2)
    ax3.set_title('Log Concentration')
    ax3.set_ylabel('Metabolite')
    ax3.set_xlabel('Ethanol concentration (M)')

    # metabolite shadow price plot
    df = compDf.loc[:, 'shadow_price'] #.apply(np.log10)
    sns.heatmap(df, ax = ax4, cmap='coolwarm', linewidths = 0.05, fmt = '.1f', annot = True, annot_kws={'size':8},
               vmin=-2, vmax=2)
    ax4.set_title('Metabolite Shadow Price')
    ax4.set_ylabel('Metabolite')
    ax4.set_xlabel('Ethanol concentration (M)')

    plt.tight_layout(rect=[0, 0.03, 1, 0.93])
    
    # save the results as a PDF if that flag is set to True
    if savePdf:
        plt.savefig('etoh_analysis_{}.pdf'.format(fluxSet), bbox_inches = "tight")

## Analyze all flux sets

In [None]:
rm = ReactionMatcher() # for translating net reactions to plaintext
fluxSets = ['ppi-pfk'] # for testing, when I don't want to run through all of the flux sets
#fluxSets = allRxnDf.loc[:,'ppi-pfk':].columns.values 

et_conc = np.geomspace(0.01, 8, 15) # 15 points geometrically spaced from 0.01 to 4
#et_conc = [1.0] # for testing a single concentration of ethanol
netRxns = {} # dictionary to hold net reaction strings for each flux set

for fluxSet in fluxSets:
    print('\n\n\n********* Working on flux set {} ************'.format(fluxSet))
    pp = makePathway(fluxSet, allRxnDf)
    pp = set_model_bounds(pp) # set cofactor concentrations based on 'metabolite_bounds' sheet from Excel model
    # determine net reaction in plaintext
    dG0p = pp.net_reaction().dG0_prime() # float
    rnxTxt = rm.write_text_formula(pp.net_reaction())
    netRxnTxt = '∆G0={0:.2f}\n{1}'.format(dG0p, rnxTxt)
    netRxns[fluxSet] = netRxnTxt

    # calculate the min-max driving force for different ethanol concentrations
    mdf_result = [] # store list of mdf_result objects, in case we want to inspect them later
    mdf_values = [] # store MDF values for making MDF vs. ethanol plot
    compoundDfList = [] # hold report_compounds dataframes
    rxnDfList = [] # hold report_reactions dataframes
    for idx, c in enumerate(et_conc):
        pp.bounds.SetBounds(atkDict['etoh'], c,c )
        result = pp.calc_mdf()
        #result.mdf_plot.show() # show MDF vs. reaction plot
        mdf_result.append(result)
        mdf_values.append(result.mdf)

        # clean up dataframe for compounds
        df = pd.DataFrame(result.report_compounds)
        df['mets'] = df['compound'].map(ktaDict)
        df.set_index('mets', drop = True, inplace = True)
        nadRatio = df.loc['nad', 'concentration'] / df.loc['nadh', 'concentration'] # calculate nadRatio
        mi = pd.MultiIndex.from_product([[et_conc_rnd[idx]], df.columns], names = ['et_conc', 'data'])
        df.columns = mi
        compoundDfList.append(df)

        # clean up dataframe for reactions
        df2 = pd.DataFrame(result.report_reactions)
        rxnIds = list(r.reaction_id for r in pp.reactions)
        df2['name'] = rxnIds
        df2.set_index('name', drop = True, inplace = True)
        mi2 = pd.MultiIndex.from_product([[et_conc_rnd[idx]], df2.columns], names = ['et_conc', 'data'])
        df2.columns = mi2
        rxnDfList.append(df2)
        
        print('ethanol conc = {:.2f}, NAD/NADH ratio = {:.2f}'.format(et_conc[idx], nadRatio))
        
        # for checking metabolite bounds
        #mets = compDf.loc[:,('compound', 0.01)].values
        #bDf = bounds_to_df(mets, pp.bounds)
        #display(bDf)
        

    # merge all of the report_compound dataframes into one big one
    # arrange the compound results in a dataframe to make plotting easy
    compDf = pd.concat(compoundDfList, axis=1)
    compDf = compDf.swaplevel(axis = 'columns')
    
    rxnDf = pd.concat(rxnDfList, axis=1)
    rxnDf = rxnDf.swaplevel(axis = 'columns')
    
    # plot the results
    make_plots(fluxSet, mdf_values, et_conc, compDf, rxnDf, netRxns, savePdf = False)

In [None]:
# checking adenylate charge
df2 = df.loc[:,8.0]
atp = df2.loc['atp', 'concentration']
adp = df2.loc['adp', 'concentration']
amp = df2.loc['amp', 'concentration']

atpCharge = (atp + 0.5*adp)/(atp+adp+amp)
print(atp)
print(adp)
print(amp)
print(atpCharge)

In [None]:
make_plots(fluxSet, mdf_values, et_conc, compDf, rxnDf, netRxns, savePdf = False)