In [2]:
import json
import re
import xml.etree.ElementTree as ET
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [2]:
def get_soup_for_ecmdb_ID(ecmdb_id, timeout=30):
    """Fetch an ECMDB compound page and return its ``concentrations`` element.

    Parameters
    ----------
    ecmdb_id : str
        ECMDB compound identifier, e.g. ``'ECMDB20123'``.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout ``requests.get`` can
        block indefinitely on a stalled connection.

    Returns
    -------
    bs4.element.Tag or None
        The first ``concentrations`` element found in the response, or ``None``
        when the response contains no such element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        not silently treated as "no concentration data".
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    response = requests.get(f'https://ecmdb.ca/compounds/{ecmdb_id}', timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml-xml')
    return soup.find('concentrations')

In [3]:
def parse_concentrations_ecmdb(soup):
    """Turn an ECMDB ``concentrations`` XML fragment into a table.

    Elements are visited in document order. Every element with non-blank text
    contributes one cell, keyed by its tag; a new row is started as soon as a
    tag shows up that the current row already holds.

    Parameters
    ----------
    soup : object or None
        Anything whose ``str()`` is the XML fragment (e.g. a BeautifulSoup tag
        or a plain string). ``None`` - which is what ``soup.find`` returns when
        nothing matched - yields an empty frame instead of a parse error.

    Returns
    -------
    pandas.DataFrame
        One row per record and one column per tag (all values are strings).
        Empty when there is nothing to parse.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``str(soup)`` is not well-formed XML.
    """
    if soup is None:
        return pd.DataFrame()

    # Convert the soup to a string for parsing
    tree = ET.fromstring(str(soup))

    rows = []
    row = {}
    for element in tree.iter():
        text = element.text.strip() if element.text else ''
        if not text:
            continue
        # A repeated tag marks the start of the next record
        if element.tag in row:
            rows.append(row)
            row = {}
        row[element.tag] = text
    # Flush the last record, but do not emit an empty placeholder row
    if row:
        rows.append(row)

    return pd.DataFrame(rows)

In [4]:
def get_ecmdb_conc(ecmdb_id):
    """Return the ECMDB concentration records of one compound as a DataFrame.

    Fetches the ``concentrations`` element with ``get_soup_for_ecmdb_ID`` and
    hands it to ``parse_concentrations_ecmdb``.
    """
    return parse_concentrations_ecmdb(get_soup_for_ecmdb_ID(ecmdb_id))

# Get ECMDB id for all metabolites

In [3]:

data_folder = Path('../../data')
met_info_fn = data_folder / 'this_project' / '5_div' / '5C_metabolite_info.csv'
met_info = pd.read_csv(met_info_fn, index_col=0)

In [5]:
all_conc_fn = data_folder / 'this_project' / '5_div' / '5E_intracellular_concentrations.csv'


In [6]:
ecmdb_fn = data_folder / 'Other' / 'ecmdb.json' 
with open(ecmdb_fn, 'r') as f:
    ecmdb = json.load(f)

In [7]:
ecm_df = pd.DataFrame(ecmdb)

# Used to get ECMDB IDs from names matched by Gemini

In [20]:
ecmdb_mapping_fn = 'metabolite_to_ecmdb.csv'
conc_df = pd.read_csv(ecmdb_mapping_fn)

In [21]:
# Resolve every matched ECMDB name to its ECMDB ID.
# A name only gets an ID when it matches exactly one ECMDB entry; names with
# no match or with several matches are recorded as None.
ecmdb_id = []
for ecmdb_name in conc_df['ECMDB name']:
    hits = ecm_df.loc[ecm_df.name == ecmdb_name, 'met_id']
    ecmdb_id.append(hits.values[0] if len(hits) == 1 else None)
conc_df['ECMDB ID'] = ecmdb_id

In [50]:
# index=False: this file is read back with a plain pd.read_csv (no index_col),
# so a written index would come back as an extra 'Unnamed: 0' column on every
# save/reload cycle.
conc_df.to_csv(ecmdb_mapping_fn, index=False)

# Extract all concentration data from ECMDB for relevant metabolites

In [64]:
# Download the ECMDB concentration table of every metabolite that has an ECMDB ID.
# Metabolites whose table comes back empty are left out of conc_data.
conc_data = {}
for _, row in conc_df.iterrows():
    met_ecmdb_id = row['ECMDB ID']
    # Unmatched metabolites hold None in-session but NaN once the mapping has
    # been through a CSV round-trip; NaN is truthy, so test for "missing" explicitly.
    if pd.isna(met_ecmdb_id) or not met_ecmdb_id:
        continue
    conc_dfi = get_ecmdb_conc(met_ecmdb_id)
    if conc_dfi.size:
        conc_data[row['Metabolite']] = conc_dfi

New!
New!


KeyboardInterrupt: 

In [125]:
conc_dfi = get_ecmdb_conc('ECMDB20123')
conc_dfi['Metabolite'] = 'Fructose-1,6-bisphosphate'
conc_dfi['#'] = list(np.arange(len(conc_dfi)))
# conc_data['Alpha-ketoglutarate'] = conc_dfi

New!
New!
New!
New!
New!
New!


In [126]:
all_conc = pd.concat([all_conc, conc_dfi], ignore_index=True).reset_index(drop=True)

In [36]:
all_conc = pd.concat(conc_data).reset_index()

In [38]:
all_conc.rename(columns={'level_0': 'Metabolite', 'level_1': '#'}, inplace=True)

In [127]:
all_conc.to_csv(all_conc_fn)

In [112]:
all_conc = pd.read_csv(all_conc_fn, index_col=0)

In [305]:
# Number of concentration records per metabolite.
# value_counts + map replaces one full scan of all_conc per metabolite;
# metabolites without any record get 0 and the counts stay integers.
n_points = all_conc['Metabolite'].value_counts()
conc_df['# data points'] = conc_df['Metabolite'].map(n_points).fillna(0).astype(int)
    

In [306]:
conc_df.head(40)

Unnamed: 0,Metabolite,ECMDB name,ECMDB ID,# data points
0,2-phosphoglycerate,2-Phospho-D-glyceric acid,ECMDB04129,1.0
1,3-phosphoglycerate,3-Phosphoglycerate,ECMDB04131,3.0
2,Acetate,Acetic acid,ECMDB00042,1.0
3,Alanine,L-Alanine,ECMDB00161,9.0
4,Alpha-aminoadipate,Aminoadipic acid,ECMDB00510,0.0
5,Alpha-aminobutyrate,,,0.0
6,Alpha-ketoglutarate,alpha-Ketoglutarate,ECMDB02812,6.0
7,Arginine,L-Arginine,ECMDB00517,6.0
8,Asparagine,L-Asparagine,ECMDB00168,8.0
9,Aspartate,L-Aspartic acid,ECMDB00191,8.0


## Categorize media

In [307]:
# Maps a growth-medium description (looked up verbatim against the `growth_media`
# column) to a tuple (minimal/complex, carbon source, carbon-source concentration).
# The tuple order matches the columns 'Minimal/complex', 'Carbon source' and
# 'CS concentration [g/L]' that are built from this dict; np.nan is used where no
# concentration is given for a complex medium.
# NOTE(review): two keys end mid-word ('... 4 g/L Gluco' and '... 0.5 mg/L Na2MoO4, 10')
# - presumably truncated upstream; confirm they still match the data verbatim and that
# the 30 g/L assigned to the latter is correct, since the key itself does not show it.
media_categorization = {
    'Gutnick minimal complete medium (4.7 g/L KH2PO4; 13.5 g/L K2HPO4; 1 g/L K2SO4; 0.1 g/L MgSO4-7H2O; 10 mM NH4Cl) with 4 g/L glucose': ('Minimal', 'Glucose', 4),
'Gutnick minimal complete medium (4.7 g/L KH2PO4; 13.5 g/L K2HPO4; 1 g/L K2SO4; 0.1 g/L MgSO4-7H2O; 10 mM NH4Cl) with 4 g/L glycerol': ('Minimal', 'Glycerol', 4),
'Gutnick minimal complete medium (4.7 g/L KH2PO4; 13.5 g/L K2HPO4; 1 g/L K2SO4; 0.1 g/L MgSO4-7H2O; 10 mM NH4Cl) with 4 g/L acetate': ('Minimal', 'Acetate', 4),
'Luria-Bertani (LB) media': ('Complex', 'LB', np.nan),
'48 mM Na2HPO4, 22 mM KH2PO4, 10 mM NaCl, 45 mM (NH4)2SO4, supplemented with 1 mM MgSO4, 1 mg/l thiamine·HCl, 5.6 mg/l CaCl2, 8 mg/l FeCl3, 1 mg/l MnCl2·4H2O, 1.7 mg/l ZnCl2, 0.43 mg/l CuCl2·2H2O, 0.6 mg/l CoCl2·2H2O and 0.6 mg/l Na2MoO4·2H2O.  4 g/L Gluco': ('Minimal', 'Glucose', 4),
'4.0 g/L Na2SO4; 5.36 g/L (NH4)2SO4; 1.0 g/L NH4Cl; 7.3 g/L K2HPO4; 1.8 g/L NaH2PO4 H2O; 12.0 g/L (NH4)2-H-citrate; 4.0 mL/L MgSO4 (1 M); 6.0 mL/L trace element solution; 0.02 g/L thiamine, 20 g/L glucose': ('Minimal', 'Glucose', 20),
'0.2 g/L NH4Cl, 2.0 g/L (NH4)2SO4, 3.25 g/L KH2PO4, 2.5 g/L K2HPO4, 1.5 g/L NaH2PO4, 0.5 g/L MgSO4; trace substances: 10 mg/L CaCl2, 0.5 mg/L ZnSO4, 0.25 mg/L CuCl2, 0.25 mg/L  MnSO4, 0.175 mg/L CoCl2, 0.125 mg/L H3BO3, 2.5 mg/L AlCl3, 0.5 mg/L Na2MoO4, 10': ('Minimal', 'Glucose', 30),
'M9 Minimal Media, 4 g/L Glucose': ('Minimal', 'Glucose', 4),
'199 Medium with Earle’s salts –which contains 21 amino acids, 17 vitamins, 10 components of nucleic acids, sodium acetate, glucose, NaC1, KCl, CaC12, MgS04, Na2HP04, and Fe(N03)3': ('Complex', '199 Medium', np.nan)
}

In [129]:
# Expand the (minimal/complex, carbon source, concentration) tuple of every row's
# growth medium into three columns.
# - A medium missing from media_categorization becomes an all-NaN row; mapping it
#   directly would leave a bare NaN in place of a 3-tuple.
# - The frame reuses all_conc's index so that an index-based join lines up row by
#   row even when all_conc does not carry a default 0..n-1 index.
temp = pd.DataFrame(
    [media_categorization.get(medium, (np.nan, np.nan, np.nan)) for medium in all_conc.growth_media],
    columns=['Minimal/complex', 'Carbon source', 'CS concentration [g/L]'],
    index=all_conc.index,
)

In [128]:
# Remove categorisation columns left over from an earlier run so that joining the
# freshly computed ones does not collide with them. errors='ignore' keeps the cell
# runnable when the columns are not there yet (e.g. on a first run).
all_conc = all_conc.drop(columns=['Minimal/complex', 'Carbon source', 'CS concentration [g/L]'], errors='ignore')

In [130]:
all_conc = all_conc.join(temp)

In [176]:
all_conc.to_csv(all_conc_fn)

# Add data from Park et al., 2016


In [77]:
park_fn = data_folder / 'park_2016' / 'park_2016_natchembio_concentrations.csv'
park_df = pd.read_csv(park_fn, skiprows=3)

In [78]:
name_to_park = met_info.set_index('Metabolite')['Park et al. name'].to_dict()

In [153]:
# The backslash is written as '\\': a lone '\ ' is not a valid escape sequence and
# newer Python versions warn about it. The column name being matched is unchanged.
park_df = park_df.rename(columns={'Metabolite[compartment] \\ Concentration(M)': 'Metabolite'})

In [154]:
park_df.columns

Index(['Metabolite', 'BiGG ID', 'KEGG ID', 'Brenda ID', 'Mammalian iBMK',
       'L.B.', 'U.B.', 'Yeast', 'L.B..1', 'U.B..1', 'E. coli', 'L.B..2',
       'U.B..2'],
      dtype='object')

In [172]:
# Most metabolites in this spreadsheet are from Bennett et al., 2009, Nat Chem Biol https://doi.org/10.1038/nchembio.186
# These are also in ECMDB, but there are a few more values in this sheet to add
ref_text = 'Park et al., 2016, Nat Chem Biol'
metabolites_to_add_from_park = ['2-phosphoglycerate', 'Erythrose-4-phosphate','Glyceraldehyde-3-phosphate', 'Isocitrate', 'Oxaloacetate', 'Sedoheptulose 7-phosphate']
park_data = []
for mname in metabolites_to_add_from_park:
    pname = name_to_park[mname]
    # Look the metabolite up once instead of once per column, and fail with a
    # readable message when the Park et al. name is not in the sheet.
    park_rows = park_df.loc[park_df.Metabolite==pname]
    if park_rows.empty:
        raise KeyError(f'No row named {pname!r} (for {mname!r}) in the Park et al. table')
    park_row = park_rows.iloc[0]
    conc  = float(park_row['E. coli'])
    lb = float(park_row['L.B..2'])
    ub = float(park_row['U.B..2'])
    # NOTE(review): this is the full L.B.-U.B. width, whereas the other sources in
    # this notebook store a one-sided error (upper bound minus value, or 1.96*SEM).
    # Confirm which convention is intended before comparing errors across sources.
    error = ub - lb
    # The sheet's header gives concentrations in M; 1e6 converts to uM.
    # The value order has to match all_conc.columns, which the rows are later labelled with.
    park_data.append([mname, 0, 'Gutnick minimal medium', 'Bioreactor', conc*1e6, 'uM', error*1e6, '37', 'NCM3722', 'Log-phase', None, None, ref_text, '27159581', None, 'Minimal', 'Glucose', 4])

In [173]:
park_df = pd.DataFrame(park_data, columns=all_conc.columns)

In [175]:
all_conc = pd.concat([all_conc, park_df], ignore_index=True).reset_index(drop = True)

# Add data from Thorfinnsdottir et al.

In [251]:
thd_fn = data_folder / 'Thorfinnsdottir_2022/Supplementary Table S2 - Thorfinnsdottir et al.csv'
thd_df = pd.read_csv(thd_fn, skiprows=2)

In [252]:
thd_df.head()

Unnamed: 0.1,Unnamed: 0,Cultivation condition,Biological replica,Technical,Asn,Asp,ATP,cAMP,CDP,2/3-PG,...,Pro,PRPP,R5P,S7P,Ser,Suc,UDP-GlcNac,UMP,UTP,Val
0,Mineral,Bioreactor,A,1,0.00015,0.00023,0.0032,1.3e-05,0.00012,0.00038,...,0.00016,0.00012,0.00022,7e-05,0.00028,0.0018,0.00076,0.0032,0.0012,0.00023
1,Mineral,Bioreactor,A,2,8.6e-05,0.00015,0.0025,1.5e-05,0.00012,0.00023,...,0.00012,5e-05,0.00017,5.6e-05,0.00025,0.0014,0.00067,0.0025,0.00095,0.00016
2,Mineral,Bioreactor,A,3,0.00018,0.00016,0.0028,1.2e-05,0.00011,0.00028,...,0.0002,8e-05,0.00015,6.1e-05,0.00026,0.002,0.00079,0.0029,0.0012,0.00046
3,Mineral,Bioreactor,A,4,0.00014,0.00016,0.0025,1.1e-05,0.00014,0.00025,...,0.00016,5.5e-05,0.00017,5.4e-05,0.00024,0.0015,0.00072,0.0025,0.001,0.00036
4,Mineral,Bioreactor,B,1,0.0001,0.00016,0.0031,1.5e-05,0.00013,0.00033,...,0.00012,0.00011,0.00022,8.2e-05,0.00026,0.0022,0.00079,0.0031,0.0012,0.00016


In [253]:
thd_df.rename(columns={'Unnamed: 0': 'Medium'}, inplace=True)
thd_df.columns

Index(['Medium', 'Cultivation condition', 'Biological replica', 'Technical',
       'Asn', 'Asp', 'ATP', 'cAMP', 'CDP', '2/3-PG', '6PG', 'ADP', 'aKG',
       'Ala', 'AMP', 'Arg', 'dCDP', 'dCMP', 'dCTP', 'dGDP', 'cGMP', 'Cit',
       'CMP', 'CTP', 'Cys', 'dADP', 'dAMP', 'dATP', 'F6P', 'FAD', 'Fum',
       'G-/M-1P', 'dGMP', 'dGTP', 'dTDP', 'dTMP', 'dTTP', 'dUMP', 'F1,6BP',
       'F1P', 'GMP', 'GTP', 'His', 'ICit', 'G6P', 'GA6P', 'GAL1P', 'GDP',
       'GL3P', 'Gln', 'Glu', 'Gly', 'NAD', 'NADH', 'NADP', 'NADPH', 'Ile',
       'IMP', 'ITP', 'Leu', 'Lys', 'M6P', 'Mal', 'Met', 'Thr', 'Trp', 'Tyr',
       'UDP', 'PEP', 'Phe', 'Pro', 'PRPP', 'R5P', 'S7P', 'Ser', 'Suc',
       'UDP-GlcNac', 'UMP', 'UTP', 'Val'],
      dtype='object')

In [254]:
thd_meta_columns = list(thd_df.columns[:4])

In [11]:
strain = 'K12 MG1655'
# (Medium, Cultivation condition) as found in the table ->
# [growth medium label, growth system, minimal/complex, carbon source, carbon-source concentration]
conditions = {('Mineral', 'Bioreactor'): ['Mineral medium', 'Bioreactor', 'Minimal', 'Glucose', 10],
              ('Mineral', 'Shake flask'): ['Mineral medium', 'Shake flask', 'Minimal', 'Glucose', 4],
              ('Rich', 'Shake flask'): ['Rich medium', 'Shake flask','Complex', 'Rich', np.nan]
}
ref_text = 'Thorfinnsdottir et al., 2023, Metabolites'
condition_cols = ['Medium', 'Cultivation condition']
td_data = []
for _, row in met_info.iterrows():
    abbrv = row['Thorfinnsdottir et al. abbrv']
    # Only metabolites with a string abbreviation have a column in the table
    if not isinstance(abbrv, str):
        continue
    # Average the technical replicates first so that every biological replicate
    # enters the condition-level statistics exactly once.
    tech_mean = thd_df.groupby(condition_cols + ['Biological replica'])[abbrv].mean()
    bio_stats = tech_mean.groupby(level=condition_cols).agg(['mean', 'sem']).reset_index()
    for j, stats in bio_stats.iterrows():
        # The carbon-source concentration depends on the condition, so it is taken
        # from the lookup above rather than from a single global value.
        medium, system, mincomp, cs, cs_conc = conditions[(stats['Medium'], stats['Cultivation condition'])]
        # 1e6 to convert to uM
        conc = stats['mean']*1e6
        # 1.96 * SEM: half-width of a normal-approximation 95 % confidence interval
        error = 1.96*stats['sem']*1e6
        td_data.append([row['Metabolite'], j, medium, system, conc, 'uM', error, '37', strain, 'Log-phase',  None, None, ref_text,'36837769', None, mincomp, cs, cs_conc])
            

NameError: name 'thd_df' is not defined

In [303]:
df_td = pd.DataFrame(td_data, columns=all_conc.columns)

In [304]:
all_conc = pd.concat([all_conc, df_td], ignore_index=True).reset_index(drop = True)

In [308]:
all_conc.to_csv(all_conc_fn)

# 

# Add data from this work

In [159]:
all_conc = pd.read_csv(all_conc_fn, index_col=0)

In [35]:
all_conc.sort_values(by='Metabolite', inplace=True)

In [41]:
# all_conc.drop(columns=['#', 'internal', 'molecules', 'molecules_error']).reset_index(drop=True).to_csv(all_conc_fn)


In [9]:
intra_fn = data_folder / 'this_project/4_paired_metabolomics_live_dead/4B_intracellular_conc_nmol_per_gprotein.csv'
df_intra = pd.read_csv(intra_fn, index_col=0)
# df_intra.sort_values(by=)

# Need to convert nmol per g to uM

In [42]:
# Protein density, used further down to turn amounts per gram protein into
# amounts per litre.
protein_density = 13.5*1e-8#  μg/μm^3 From Mori et al., 2017 (See SI) https://www.nature.com/articles/s41467-017-01242-8
# Protein density seems to be not correlated with growth rate

cubic_um_to_L = 1e-15 # L/um^3
# μg/μm^3 -> μg/L (divide by L per μm^3) -> g/L (1e-6 g per μg); evaluates to 135.0.
# NOTE: the name is misspelled ('denisty') but other cells reference it under this
# spelling, so it is kept as is.
protein_denisty_g_per_L = protein_density/cubic_um_to_L*1e-6

In [23]:
protein_denisty_g_per_L

135.0

In [27]:
df_intraL = df_intra.melt(id_vars=df_intra.columns[:4], var_name='Metabolite', value_name='Concentration [nmol/g protein]')

In [31]:
df_intraL['Concentration [uM]'] = df_intraL['Concentration [nmol/g protein]']*protein_denisty_g_per_L*1e-3

In [48]:
all_conc.columns

Index(['Metabolite', 'growth_media', 'growth_system', 'concentration',
       'concentration_units', 'error', 'temperature', 'strain',
       'growth_status', 'reference_text', 'pubmed_id', 'Minimal/complex',
       'Carbon source', 'CS concentration [g/L]'],
      dtype='object')

In [64]:
# Mean, SEM and standard deviation of the uM concentration for every
# (carbon source, metabolite) pair; the result has two-level column labels.
_conc_long = df_intraL[['Concentration [uM]', 'Carbon source', 'Metabolite']]
_per_condition = _conc_long.groupby(['Carbon source', 'Metabolite'])
df_intra_mean = _per_condition.agg(['mean', 'sem', 'std']).reset_index()

In [67]:
df_intra_mean.columns = ['Carbon source', 'Metabolite', 'mean', 'sem', 'std']

In [69]:
strain = 'K12 MG1655'

# Molecular weights used to express the carbon-source concentrations below
gal_mol_weight = 180.156
mal_mol_weight = 134.09
ala_mol_weight = 89.09

# Carbon-source concentration per condition, stored in the 'CS concentration [g/L]' column.
# NOTE(review): the factors 20 / 30 / 40 are presumably mM (mM * g/mol * 1e-3 = g/L) - confirm.
concentration = {
    'Galactose': 20*gal_mol_weight*1e-3,
    'L-malate': 30*mal_mol_weight*1e-3,
    'L-alanine': 40*ala_mol_weight*1e-3
}

# Values shared by every row: temperature, strain, growth status, reference text,
# PubMed id (empty) and minimal/complex.
conditions = [37, strain, 'Log-phase', 'This work', "", 'Minimal']

# One record per (carbon source, metabolite); the error is 1.96 * SEM.
this_data = [
    [rec['Metabolite'], 'M9', 'Shake flask', rec['mean'], 'uM', rec['sem']*1.96]
    + conditions
    + [rec['Carbon source'], concentration[rec['Carbon source']]]
    for _, rec in df_intra_mean.iterrows()
]


In [70]:
this_df = pd.DataFrame(this_data, columns=all_conc.columns)

In [72]:
all_conc = pd.concat([all_conc, this_df], ignore_index=True).reset_index(drop = True)

In [73]:
all_conc.sort_values(by='Metabolite', inplace=True)


In [74]:
all_conc.to_csv(all_conc_fn)

# Add error estimates from Bennett et al., for those data points missing that info from ECMDB

In [187]:
# Build the path from data_folder like the other inputs instead of repeating
# the '../../data' prefix as a string.
bennet_fn = data_folder / 'bennet_2009' / 'intracellular_concentrations.csv'
bennet_df = pd.read_csv(bennet_fn)

In [192]:
# Bennett et al. names are derived from the Park et al. names by removing the trailing
# '[c]' tag and/or '*' marker.
# This is done with an anchored regex rather than str.rstrip('[c]*'): rstrip treats its
# argument as a set of characters, so it would also strip a genuine trailing 'c', '['
# or ']' from the metabolite name itself (e.g. a name ending in '...Ac[c]').
name_to_bennet = {key: re.sub(r'(?:\[c\]|\*)+$', '', value) for key, value in name_to_park.items() if isinstance(value, str)}
# Manual overrides: metabolites that Bennett et al. report under a different or pooled name
name_to_bennet['Cis-aconitate']='aconitate'
name_to_bennet['Trans-aconitate']='aconitate'
name_to_bennet['Leucine'] = 'isoleucine+leucine'
name_to_bennet['Isoleucine'] = 'isoleucine+leucine'
name_to_bennet['NAD']='NAD'
name_to_bennet['Ribose-5-phosphate'] = 'pentose-P'
name_to_bennet['Ribulose-5-phosphate'] = 'pentose-P'
name_to_bennet['Xylulose-5-phosphate'] = 'pentose-P'

In [189]:
# park_to_name = met_info.set_index('Park et al. name')['Metabolite'].to_dict()
# park_to_name = {key.rstrip('[c]*'): value for key, value in park_to_name.items() if isinstance(key, str)}

In [193]:
# Fill in the error (upper bound minus value, in uM) for Bennett et al. rows whose
# error is 0, and correct the concentration where it disagrees with the Bennett table.
# Pooled entries are split evenly over the metabolites they cover, so both the
# value and its error are divided by the number of pooled members.
pooled_divisor = {'isoleucine+leucine': 2.0, 'aconitate': 2.0, 'pentose-P': 3.0}

for i, row in all_conc.iterrows():
    ref = row['reference_text']
    # A missing reference is not a string; skip it instead of failing on .lower()
    if not (isinstance(ref, str) and ref.lower().startswith('bennett')):
        continue
    bname = name_to_bennet[row['Metabolite']]
    if row['error'] != 0:
        continue
    idx = bennet_df.Metabolite==bname
    if not idx.any():
        # Report the unmatched row and carry on: stopping here would silently leave
        # every later Bennett row without an error estimate.
        print(f'No entry named {bname!r} in the Bennett et al. table for:')
        print(row)
        continue

    divisor = pooled_divisor.get(bname, 1.0)
    carbon_source = row['Carbon source']
    value = bennet_df.loc[idx, carbon_source].values[0]
    upper = bennet_df.loc[idx, f"{carbon_source}-UB"].values[0]

    # 1e6 scaling mirrors the uM values stored in all_conc
    conc_bennet = value*1e6/divisor
    if np.round(conc_bennet, 3)!=row['concentration']:
        print(f'Set {bname} to {conc_bennet}')
        all_conc.at[i, 'concentration'] = conc_bennet

    all_conc.at[i, 'error'] = (upper-value)*1e6/divisor


Set pentose-P to 228.66666666666666
Set pentose-P to 340.00000000000006
Set pentose-P to 440.0


In [194]:
all_conc.to_csv(all_conc_fn)