In [10]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Adding protein abundance data
This paper has absolute protein abundance data for different E. coli strains in different conditions. The goal of this notebook is to extract protein abundance data and Ribo-Seq data for E. coli grown in glucose M9 (or similar) to get an idea of which transporters that are constitutively expressed 


In [11]:
# Data
folder = Path('/Users/ssulheim/Library/CloudStorage/OneDrive-UniversitédeLausanne/UNIL/leakage/data/mori_msb_2021_proteome_ecoli')
fn2 = folder / 'msb20209536-sup-0003-datasetev2.xlsx' # Sample info
fn3 = folder / 'msb20209536-sup-0004-datasetev3.xlsx' # Sample info
fn6 = folder / 'msb20209536-sup-0007-datasetev6.xlsx' # Data for "calibration samples on E. coli in MOPS medium"
fn8 = folder / 'msb20209536-sup-0009-datasetev8.xlsx' # Data for samples listed in fn2
fn9 = folder / 'msb20209536-sup-0010-datasetev9.xlsx' # Data for samples listed in fn3



selected_transporters = Path('/Users/ssulheim/Library/CloudStorage/OneDrive-UniversitédeLausanne/UNIL/leakage/experimental work/KEIO_transporter_knockouts/selected_transporters.xlsx')
df_sel = pd.read_excel(selected_transporters, skiprows=4, usecols='A:Q')
# df_sel = pd.read_excel(selected_transporters, sheet_name='Table EV1A', skiprows=3, usecols='A:H').iloc[1:]


In [12]:
df6 = pd.read_excel(fn6, sheet_name='EV6-CalibrationSamplesProteins', header = [0,1])
df8 = pd.read_excel(fn8, sheet_name='EV8-AbsoluteMassFractions-1')
df9 = pd.read_excel(fn9, sheet_name='EV9-AbsoluteMassFractions-2')


In [13]:
df6.rename(columns={"Unnamed: 0_level_1":'',	"Unnamed: 1_level_1":'',	"Unnamed: 2_level_1":'',	"Unnamed: 3_level_1":'', "Unnamed: 4_level_1":''}, level = 1, inplace=True)
df6.columns = ['_'.join(col) if len(col[1]) else col[0] for col in df6.columns]

In [14]:
xtop1_cols = [x for x in df6.columns if 'xTop' in x]

In [15]:
# idx = np.where(df6['Gene locus'] == gene_name)[0][0]

In [16]:
df6_drop = ['TopPep1_A1-1', 'TopPep1_A1-2', 'TopPep1_A1-3', 'TopPep1_C1',
       'TopPep1_F1-1', 'TopPep1_F1-2', 'TopPep1_F1-3', 'TopPep3_A1-1',
       'TopPep3_A1-2', 'TopPep3_A1-3', 'TopPep3_C1', 'TopPep3_F1-1',
       'TopPep3_F1-2', 'TopPep3_F1-3', 'iBAQ_A1-1', 'iBAQ_A1-2', 'iBAQ_A1-3',
       'iBAQ_C1', 'iBAQ_F1-1', 'iBAQ_F1-2', 'iBAQ_F1-3']
df6.drop(columns=df6_drop, inplace=True)

In [17]:
gene_locus_to_weight = df6.set_index('Gene locus')['Molecular weight (kDa)'].to_dict()

In [18]:
# Convert from mass fractions to number fractions
total_mass = {x: np.sum(df6[x]/df6['Molecular weight (kDa)']) for x in xtop1_cols}
for key, value in total_mass.items():
    df6[key] = (df6[key]/df6['Molecular weight (kDa)'])/value

In [19]:
selected_cols_df8 = ["Lib-24", "Lib-25", "Lib-26", "Lib-27", "Lib-28", "Lib-29", "Lib-30", "Lib-06"] # "Lib-00-A1", "Lib-00-A2", "Lib-00-A3", "Lib-00-B1", "Lib-00-B2", "Lib-00-B3"

selected_cols_df9 = ["A1-1", "A1-2", "A1-3", "C1", "F1-1", "F1-2", "F1-3", "C2", "C3", "C4", "C5", "C6", "C7", "C8", "D6", "D7", "D8", "F4", "F5", "F6", "F7", "F8"]

In [20]:
#Convert df8 to number fractions
df8['Molecular weight (kDa)'] = df8['Gene locus'].map(gene_locus_to_weight)
df9['Molecular weight (kDa)'] = df9['Gene locus'].map(gene_locus_to_weight)

total_mass_8 = {x: np.sum(df8[x]/df8['Molecular weight (kDa)']) for x in selected_cols_df8}
total_mass_9 = {x: np.sum(df9[x]/df9['Molecular weight (kDa)']) for x in selected_cols_df9}
for key, value in total_mass_8.items():
    df8[key] = (df8[key]/df8['Molecular weight (kDa)'])/value
    
for key, value in total_mass_9.items():
    df9[key] = (df9[key]/df9['Molecular weight (kDa)'])/value



In [21]:
df8_all =  ['Lib-01', 'Lib-02', 'Lib-03','Lib-04', 'Lib-05', 'Lib-06', 'Lib-07', 'Lib-08', 'Lib-09', 'Lib-10',
       'Lib-11', 'Lib-12', 'Lib-13', 'Lib-14', 'Lib-15', 'Lib-16', 'Lib-17',
       'Lib-18', 'Lib-19', 'Lib-20', 'Lib-21', 'Lib-22', 'Lib-23', 'Lib-24',
       'Lib-25', 'Lib-26', 'Lib-27', 'Lib-28', 'Lib-29', 'Lib-30']

In [22]:
df9_all = ['A1-1', 'A1-2', 'A1-3', 'C1',
       'F1-1', 'F1-2', 'F1-3', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'D6',
       'D7', 'D8', 'F4', 'F5', 'F6', 'F7', 'F8', 'D1', 'D2', 'D3', 'D4', 'D5',
        'F2', 'F3', 'A2', 'E1', 'E2', 'E3', 'E4', 'H1', 'H5']

In [25]:
# values8 = df8.loc[idx8, selected_cols_df8]

In [26]:
mean_protein_number_fraction = []
std_protein_number_fraction = []
detected_in_all_c_limit = []
detected_in_all = []
ribo_seq = []
for i, row in df_sel.iterrows():
    try:
        idx6 = np.where(df6['Gene locus']==row['Blattner ID'])[0][0]
        idx8 = np.where(df8['Gene locus']==row['Blattner ID'])[0][0]
        idx9 = np.where(df8['Gene locus']==row['Blattner ID'])[0][0]
    except IndexError:
        mean_protein_number_fraction.append(None)
        std_protein_number_fraction.append(None)
        detected_in_all_c_limit.append(None)
        detected_in_all.append(None)
        ribo_seq.append(None)
    else:
        values6 = df6.loc[idx6, xtop1_cols].values
        values8 = df8.loc[idx8, selected_cols_df8].values
        values9 = df9.loc[idx9, selected_cols_df9].values
        values = np.concatenate([values6, values8, values9])
        mean_protein_number_fraction.append(np.mean(values))
        std_protein_number_fraction.append(np.std(values))
        detected_in_all_c_limit.append(np.sum(values>0)/len(values))
        all_values = np.concatenate([df6.loc[idx6, xtop1_cols].values, 
                                     df8.loc[idx8, df8_all].values,
                                     df9.loc[idx9, df9_all].values])
        detected_in_all.append(np.sum(all_values>0)/len(all_values))
            
        ribo_seq.append(df6.loc[idx6, 'Ribosome profiling (Li et al., 2014)'])

In [27]:
df_sel['Mean protein number fraction'] = mean_protein_number_fraction
df_sel['Std protein number fraction'] = std_protein_number_fraction
df_sel['Detected in fraction of all conditions'] = detected_in_all
df_sel['Detected in fraction of all C-limit conditions'] = detected_in_all_c_limit
df_sel['Ribosome profiling (Li et al., 2014)'] = ribo_seq

In [28]:
df_sel.tail()

Unnamed: 0,Sample Number,JW ID,Blattner ID,Gene Name,Annotation,Growth rate in glucose + AA medium,Selection score (0-3),Location,Transport,Class,...,TCID,biocyc link,Comment,Paper,Location in operon,Mean protein number fraction,Std protein number fraction,Detected in fraction of all conditions,Detected in fraction of all C-limit conditions,"Ribosome profiling (Li et al., 2014)"
103,104.0,JW4038,b4077,gltP,glutamate/aspartate:proton symporter,0.86,3.0,Inner membrane,"L-aspartate, L-glutamate",dicarboxylate/amino acid:cation symporter (DAA...,...,,,GltP accounts for approximately 60% of the tot...,,,0.0,0.0,0.0,0.0,6813924.0
104,105.0,JW0009,b0010,satP,acetate/succinate:H+ symporter,0.64,3.0,Inner membrane,"Acetate, succinate",Acetate Uptake Transporter (AceTr),...,,https://biocyc.org/gene?orgid=ECOLI&id=EG11512,,,,0.0,0.0,0.0,0.0,889694.0
105,106.0,JW1718,b1729,tcyP,cystine/sulfocysteine:cation symporter,0.83,1.0,Inner membrane,L-cystine,dicarboxylate/amino acid:cation symporter (DAA...,...,,https://biocyc.org/gene?orgid=ECOLI&id=G6934,,,,0.000423,0.000192,1.0,1.0,128468400.0
106,107.0,JW2910,b2943,galP,D-galactose transporter,0.84,2.0,Inner membrane,"Galactose, D-glucose",MFS,...,,https://biocyc.org/gene?orgid=ECOLI&id=EG12148,,,,0.0,0.0,0.0,0.0,11495660.0
107,,JW1652,b1660,punC,predicted transporter,0.76,3.0,Inner membrane,Purines,MFS,...,,https://ecocyc.org/gene?orgid=ECOLI&id=YDHC-MO...,While the punC deletion in general has a negat...,https://pubmed.ncbi.nlm.nih.gov/34413462/,,0.0,0.0,0.0,0.0,2022080.0


In [29]:
df_sel['log10(Mean protein number fraction)'] = np.log10(df_sel['Mean protein number fraction'])
df_sel['log10(Ribosome profiling)'] = np.log10(df_sel['Ribosome profiling (Li et al., 2014)'])

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [30]:
df_sel.drop(columns=['Location in operon'], inplace=True)
df_sel.replace(-np.inf, np.nan, inplace=True)

In [31]:
fn_new = '/Users/ssulheim/Library/CloudStorage/OneDrive-UniversitédeLausanne/UNIL/leakage/experimental work/KEIO_transporter_knockouts/selected_transporters_with_exp.xlsx'
# fn_new = '/Users/ssulheim/Library/CloudStorage/OneDrive-UniversitédeLausanne/UNIL/leakage/experimental work/KEIO_transporter_knockouts/keio_KO_with_proteome_data.xlsx'
df_sel.to_excel(fn_new, index=False)

In [420]:
# df_sel['biocyc link'] = df_sel['biocyc link'].apply(lambda x: f'=HYPERLINK("{x}", "{str(x).split('=')[-1]}")')

In [327]:
df_sel

Unnamed: 0,Sample Number,JW ID,Blattner ID,Gene Name,Annotation,Growth rate in glucose + AA medium,Selection score (0-3),Location,Transport,Class,...,TCID,biocyc link,Comment,Paper,Location in operon,Mean protein number fraction,Std protein number fraction,Detected in fraction of all conditions,Detected in fraction of all C-limit conditions,"Ribosome profiling (Li et al., 2014)"
0,1,JW0401,b0411,tsx,"nucleoside channel, receptor of phage T6 and c...",0.92,1.0,Outer membrane,Nucleoside,Porin,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,0.000576,0.000149,1.000000,1.000000,1.524697e+08
1,2,JW2203,b2215,ompC,outer membrane porin protein C,0.89,3.0,Outer membrane,Unspecific,Porin,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",OpmC is reciprocally regulated with ompF. Both...,10.1074/jbc.M602112200,,0.004423,0.004462,1.000000,1.000000,2.483005e+09
2,3,JW0912,b0929,ompF,outer membrane porin 1a (Ia;b;F),0.77,2.0,Outer membrane,Unspecific,Porin,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",,10.1074/jbc.M602112200,,0.010954,0.003789,1.000000,1.000000,3.287487e+09
3,4,JW1469,b1473,yddG,aromatic amino acid exporter,0.78,3.0,Inner membrane,broad range of amino acids,DMT,...,https://www.tcdb.org/search/result.php?tc=2.A....,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",Overexpression of YddG increases accumulation ...,https://academic.oup.com/femsle/article/275/2/...,,0.000000,0.000000,0.000000,0.000000,3.161188e+06
4,5,JW0231,b0241,phoE,outer membrane phosphoporin protein E,0.86,1.0,Outer membrane,Phosphor related compounds,Porin,...,https://tcdb.org/search/result.php?tc=1.b.1.1.2,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",Less than 600 Dalton (same as OmpC and OmpF),,,0.000038,0.000070,0.602740,0.702703,1.807160e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,92,JW0665,b0679,nagE,fused N-acetyl glucosamine specific PTS enzyme...,0.80,1.0,Inner membrane,N-acetyl glucosamine,PTS,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",,,,0.000093,0.000047,1.000000,1.000000,1.702147e+07
92,93,JW2900,b2933,cmtA,predicted fused mannitol-specific PTS enzymes:...,0.86,0.0,Inner membrane,mannitol,PTS,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",,,,0.000000,0.000000,0.000000,0.000000,0.000000e+00
93,94,JW0069,b0070,setA,broad specificity sugar efflux system,0.82,2.0,Inner membrane,"Lactose, glucose",MFS,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",,,,0.000000,0.000000,0.000000,0.000000,7.455000e+05
94,95,JW3898,b3927,glpF,glycerol facilitator,0.94,4.0,Inner membrane,"Glycerol, glyceraldehyde, glycine, urea",major intrinsic protein (MIP) family,...,,"=HYPERLINK(""https://biocyc.org/gene?orgid=ECOL...",glpF mutants have impaired growth on low conce...,,,0.000007,0.000038,0.164384,0.081081,2.782800e+05


In [143]:
df8[selected_cols_df8]

Unnamed: 0,Lib-24,Lib-25,Lib-26,Lib-27,Lib-28,Lib-29,Lib-30,Lib-06
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000075,0.000076,0.000083,0.000066,0.000077,0.000068,0.000057,0.000061
...,...,...,...,...,...,...,...,...
4337,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4338,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4339,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4340,0.000000,0.000015,0.000004,0.000018,0.000016,0.000017,0.000000,0.000015


In [149]:
'Ribosome profiling (Li et al., 2014)'

'Ribosome profiling (Li et al., 2014)'

In [123]:
df_sel.head(4)

Unnamed: 0,Sample Number,JW ID,Blattner ID,Gene Name,Annotation,Growth rate in glucose + AA medium,Selection score (0-3),Location,Transport,Class,Mechanism/specific type,Expected direction,TCID,biocyc link,Comment,Constitutively expressed,Paper
0,1,JW0401,b0411,tsx,"nucleoside channel, receptor of phage T6 and c...",,,Outer membrane,Nucleoside,Channel,Passive diffusion?,Both?,,,,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
1,2,JW2203,b2215,ompC,outer membrane porin protein C,0.89,,Outer membrane,Unspecific,Porin,Passive diffusion,Both,,,OpmC is reciprocally regulated with ompF. Both...,,10.1074/jbc.M602112200
2,3,JW0912,b0929,ompF,outer membrane porin 1a (Ia;b;F),0.77,,Outer membrane,Unspecific,Porin,Passive diffusion,Both,,,,,10.1074/jbc.M602112200
3,4,JW1469,b1473,yddG,aromatic amino acid exporter,0.78,,Inner membrane,broad range of amino acids,DMT,Uniport,Export,https://www.tcdb.org/search/result.php?tc=2.A....,,Overexpression of YddG increases accumulation ...,,https://academic.oup.com/femsle/article/275/2/...


1.0

In [87]:
protein_weigth = df6.loc[idx, 'Molecular weight (kDa)']
np.mean(df6.loc[idx, xtop1_cols].values)/protein_weigth

1.817482798713675e-05

In [37]:
gene_name = 'b0411'
df6.where(df6['Gene locus']==gene_name)

ValueError: cannot join with no overlapping index names

In [148]:
df6

Unnamed: 0,Gene name,Gene locus,Protein ID,Molecular weight (kDa),"Ribosome profiling (Li et al., 2014)",xTop_A1-1,xTop_A1-2,xTop_A1-3,xTop_C1,xTop_F1-1,xTop_F1-2,xTop_F1-3
0,aaeA,b3241,P46482,40.265,201325.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,aaeB,b3240,P46481,85.277,170554.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,aaeR,b3243,P67662,40.020,3161580.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,aaeX,b3242,P46478,9.025,45125.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,aas,b2836,P31119,93.530,7482400.0,0.000016,0.000017,0.000017,0.000017,0.000017,0.000016,0.000017
...,...,...,...,...,...,...,...,...,...,...,...,...
4337,zraR,b4004,P14375,56.259,1687770.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4338,zraS,b4003,P14377,59.325,237300.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4339,zupT,b3040,P0A8H3,31.061,6740237.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4340,zur,b4046,P0AC51,22.292,3722764.0,0.000061,0.000000,0.000059,0.000061,0.000060,0.000071,0.000054
