In [3]:
import pandas as pd

In [4]:
# Parsing likely specific to version of metadata; new metadata may require different parsing logic.
def _parse_afir(strain_description):
    afir = ''
    if '.' in strain_description:
        if ('BmePgi' in strain_description) or ('PaePgi' in strain_description):
            afir = strain_description[strain_description.rfind(' ')+1:]
        else:
            afir =  strain_description[strain_description.find('.')+1:]
    
    return afir


def _parse_ale_exp(strain_description):
    ale_exp = ''
    if '.' in strain_description:
        if ('BmePgi' in strain_description) or ('PaePgi' in strain_description):
            ale_exp = strain_description[:strain_description.rfind(' ')]
        else:
            ale_exp =  strain_description[:strain_description.find('.')]
    
    ale_exp = ale_exp.replace('Escherichia coli ', '')
    return ale_exp


# CCK ALE experiment starting strains that only have GLU ALE4 variants.
# Each entry is a full 'Strain Description' value; the shared species prefix
# is factored out here and re-attached when the list is built.
samples_to_remove = [
    'Escherichia coli ' + strain
    for strain in (
        'ERS.ALE4',
        'GLU.4.403.1 del_gnd',
        'GLU.4.403.1 del_pgi',
        'GLU.4.403.1 del_ptsH del_ptsI del_crr',
        'GLU.4.403.1 del_sdhC del_sdhB',
        'GLU.4.403.1 del_tpiA',
    )
]


# Experiment names as parsed from the iModulonDB strain descriptions (keys)
# mapped to the names they are replaced with (values).
imoddb_to_aledb_exp_name_d = dict(
    OxyR='Iron-ALE',
    BmePgi='pgiBME',
    PaePgi='pgiPAE',
    ERS='ROS_eAMP',
)


# aledb_sample_table.csv was manually curated to only include ALEdb samples.
imoddb_ale_md_df = pd.read_csv('./data/imodulondb/aledb_sample_table.csv', index_col=0)

# Drop the excluded starting strains. The explicit .copy() makes the filtered
# frame independent, so the column assignment below does not write to a slice
# of the original frame (which raises SettingWithCopyWarning).
imoddb_ale_md_df = imoddb_ale_md_df[
    ~imoddb_ale_md_df['Strain Description'].isin(samples_to_remove)
].copy()
imoddb_ale_md_df['afir'] = imoddb_ale_md_df['Strain Description'].apply(_parse_afir)

# Samples whose description could not be parsed (empty 'afir') are ignored.
imoddb_ale_md_df = imoddb_ale_md_df[imoddb_ale_md_df['afir'] != ''].copy()

# Parse the experiment name, then translate it where a mapping exists;
# names without a mapping are kept unchanged.
imoddb_ale_md_df['ALE exp'] = (
    imoddb_ale_md_df['Strain Description']
    .apply(_parse_ale_exp)
    .map(lambda exp_name: imoddb_to_aledb_exp_name_d.get(exp_name, exp_name))
)

# Every remaining sample must have resolved to a non-empty experiment name.
assert (imoddb_ale_md_df['ALE exp'] != '').all(), "found samples with an empty 'ALE exp'"
set(imoddb_ale_md_df['ALE exp'])

{'42C',
 'C13',
 'CCK',
 'CCK_GND',
 'CCK_PGI',
 'CCK_PTS',
 'CCK_SDH',
 'CCK_TPIA',
 'EEP',
 'EEP_menF',
 'EEP_menF_entC',
 'EEP_menF_entC_ubiC',
 'EEP_menFubiC',
 'EEP_pdhR-13',
 'EEP_pdhR-14',
 'EEP_pdhR-15',
 'EEP_pdhR-16',
 'EEP_ubiC',
 'FPS ptsI',
 'FPS serB',
 'FPS thrA',
 'GLU',
 'HOT_mutL',
 'Iron-ALE',
 'PAL',
 'ROS_eAMP',
 'SSW_GLU_AC',
 'SSW_GLU_GLY',
 'SSW_GLU_XYL',
 'SSW_GLY',
 'SSW_XYL',
 'pgiBME',
 'pgiPAE'}

In [5]:
# Split 'afir' on '.' into its components using the vectorized string accessor
# (one pass over the column instead of building a pd.Series per row).
afir_parts = imoddb_ale_md_df['afir'].str.split('.', expand=True)

# The three target columns require exactly three '.'-separated parts; fail
# with a readable message instead of a column-length mismatch error.
assert afir_parts.shape[1] == 3, "expected 'afir' values to split into exactly 3 parts"
imoddb_ale_md_df[['ale', 'flask', 'isolate']] = afir_parts

# Replace missing values across the whole frame with ''. Reassigning (rather
# than inplace=True) keeps the cell free of in-place mutation.
imoddb_ale_md_df = imoddb_ale_md_df.fillna('')
imoddb_ale_md_df.head()

Unnamed: 0,sample,study,project,condition,rep_id,Strain Description,Strain,Culture Type,Evolved Sample,Base Media,...,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,afir,ALE exp,ale,flask,isolate
p1k_00125,glu__glu_ale3__1,Glucose Evolution,glu,glu_ale3,1,Escherichia coli GLU.3.412.1,MG1655,Batch,Endpoint,M9,...,,,,,,3.412.1,GLU,3,412,1
p1k_00126,glu__glu_ale4__1,Glucose Evolution,glu,glu_ale4,1,Escherichia coli GLU.4.403.1,MG1655,Batch,Endpoint,M9,...,,,,,,4.403.1,GLU,4,403,1
p1k_00127,glu__glu_ale5__1,Glucose Evolution,glu,glu_ale5,1,Escherichia coli GLU.5.288.1,MG1655,Batch,Endpoint,M9,...,,,,,,5.288.1,GLU,5,288,1
p1k_00128,glu__glu_ale6__1,Glucose Evolution,glu,glu_ale6,1,Escherichia coli GLU.6.406.1,MG1655,Batch,Endpoint,M9,...,,,,,,6.406.1,GLU,6,406,1
p1k_00129,glu__glu_ale8__1,Glucose Evolution,glu,glu_ale8,1,Escherichia coli GLU.8.380.1,MG1655,Batch,Endpoint,M9,...,,,,,,8.380.1,GLU,8,380,1


In [6]:
# Persist the parsed metadata frame (with the added 'afir', 'ALE exp', 'ale',
# 'flask' and 'isolate' columns) as a pickle next to the source CSV.
imoddb_ale_md_df.to_pickle('./data/imodulondb/imoddb_ale_md_df.pkl')