## This notebook prepares input for IPath3 visualisation

In [1]:
from pathlib import Path
from definitions import ROOT_DIR
import pandas as pd
import numpy as np

- Inputs

In [2]:
# Project root: the parent of the directory ROOT_DIR points to
p_root_dir = Path(ROOT_DIR).parent
# Join the components with "/" instead of embedding a backslash in the string:
# "\q" is an invalid escape sequence and a backslash is only a path separator
# on Windows, so the old form named a single odd directory elsewhere
p_analysis = p_root_dir / "6_plots" / "q1_plots"

# Compounds used in the study
p_compounds = p_root_dir / "5_data_analysis/compounds_ids.csv"

# HMDB ID to KEGG ID mapping
p_kegg = p_analysis / "hmdb_kegg_2020-09-09.csv"

- Load data

In [3]:
# Read the compound table indexed by 'internal_id', then in one step
# remove entries for thermometer ions and fiducials (hmdb_primary == 'custom')
# and keep only the two columns needed for the KEGG lookup
compounds = (
    pd.read_csv(p_compounds, index_col='internal_id')
    .loc[lambda table: table['hmdb_primary'] != 'custom', ['name_short', 'hmdb_primary']]
)

# HMDB -> KEGG lookup table
kegg = pd.read_csv(p_kegg)

- Map KEGG IDs to the compounds
- Note which compounds have no mapping

In [4]:
# Left join on the HMDB ID: every compound is kept, and compounds without a
# match in the KEGG table get missing values in the KEGG columns
df = compounds.merge(kegg, on='hmdb_primary', how='left')

In [5]:
# Short names of the compounds whose 'kegg_id' is still missing after the merge.
# .isna() states the intent directly (the previous `x != x` trick only worked
# because NaN compares unequal to itself).
unmapped_compounds = df.loc[df['kegg_id'].isna(), 'name_short']
# Display a {name: nan} template to be filled in by hand. Built with
# dict.fromkeys so no empty, dtype-less pd.Series() is created, which pandas
# warns about.
dict.fromkeys(unmapped_compounds, np.nan)

  pd.Series(pd.Series(),index=unmapped_compounds).to_dict()


{'N-Acetylgalactosamine 6-phosphate': nan,
 'Cholesteryl acetate': nan,
 'TG 15:0-18:1-15:0': nan,
 'MG 18:1': nan,
 'Cardiolipin 18:1': nan,
 'DG 18:0-22:6': nan,
 'PS (POPS) 16:0-18:1': nan,
 'PG 16:0-18:1': nan,
 'Lyso PI 17:1': nan,
 'PI 16:0-18:1': nan,
 'PE 18:0-20:4': nan,
 'Lyso PE 18:0': nan,
 'Lyso PA 18:1': nan,
 'Lyso PG 16:0': nan,
 'Lyso PS 17:1': nan,
 'SM d18:1-16:0': nan,
 'PC (O) C16-18:1': nan,
 'Cholesteryl ester 17:0': nan}

In [6]:
# Hand-curated KEGG compound IDs, keyed by 'name_short', for the compounds
# that the HMDB -> KEGG table left unmapped
mapping = {
    'N-Acetylgalactosamine 6-phosphate': 'C06376',
    'Cholesteryl acetate': 'C02530',
    'TG 15:0-18:1-15:0': 'C00422',
    'MG 18:1': 'C01885',
    'Cardiolipin 18:1': 'C05980',
    'DG 18:0-22:6': 'C00165',
    'PS (POPS) 16:0-18:1': 'C02737',
    'PG 16:0-18:1': 'C00344',
    'Lyso PI 17:1': 'C03819',
    'PI 16:0-18:1': 'C01194',
    'PE 18:0-20:4': 'C00350',
    'Lyso PE 18:0': 'C04438',
    'Lyso PA 18:1': 'C00681',
    'Lyso PG 16:0': 'C18126',
    'Lyso PS 17:1': 'C18125',
    'SM d18:1-16:0': 'C00550',
    'PC (O) C16-18:1': 'C05212',
    'Cholesteryl ester 17:0': 'C02530',
}

- Complete the mapping manually

In [7]:
# Write each hand-curated KEGG ID into 'kegg_id' on every row whose
# 'name_short' matches the mapping key
for compound_name, curated_id in mapping.items():
    is_compound = df['name_short'] == compound_name
    df.loc[is_compound, 'kegg_id'] = curated_id

- Print one line per compound (KEGG ID followed by `#000000 W16`) and paste the printed lines into the IPath3 website [https://pathways.embl.de/ipath3.cgi?map=metabolic](https://pathways.embl.de/ipath3.cgi?map=metabolic)

In [13]:
# One line per row of df in the form "<KEGG ID> #000000 W16", ready to paste
# into IPath3. `ids` now holds the lines themselves; building it from print()
# calls left it as a list of None.
ids = [f"{kegg_id} #000000 W16" for kegg_id in df['kegg_id']]
# Print line by line so the output is identical, including for an empty frame
for selection_line in ids:
    print(selection_line)

C00221 #000000 W16
C00092 #000000 W16
C05378 #000000 W16
C00111 #000000 W16
C00597 #000000 W16
C00074 #000000 W16
C00022 #000000 W16
C00024 #000000 W16
C00186 #000000 W16
C00345 #000000 W16
C00199 #000000 W16
C03736 #000000 W16
C00184 #000000 W16
C00257 #000000 W16
C00508 #000000 W16
C00158 #000000 W16
C00417 #000000 W16
C00026 #000000 W16
C00042 #000000 W16
C00122 #000000 W16
C00149 #000000 W16
C00036 #000000 W16
C00029 #000000 W16
C00167 #000000 W16
C00352 #000000 W16
C06376 #000000 W16
C00043 #000000 W16
C00140 #000000 W16
C00137 #000000 W16
C01041 #000000 W16
C01177 #000000 W16
C00249 #000000 W16
C00418 #000000 W16
C00187 #000000 W16
C04025 #000000 W16
C03761 #000000 W16
C00154 #000000 W16
C00318 #000000 W16
C00695 #000000 W16
C00037 #000000 W16
C05122 #000000 W16
C00951 #000000 W16
C00152 #000000 W16
C00049 #000000 W16
C01042 #000000 W16
C00064 #000000 W16
C00025 #000000 W16
C12270 #000000 W16
C00334 #000000 W16
C00079 #000000 W16
C00082 #000000 W16
C00355 #000000 W16
C03758 #0000

- Save results

In [9]:
# Save the completed compound -> KEGG table into the p_analysis directory.
# to_csv is called with its defaults, so the frame's index is written as well.
mapping_csv = p_analysis / 'compound_kegg_mapping.csv'
df.to_csv(mapping_csv)