## This notebook prepares input for IPath3 visualisation

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

- Inputs

In [2]:
# Root directory: two levels above the current working directory
p_root_dir = Path.cwd().parents[1]
# Folder with the IPath3 files for figure 2 (the KEGG lookup is read from here)
p_analysis = p_root_dir.joinpath(r"6_figures/figure_2/IPath3")

# Compounds used in the study
p_compounds = p_root_dir.joinpath(r"5_data/metadata/compounds_ids.csv")

# HMDB ID to KEGG ID mapping
p_kegg = p_analysis.joinpath("hmdb_kegg_2020-09-09.csv")

# Chemical classification
p_chem_class = p_root_dir.joinpath(r"5_data/custom_classification/custom_classification.csv")

- Load data

In [3]:
# Read the compound metadata table
compounds = pd.read_csv(p_compounds)
# Remove entries for thermometer ions and fiducials (hmdb_primary == 'custom')
# and keep only the identifier columns used further down
is_study_compound = compounds.hmdb_primary != 'custom'
compounds = compounds.loc[is_study_compound, ['name_short', 'hmdb_primary', 'internal_id']]

# Read the HMDB ID -> KEGG ID lookup table
kegg = pd.read_csv(p_kegg)

- Map KEGG IDs to the compounds
- Note which compounds have no mapping

In [4]:
# Left join: every compound is kept; rows with no KEGG match have a missing kegg_id
df = compounds.merge(kegg, on='hmdb_primary', how='left')

In [5]:
# Names of compounds whose kegg_id is still missing after the left merge.
# isna() states the intent directly (the old `x != x` trick only caught NaN, not None).
unmapped_compounds = df.loc[df.kegg_id.isna(), 'name_short']
# Display a {name: nan} template to copy and fill in by hand. dict.fromkeys avoids
# constructing an empty, dtype-less pd.Series, which makes pandas emit a warning.
dict.fromkeys(unmapped_compounds, np.nan)

  pd.Series(pd.Series(),index=unmapped_compounds).to_dict()


{'N-Acetylgalactosamine 6-phosphate': nan,
 'Cholesteryl acetate': nan,
 'TG 15:0-18:1-15:0': nan,
 'MG 18:1': nan,
 'Cardiolipin 18:1': nan,
 'DG 18:0-22:6': nan,
 'PS (POPS) 16:0-18:1': nan,
 'PG 16:0-18:1': nan,
 'Lyso PI 17:1': nan,
 'PI 16:0-18:1': nan,
 'PE 18:0-20:4': nan,
 'Lyso PE 18:0': nan,
 'Lyso PA 18:1': nan,
 'Lyso PG 16:0': nan,
 'Lyso PS 17:1': nan,
 'SM d18:1-16:0': nan,
 'PC (O) C16-18:1': nan,
 'Cholesteryl ester 17:0': nan}

- Complete the mapping manually

In [6]:
# Manually curated KEGG IDs, keyed by name_short, for the compounds that the
# HMDB -> KEGG lookup left unmapped. Note that both cholesteryl entries share
# the same KEGG ID (C02530).
mapping = {
    'N-Acetylgalactosamine 6-phosphate': 'C06376',
    'Cholesteryl acetate': 'C02530',
    'TG 15:0-18:1-15:0': 'C00422',
    'MG 18:1': 'C01885',
    'Cardiolipin 18:1': 'C05980',
    'DG 18:0-22:6': 'C00165',
    'PS (POPS) 16:0-18:1': 'C02737',
    'PG 16:0-18:1': 'C00344',
    'Lyso PI 17:1': 'C03819',
    'PI 16:0-18:1': 'C01194',
    'PE 18:0-20:4': 'C00350',
    'Lyso PE 18:0': 'C04438',
    'Lyso PA 18:1': 'C00681',
    'Lyso PG 16:0': 'C18126',
    'Lyso PS 17:1': 'C18125',
    'SM d18:1-16:0': 'C00550',
    'PC (O) C16-18:1': 'C05212',
    'Cholesteryl ester 17:0': 'C02530',
}

In [7]:
# Write the manually curated KEGG ID into every row of each listed compound
for compound_name, manual_kegg_id in mapping.items():
    is_compound = df.name_short == compound_name
    df.loc[is_compound, 'kegg_id'] = manual_kegg_id

- Add chemical class information and assign colour to each class

In [8]:
# Only the join key and the coarse chemical class are taken from the classification table
class_columns = ['internal_id', 'coarse_class']
chem_class = pd.read_csv(p_chem_class)[class_columns]
# Left join keeps every row of df; rows without a classification get a missing coarse_class
df = pd.merge(df, chem_class, how='left', on='internal_id')

In [9]:
# Colour per coarse chemical class (Paul Tol palette for colour blindness)
palette = {
    "Amines": '#DDCC77',  # yellow
    "Amino acids, peptides, and analogues": '#332288',  # blue
    "Carbohydrates": '#117733',  # green
    "Carboxylic acids": '#44AA99',  # emerald
    "Lipids and lipid-like molecules": '#AA4499',  # magenta
    "Nucleosides, nucleotides, and analogues": '#CC6677',  # light red
    "Vitamins and cofactors": '#88CCEE',  # lightblue
}

- Print one line per compound (KEGG ID, class colour, line width), then copy the printed lines and paste them into the IPath3 website: [https://pathways.embl.de/ipath3.cgi?map=metabolic](https://pathways.embl.de/ipath3.cgi?map=metabolic)

In [10]:
# Build one selection line per row of df: "<KEGG ID> <hex colour> W16".
# The colour comes from the row's coarse_class; palette[...] raises KeyError for
# a class that is not in the palette (including a missing class).
# To draw every compound in black instead, use '#000000' in place of the palette lookup.
ids = [f"{row.kegg_id} {palette[row.coarse_class]} W16" for row in df.itertuples()]

# Print the lines so they can be copied; `ids` now holds the actual strings
# rather than the None values returned by print().
for line in ids:
    print(line)

C00221 #117733 W16
C00092 #117733 W16
C05378 #117733 W16
C00111 #117733 W16
C00597 #44AA99 W16
C00597 #44AA99 W16
C00074 #44AA99 W16
C00022 #44AA99 W16
C00024 #88CCEE W16
C00024 #CC6677 W16
C00186 #44AA99 W16
C00345 #117733 W16
C00345 #44AA99 W16
C00199 #117733 W16
C03736 #117733 W16
C00184 #117733 W16
C00257 #117733 W16
C00257 #44AA99 W16
C00508 #117733 W16
C00158 #44AA99 W16
C00158 #44AA99 W16
C00417 #44AA99 W16
C00026 #44AA99 W16
C00042 #44AA99 W16
C00122 #44AA99 W16
C00149 #44AA99 W16
C00036 #44AA99 W16
C00029 #CC6677 W16
C00167 #CC6677 W16
C00352 #117733 W16
C00352 #117733 W16
C06376 #117733 W16
C06376 #117733 W16
C00043 #CC6677 W16
C00140 #117733 W16
C00137 #117733 W16
C01041 #44AA99 W16
C01177 #117733 W16
C00249 #AA4499 W16
C00418 #44AA99 W16
C00187 #AA4499 W16
C04025 #44AA99 W16
C03761 #44AA99 W16
C00154 #88CCEE W16
C00154 #AA4499 W16
C00154 #CC6677 W16
C00318 #332288 W16
C00318 #DDCC77 W16
C00695 #AA4499 W16
C00037 #332288 W16
C05122 #AA4499 W16
C00951 #AA4499 W16
C00152 #3322

- Save results

In [11]:
# Write the final compound / KEGG ID / chemical class table next to the other IPath3 files
df.to_csv(p_analysis.joinpath('compound_kegg_mapping_2.csv'))