## This notebook prepares input for iPath3 visualisation

In [1]:
from pathlib import Path
from definitions import ROOT_DIR
import pandas as pd
import numpy as np

- Inputs

In [2]:
# Project root: the parent of the directory that `definitions.ROOT_DIR` points to
p_root_dir = Path(ROOT_DIR).parent
# Join the components with `/` instead of embedding a backslash in the string:
# "\q" is an invalid escape sequence, and on POSIX systems a backslash is a
# literal filename character rather than a path separator.
p_analysis = p_root_dir / "6_plots" / "q1_plots"

# Compounds used in the study
p_compounds = p_root_dir / "5_data_analysis/compounds_ids.csv"

# HMDB ID to KEGG ID mapping
p_kegg = p_analysis / "hmdb_kegg_2020-09-09.csv"

# Chemical classification
p_chem_class = p_root_dir / "5_data_analysis/custom_classification.csv"

- Load data

In [3]:
compounds = pd.read_csv(p_compounds)
# Rows whose primary HMDB ID is 'custom' are thermometer ions and fiducials;
# drop them and keep only the three columns needed downstream.
compounds = compounds.loc[
    lambda table: table['hmdb_primary'].ne('custom'),
    ['name_short', 'hmdb_primary', 'internal_id'],
]

# HMDB -> KEGG lookup table
kegg = pd.read_csv(p_kegg)

- Map KEGG IDs to the compounds
- Note which compounds have no mapping

In [4]:
# Left join: every compound is kept, and compounds without a match in the
# KEGG table end up with a missing `kegg_id`.
df = compounds.merge(kegg, on='hmdb_primary', how='left')

In [5]:
# Short names of the compounds that received no KEGG ID from the merge.
unmapped_compounds = df.loc[df['kegg_id'].isna(), 'name_short']
# Display a {name: nan} template that can be copied and filled in by hand.
# Built with dict.fromkeys rather than an empty, dtype-less pd.Series, which
# raises a pandas deprecation warning.
dict.fromkeys(unmapped_compounds, np.nan)

  pd.Series(pd.Series(),index=unmapped_compounds).to_dict()


{'N-Acetylgalactosamine 6-phosphate': nan,
 'Cholesteryl acetate': nan,
 'TG 15:0-18:1-15:0': nan,
 'MG 18:1': nan,
 'Cardiolipin 18:1': nan,
 'DG 18:0-22:6': nan,
 'PS (POPS) 16:0-18:1': nan,
 'PG 16:0-18:1': nan,
 'Lyso PI 17:1': nan,
 'PI 16:0-18:1': nan,
 'PE 18:0-20:4': nan,
 'Lyso PE 18:0': nan,
 'Lyso PA 18:1': nan,
 'Lyso PG 16:0': nan,
 'Lyso PS 17:1': nan,
 'SM d18:1-16:0': nan,
 'PC (O) C16-18:1': nan,
 'Cholesteryl ester 17:0': nan}

- Complete the mapping manually

In [6]:
# Manually curated KEGG compound IDs, keyed by `name_short`, for the compounds
# that had no KEGG ID after the automatic HMDB -> KEGG merge.
# Note: both cholesteryl entries are assigned the same ID (C02530).
mapping = {
    'N-Acetylgalactosamine 6-phosphate': 'C06376',
    'Cholesteryl acetate': 'C02530',
    'TG 15:0-18:1-15:0': 'C00422',
    'MG 18:1': 'C01885',
    'Cardiolipin 18:1': 'C05980',
    'DG 18:0-22:6': 'C00165',
    'PS (POPS) 16:0-18:1': 'C02737',
    'PG 16:0-18:1': 'C00344',
    'Lyso PI 17:1': 'C03819',
    'PI 16:0-18:1': 'C01194',
    'PE 18:0-20:4': 'C00350',
    'Lyso PE 18:0': 'C04438',
    'Lyso PA 18:1': 'C00681',
    'Lyso PG 16:0': 'C18126',
    'Lyso PS 17:1': 'C18125',
    'SM d18:1-16:0': 'C00550',
    'PC (O) C16-18:1': 'C05212',
    'Cholesteryl ester 17:0': 'C02530',
}

In [7]:
# Write each manually chosen KEGG ID into every row with the matching short name.
for compound_name, manual_kegg_id in mapping.items():
    df.loc[df['name_short'] == compound_name, 'kegg_id'] = manual_kegg_id

- Add chemical class information and assign colour to each class

In [8]:
# Only the join key and the coarse class label are needed from the classification table.
chem_class = pd.read_csv(p_chem_class).loc[:, ['internal_id', 'coarse_class']]
# Left join so that compounds without a classification entry are still kept.
df = pd.merge(df, chem_class, how='left', on='internal_id')

In [9]:
# Hex colour assigned to each value of `coarse_class`.
palette = {
    'Carbohydrates': '#7570b3',
    'Carboxylic acids': '#e7298a',
    'Vitamins and cofactors': '#a6761d',
    'Nucleosides, nucleotides, and analogues': '#e6ab02',
    'Lipids and lipid-like molecules': '#66a61e',
    'Amino acids, peptides, and analogues': '#d95f02',
    'Amines': '#1b9e77',
}

- Print one `<KEGG ID> <class colour> W16` line per compound and paste the printed lines into the iPath3 website [https://pathways.embl.de/ipath3.cgi?map=metabolic](https://pathways.embl.de/ipath3.cgi?map=metabolic)

In [12]:
# One line per row of `df`: "<KEGG ID> <class colour> W16".
# `W16` is a fixed suffix (presumably the element width in iPath3's selection
# syntax; confirm against the iPath3 help).
# The strings are built first and printed afterwards, so `ids` holds the actual
# lines instead of a list of `None` values returned by `print`.
# To colour every compound black instead, replace the palette lookup with '#000000'.
ids = [f"{row.kegg_id} {palette[row.coarse_class]} W16" for row in df.itertuples()]
for line in ids:
    print(line)

C00221 #7570b3 W16
C00092 #7570b3 W16
C05378 #7570b3 W16
C00111 #7570b3 W16
C00597 #e7298a W16
C00597 #e7298a W16
C00074 #e7298a W16
C00022 #e7298a W16
C00024 #a6761d W16
C00024 #e6ab02 W16
C00186 #e7298a W16
C00345 #7570b3 W16
C00345 #e7298a W16
C00199 #7570b3 W16
C03736 #7570b3 W16
C00184 #7570b3 W16
C00257 #7570b3 W16
C00257 #e7298a W16
C00508 #7570b3 W16
C00158 #e7298a W16
C00158 #e7298a W16
C00417 #e7298a W16
C00026 #e7298a W16
C00042 #e7298a W16
C00122 #e7298a W16
C00149 #e7298a W16
C00036 #e7298a W16
C00029 #e6ab02 W16
C00167 #e6ab02 W16
C00352 #7570b3 W16
C00352 #7570b3 W16
C06376 #7570b3 W16
C06376 #7570b3 W16
C00043 #e6ab02 W16
C00140 #7570b3 W16
C00137 #7570b3 W16
C01041 #e7298a W16
C01177 #7570b3 W16
C00249 #66a61e W16
C00418 #e7298a W16
C00187 #66a61e W16
C04025 #e7298a W16
C03761 #e7298a W16
C00154 #a6761d W16
C00154 #66a61e W16
C00154 #e6ab02 W16
C00318 #d95f02 W16
C00318 #1b9e77 W16
C00695 #66a61e W16
C00037 #d95f02 W16
C05122 #66a61e W16
C00951 #66a61e W16
C00152 #d95f

- Save results

In [58]:
# Write the final compound table (default to_csv settings, so the index is
# included) next to the other analysis files.
p_mapping_out = p_analysis / 'compound_kegg_mapping_2.csv'
df.to_csv(p_mapping_out)