In [None]:
import pandas as pd

In [7]:
import cobra.io
import gzip

with gzip.open('iJN1463.json.gz') as file:
    model = cobra.io.load_json_model(file)

In [8]:
model.genes[2].annotation

{'ncbigene': ['1041963'],
 'ncbigi': ['26987960'],
 'refseq_locus_tag': ['PP_1225'],
 'refseq_name': ['queE'],
 'sbo': 'SBO:0000243'}

In [9]:
model.metabolites.glyc_c.annotation

{'bigg.metabolite': ['glyc'],
 'biocyc': ['META:GLYCEROL'],
 'chebi': ['CHEBI:5448',
  'CHEBI:14334',
  'CHEBI:42998',
  'CHEBI:17754',
  'CHEBI:24351',
  'CHEBI:131422'],
 'envipath': ['32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/compound/48edd8a9-6a4a-4900-b0cf-97b1ccaa1125',
  '650babc9-9d68-4b73-9332-11972ca26f7b/compound/2e8c8093-3269-46b3-be2e-f1df9fdde25c'],
 'hmdb': ['HMDB00131'],
 'inchi_key': ['PEDCQBHIVMGVHV-UHFFFAOYSA-N'],
 'kegg.compound': ['C00116'],
 'kegg.drug': ['D00028'],
 'metanetx.chemical': ['MNXM89612'],
 'reactome.compound': ['192438', '76116'],
 'sabiork': ['1303'],
 'sbo': 'SBO:0000247',
 'seed.compound': ['cpd00100']}

In [10]:
import requests
import json
from tqdm import tqdm

In [11]:
import warnings

In [14]:
import time

In [15]:
def parse_rest_response(url):

    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.content.decode('utf-8'))
    else:
        return None
    
def parse_cid(cid, to='CHEBI'):
    response = parse_rest_response(f'http://cts.fiehnlab.ucdavis.edu/service/convert/PubChem%20CID/{to}/{cid}')
    if response:
        return response[0]['result']
    else:
        warnings.warn(f'Error with cid {cid}')
        return []


to_try = ['KEGG', 'ChEBI', 'BioCyc', 'InChIKey']

def iter_matches(unique_cids):
    for cid in tqdm(unique_cids):
        for db in to_try:
            for result in parse_cid(cid, to=db):
                time.sleep(1)
                yield pd.Series({
                    'cid': cid, 
                    'db': db, 
                    'identifier': result})


In [17]:
unique_cids = pd.Series(df_metabolite['Formal Type'].unique()).str.strip('cid:')
results_database = pd.DataFrame(iter_matches(unique_cids.values))

100%|██████████| 125/125 [09:54<00:00,  4.76s/it]


In [52]:
unique_cids.shape

(125,)

In [23]:
results_database.to_csv('cid_matches.csv', index=False)

In [22]:
results_database

Unnamed: 0,cid,db,identifier
0,5356793,ChEBI,CHEBI:27036
1,5356793,ChEBI,CHEBI:38407
2,5356793,InChIKey,TXXHDPDFNKHHGW-ZPUQHVIOSA-N
3,6912,InChIKey,HEBKCHPVOIAQTA-NGQZWQHPSA-N
4,86860,InChIKey,RTLDJXGEOSVJEX-UHFFFAOYSA-N
...,...,...,...
422,19782904,KEGG,D00037
423,19782904,ChEBI,CHEBI:30769
424,19782904,InChIKey,KRKNYBCHXYNGOX-UHFFFAOYSA-N
425,21363,ChEBI,CHEBI:37277


In [53]:
def iter_model_annotations():
    for met in tqdm(model.metabolites.query('c', 'compartment')):
        for database, values in met.annotation.items():
            for value in values:
                yield pd.Series({
                    'metabolite': met.id, 
                    'db': database, 
                    'identifier': value})
        

In [54]:
model_annotations = pd.DataFrame(iter_model_annotations())

100%|██████████| 1339/1339 [00:05<00:00, 243.27it/s]


In [56]:
model_annotations.db.unique()

array(['bigg.metabolite', 'biocyc', 'chebi', 'hmdb', 'inchi_key',
       'kegg.compound', 'metanetx.chemical', 'reactome.compound',
       'sabiork', 'sbo', 'seed.compound', 'envipath', 'lipidmaps', 'slm',
       'kegg.drug', 'kegg.glycan'], dtype=object)

In [57]:
results_database.db.unique()

array(['ChEBI', 'InChIKey', 'KEGG', 'BioCyc'], dtype=object)

In [58]:
model_annotations['db'] = model_annotations.db.replace(
    {'biocyc': 'BioCyc',
     'chebi': 'ChEBI',
     'kegg.compound': 'KEGG',
     'inchi_key': 'InChIKey'
    })

In [59]:
model_annotations

Unnamed: 0,metabolite,db,identifier
0,10fthf_c,bigg.metabolite,10fthf
1,10fthf_c,BioCyc,META:10-FORMYL-THF
2,10fthf_c,ChEBI,CHEBI:19108
3,10fthf_c,ChEBI,CHEBI:15637
4,10fthf_c,ChEBI,CHEBI:698
...,...,...,...
28589,malttr_c,sbo,0
28590,malttr_c,sbo,2
28591,malttr_c,sbo,4
28592,malttr_c,sbo,7


In [60]:
merged_identifiers = results_database.merge(model_annotations, how='inner', on=['db', 'identifier'])

In [62]:
len(merged_identifiers.metabolite.unique())

71

In [79]:
unique_merges = merged_identifiers = merged_identifiers.drop_duplicates(subset=['cid', 'metabolite'])
unique_merges

Unnamed: 0,cid,db,identifier,metabolite
0,3035456,KEGG,C06473,2dhglcn_c
2,72,KEGG,C00230,34dhbz_c
3,22639876,KEGG,C01353,hco3_c
5,5280518,KEGG,C02480,ccmuac_c
7,10690,KEGG,C00257,glcn_c
...,...,...,...,...
163,91493,KEGG,C00345,6pgc_c
164,190,KEGG,C00147,ade_c
167,6083,KEGG,C00020,amp_c
169,289,KEGG,C00090,catechol_c


In [126]:
# manual matches
unique_merges_manual = unique_merges.append(pd.DataFrame((
    {'cid': '724', 'metabolite': '3pg'},
    {'cid': '6902', 'metabolite': 'arab__L_c'},
    {'cid': '439163', 'metabolite': 'fru_p'},
    {'cid': '6036', 'metabolite': 'udpgal_c'},
    {'cid': '7027', 'metabolite': '6pgl_c'})),
    ignore_index=True)

In [130]:
unique_merges_manual.to_csv('cid_to_bigg_matches.csv', index=False)

In [76]:
unique_mets = df_metabolite[['Formal Type', 'Measurement Type']].drop_duplicates()
unique_mets['cid'] = unique_mets['Formal Type'].str.strip('cid:')

In [128]:
unique_mets[~unique_mets.cid.isin(unique_merges_manual.cid)].merge(results_database.drop_duplicates(subset='cid'))

Unnamed: 0,Formal Type,Measurement Type,cid,db,identifier
0,cid:5356793,"(2E,4E)-muconic acid",5356793,ChEBI,CHEBI:27036
1,cid:6912,"(2S,4R)-pentane-1,2,3,4,5-pentol",6912,InChIKey,HEBKCHPVOIAQTA-NGQZWQHPSA-N
2,cid:86860,2-(2-furyl)-2-hydroxy-acetic acid,86860,InChIKey,RTLDJXGEOSVJEX-UHFFFAOYSA-N
3,cid:53440600,hex-2-enedioic acid,53440600,InChIKey,HSBSUGYTMJWPAX-UHFFFAOYSA-N
4,cid:102424,"(3S,4R,5S)-3,4,5,6-tetrahydroxy-2-keto-hexanoi...",102424,KEGG,C15673
5,cid:345901,"2-methylpropane-1,2,3-triol",345901,InChIKey,OOJRTGIXWIUBGG-UHFFFAOYSA-N
6,cid:69507,"[(2R,3R,4S)-2,3,4,6-tetrahydroxy-5-keto-hexyl]...",69507,ChEBI,CHEBI:15946
7,cid:439958,"[(2R,3R,4S,5R)-2,3,4,5-tetrahydroxy-6-keto-hex...",439958,KEGG,C03251
8,cid:6251,"(2R,3R,4R,5R)-hexane-1,2,3,4,5,6-hexol",6251,KEGG,C00392
9,cid:5289590,"(3~{S},4~{R})-1,3,4,5-tetrahydroxypentan-2-one",5289590,ChEBI,CHEBI:17140


In [61]:
pd.Series(model_annotations.metabolite.unique()).isin(merged_identifiers.metabolite).sum()

71