# Dev: JMS to mcg3 (mummichog3)



In [1]:
!pip install --upgrade jms-metabolite-services khipu-metabolomics

Requirement already up-to-date: jms-metabolite-services in /opt/conda/lib/python3.7/site-packages (0.5.4)
Requirement already up-to-date: khipu-metabolomics in /opt/conda/lib/python3.7/site-packages (0.6.0)


In [2]:
import json
from khipu.epdsConstructor import epdsConstructor
from jms.dbStructures import knownCompoundDatabase, ExperimentalEcpdDatabase
from jms.io import read_table_to_peaks
from jms.modelConvert import convert_json_model, DataMeetModel

In [3]:
# Load the metabolic model from JSON. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('metabolicModel_az_HumanGEM_20220302_noCompartmentalization.json') as model_file:
    _m = json.load(model_file)
print(_m.keys())
# Convert the raw JSON model into the structure used by JMS; the cell displays its top-level keys.
mmodel = convert_json_model(_m)
mmodel.keys()

dict_keys(['id', 'list_of_reactions', 'list_of_compounds', 'list_of_pathways', 'meta_data'])


dict_keys(['id', 'version', 'Compounds', 'dict_cpds_def', 'metabolic_rxns', 'cpd_edges', 'edge2rxn', 'edge2enzyme', 'metabolic_pathways', 'cpd2pathways'])

In [4]:
# Display one compound record (position 5) to inspect which fields a converted compound carries.
list(mmodel['Compounds'].values())[5]

{'id': 'MAM00006',
 'name': '(11Z)-docosenoyl-CoA',
 'identifiers': [['humanGEM', 'MAM00006'], ['vmhmetabolite', 'M00006']],
 'neutral_formula': 'C43H76N7O17P3S',
 'charge': -4,
 'charged_formula': 'C43H72N7O17P3S',
 'neutral_mono_mass': 1087.42312618056,
 'SMILES': '',
 'inchi': '',
 'neutral_formula_mass': 1087.42312618056}

In [5]:
# Distinct neutral formula masses over all model compounds, in ascending order.
masses = sorted({cpd['neutral_formula_mass'] for cpd in mmodel['Compounds'].values()})
print(len(masses))


2120


In [6]:
# Keep only masses strictly between 80 and 1000.
masses = [mass for mass in masses if mass > 80 and mass < 1000]
print(len(masses))

1748


In [7]:
# Peek at five consecutive values of the sorted, filtered mass list.
masses[200: 205]

[161.04767846897,
 161.06880783691003,
 161.10519334614003,
 161.90421231733998,
 162.01643791062003]

In [8]:
# Number of compound entries in the model (compare with the number of distinct masses printed above).
len(mmodel['Compounds'])

3384

## A peek at how KCD works

This section is optional: a knownCompoundDatabase (KCD) is also used inside DataMeetModel, so it does not need to be built by hand.

In [9]:
# Build a known-compound database from the model compounds.
KCD = knownCompoundDatabase()
# Index the model compounds; the result is inspected below via KCD.mass_indexed_compounds.
KCD.mass_index_list_compounds(mmodel['Compounds'].values())
# NOTE(review): presumably this populates KCD.emp_cpds_trees used for m/z search below — confirm in jms.
KCD.build_emp_cpds_index()

In [10]:
# Display one (key, entry) pair of the mass-indexed compound table (arbitrary position 55).
list(KCD.mass_indexed_compounds.items())[55]

('C33H56N7O18P3S_963.26154',
 {'interim_id': 'C33H56N7O18P3S_963.26154',
  'neutral_formula': 'C33H56N7O18P3S',
  'neutral_formula_mass': 963.2615401561299,
  'compounds': [{'id': 'MAM00082',
    'name': '(3S)-3-hydroxydodec-cis-6-enoyl-CoA',
    'identifiers': [['humanGEM', 'MAM00082'],
     ['pubchem.compound', '53481428'],
     ['vmhmetabolite', 'CE2420'],
     ['metanetx.chemical', 'MNXM31119']],
    'neutral_formula': 'C33H56N7O18P3S',
    'charge': -4,
    'charged_formula': 'C33H52N7O18P3S',
    'neutral_mono_mass': 963.2615401561299,
    'SMILES': '',
    'inchi': '',
    'neutral_formula_mass': 963.2615401561299},
   {'id': 'MAM00868',
    'name': '3-oxododecanoyl-CoA',
    'identifiers': [['humanGEM', 'MAM00868'],
     ['bigg.metabolite', '3oddcoa'],
     ['kegg.compound', 'C05263'],
     ['chebi', '27868'],
     ['pubchem.compound', '440604'],
     ['lipidmaps', 'LMFA07050013'],
     ['vmhmetabolite', '3oddcoa'],
     ['metanetx.chemical', 'MNXM705']],
    'neutral_formula':

In [11]:
# Display the positive-mode ion entries stored under key 96426.
# NOTE(review): the key looks like int(m/z * 100) given the ~964.26 values returned — confirm in jms.
KCD.emp_cpds_trees['pos'][96426]

[{'mz': 964.2688166228999,
  'parent_epd_id': 'C33H56N7O18P3S_963.26154',
  'ion_relation': 'M+H[1+]'},
 {'mz': 964.2688519389299,
  'parent_epd_id': 'C33H54N7O17P3S_945.250975',
  'ion_relation': 'M+H2O+H[1+]'},
 {'mz': 964.2618323695401,
  'parent_epd_id': 'C44H50CaF2N6O10S2_964.262381',
  'ion_relation': 'M[1+]'}]

In [12]:
# Search the known-compound database for a single m/z value in positive mode.
# NOTE(review): the third argument (5) is presumably the m/z tolerance in ppm — confirm against the jms API.
KCD.search_mz_single(964.261, 'pos', 5)

[{'mz': 964.2618323695401,
  'parent_epd_id': 'C44H50CaF2N6O10S2_964.262381',
  'ion_relation': 'M[1+]'}]

In [13]:
# Look up the full entry for the parent_epd_id returned by the m/z search above.
KCD.mass_indexed_compounds['C44H50CaF2N6O10S2_964.262381']

{'interim_id': 'C44H50CaF2N6O10S2_964.262381',
 'neutral_formula': 'C44H50CaF2N6O10S2',
 'neutral_formula_mass': 964.2623813695401,
 'compounds': [{'id': 'MAM03920',
   'name': 'rosuvastatin-5S-lactone',
   'identifiers': [['humanGEM', 'MAM03920'],
    ['bigg.metabolite', 'rsvlac'],
    ['vmhmetabolite', 'rsvlac']],
   'neutral_formula': 'C44H50CaF2N6O10S2',
   'charge': 2,
   'charged_formula': 'C44H52CaF2N6O10S2',
   'neutral_mono_mass': 964.2623813695401,
   'SMILES': '',
   'inchi': '',
   'neutral_formula_mass': 964.2623813695401}]}

## Input user data

In [14]:
# help(read_table_to_peaks)

# Alternative input file with a different column layout, kept for reference:
# ff = read_table_to_peaks("testdata0710.txt", mz_col=0, rtime_col=1, feature_id=4, full_extract=True)

# Read the feature table; m/z, retention time and feature id are selected by column position.
# NOTE(review): full_extract=True presumably keeps the remaining columns as extra fields — confirm in jms.io.
ff2 = read_table_to_peaks("preferred_Feature_table.tsv", mz_col=1, rtime_col=2, feature_id=0, full_extract=True)
# Sanity check: number of features and the first two records.
print(len(ff2), ff2[:2])

10166 [{'id_number': 'F2', 'mz': 69.0453, 'rtime': 1.0, 'rtime_left_base': '0.46', 'rtime_right_base': '2.88', 'parent_masstrack_id': '4', 'peak_area': '353341129', 'cSelectivity': '0.82', 'goodness_fitting': '0.98', 'snr': '27', 'detection_counts': '15', 'MT_20221018_093': '23566344', 'MT_20221018_013': '14424637', 'MT_20221018_021': '28418184', 'MT_20221018_063': '22903638', 'MT_20221018_065': '18022378', 'MT_20221018_073': '21919355', 'MT_20221018_081': '26009080', 'MT_20221018_085': '24156015', 'MT_20221018_087': '23769280', 'MT_20221018_121': '19344598', 'MT_20221018_127': '19197467', 'MT_20221018_131': '20864825', 'MT_20221018_133': '16788621', 'MT_20221018_135': '20974472', 'MT_20221018_143': '23961868\n', 'apex': 1.0}, {'id_number': 'F3', 'mz': 118.1098, 'rtime': 137.49, 'rtime_left_base': '135.28', 'rtime_right_base': '139.29', 'parent_masstrack_id': '686', 'peak_area': '7603634', 'cSelectivity': '1.0', 'goodness_fitting': '0.98', 'snr': '563', 'detection_counts': '12', 'MT_20

**Testing code**

In [15]:
# Experimental empirical-compound database: positive mode, 5 ppm m/z tolerance, rt tolerance of 2
# (rt units are not stated here).
EED = ExperimentalEcpdDatabase(mode='pos', mz_tolerance_ppm=5, rt_tolerance=2)
# Build empirical compounds from the feature list; results are stored in EED.dict_empCpds (inspected below).
EED.build_from_list_peaks(ff2)
# Annotate the empirical compounds against the known-compound database.
EED.extend_empCpd_annotation(KCD)
# NOTE(review): presumably annotates features that were left as singletons — confirm against jms.
EED.annotate_singleton_mummichog(KCD)



Initial khipu search grid: 
               M+H+       Na/H        HCl        K/H        ACN
M0         1.007276  22.989276  36.983976  38.963158  42.033825
13C/12C    2.010631  23.992631  37.987331  39.966513  43.037180
13C/12C*2  3.013986  24.995986  38.990686  40.969868  44.040535


Empty network -  ['F3631'] [] [] []
Empty network -  ['F1892'] [] [] []
Empty network -  ['F2931'] [] [] []
Empty network -  ['F299'] [] [] []
Unknown isotope match ~  (93.0656, 'F1428')
Empty network -  ['F5825'] [] [] []
Downsized input network with 17 features, highest peak at F5687 
Empty network -  ['F4932'] [] [] []
Empty network -  ['F2679'] [] [] []
Empty network -  ['F7840'] [] [] []
Empty network -  ['F8877'] [] [] []
Downsized input network with 20 features, highest peak at F2716 
Empty network -  ['F229'] [] [] []
Empty network -  ['F2716'] [] [] []
Unknown isotope match ~  (111.0808, 'F3352')
Downsized input network with 24 features, highest peak at F3622 
Empty network -  ['F9768'] [] [] [

In [16]:
# Total number of empirical compounds held by EED.
len(EED.dict_empCpds)

2889

In [17]:
# Empirical compounds that carry a 'list_matches' field, i.e. that were matched to the database.
matched = [empcpd for empcpd in EED.dict_empCpds.values() if 'list_matches' in empcpd]
print(len(matched), matched[55])

1240 {'interim_id': 'kp204_202.1317', 'neutral_formula_mass': 202.13169603323, 'neutral_formula': None, 'Database_referred': [], 'identity': [], 'MS1_pseudo_Spectra': [{'id_number': 'F3684', 'mz': 203.1389, 'rtime': 23.04, 'rtime_left_base': '21.88', 'rtime_right_base': '25.37', 'parent_masstrack_id': '2495', 'peak_area': '402792917', 'cSelectivity': '0.86', 'goodness_fitting': '0.98', 'snr': '120', 'detection_counts': '15', 'MT_20221018_093': '171647657', 'MT_20221018_013': '44197511', 'MT_20221018_021': '22144605', 'MT_20221018_063': '54382616', 'MT_20221018_065': '90272901', 'MT_20221018_073': '73231696', 'MT_20221018_081': '139954513', 'MT_20221018_085': '98197377', 'MT_20221018_087': '58414834', 'MT_20221018_121': '110314351', 'MT_20221018_127': '62762335', 'MT_20221018_131': '37828795', 'MT_20221018_133': '90738788', 'MT_20221018_135': '95355869', 'MT_20221018_143': '106724805\n', 'apex': 23.04, 'representative_intensity': '402792917', 'id': 'F3684', 'isotope': 'M0', 'modificatio

In [18]:
# Look up a KCD entry whose mass (202.131742) is close to that of the empirical compound printed above (kp204_202.1317).
KCD.mass_indexed_compounds['C9H18N2O3_202.131742']

{'interim_id': 'C9H18N2O3_202.131742',
 'neutral_formula': 'C9H18N2O3',
 'neutral_formula_mass': 202.13174244717,
 'compounds': [{'id': 'MAM03375',
   'name': 'L-Alanyl-L-Leucine',
   'identifiers': [['humanGEM', 'MAM03375'],
    ['bigg.metabolite', 'CE5866'],
    ['pubchem.compound', '6992388'],
    ['vmhmetabolite', 'CE5866'],
    ['metanetx.chemical', 'MNXM15786'],
    ['inchi',
     'InChI=1S/C9H18N2O3/c1-5(2)4-7(9(13)14)11-8(12)6(3)10/h5-7H,4,10H2,1-3H3,(H,11,12)(H,13,14)/t6-,7+/m1/s1']],
   'neutral_formula': 'C9H18N2O3',
   'charge': 0,
   'charged_formula': 'C9H18N2O3',
   'neutral_mono_mass': 202.13174244717,
   'SMILES': '',
   'inchi': '',
   'neutral_formula_mass': 202.13174244717}]}

In [19]:
# Distinct database entry ids (first element of every match tuple) across all matched empirical compounds.
all_match_cpds = {match[0] for empcpd in matched for match in empcpd['list_matches']}

print(len(all_match_cpds))

675


**Found 675 matched compound entries (distinct formula–mass ids, each of which may group several compounds) from the model**

Got 1974 khipus, with 5735 features (presumably; the original note omitted the unit).

Total 2889 empCpds, including singletons.

In [20]:
# Build DataMeetModel from the converted model and the feature list, using default settings.
# Custom settings can be supplied as well (original note: parameters=default_parameters).
DMM = DataMeetModel(mmodel, ff2)
# Display the adduct patterns DMM is configured with.
DMM.adduct_patterns

[(21.982, 'Na/H'), (41.026549, 'ACN'), (35.9767, 'HCl'), (37.955882, 'K/H')]

In [21]:
# Run the matching of user features to the model; the returned mapping's values are empirical compounds (inspected below).
dict_empCpds = DMM.match_all()



Initial khipu search grid: 
               M+H+       Na/H        HCl        K/H        ACN
M0         1.007276  22.989276  36.983976  38.963158  42.033825
13C/12C    2.010631  23.992631  37.987331  39.966513  43.037180
13C/12C*2  3.013986  24.995986  38.990686  40.969868  44.040535


Empty network -  ['F3631'] [] [] []
Empty network -  ['F1892'] [] [] []
Empty network -  ['F2931'] [] [] []
Empty network -  ['F299'] [] [] []
Unknown isotope match ~  (93.0656, 'F1428')
Empty network -  ['F5825'] [] [] []
Downsized input network with 17 features, highest peak at F5687 
Empty network -  ['F4932'] [] [] []
Empty network -  ['F2679'] [] [] []
Empty network -  ['F7840'] [] [] []
Empty network -  ['F8877'] [] [] []
Downsized input network with 20 features, highest peak at F2716 
Empty network -  ['F229'] [] [] []
Empty network -  ['F2716'] [] [] []
Unknown isotope match ~  (111.0808, 'F3352')
Downsized input network with 24 features, highest peak at F3622 
Empty network -  ['F9768'] [] [] [

In [24]:
# Display one empirical compound (arbitrary position 300) to inspect its fields.
list(dict_empCpds.values())[300]

{'interim_id': 'kp301_269.3081',
 'neutral_formula_mass': 269.30814603323,
 'neutral_formula': None,
 'Database_referred': [],
 'identity': [],
 'MS1_pseudo_Spectra': [{'id_number': 'F5270',
   'mz': 271.3188,
   'rtime': 17.3,
   'rtime_left_base': '16.96',
   'rtime_right_base': '18.88',
   'parent_masstrack_id': '3946',
   'peak_area': '10913252',
   'cSelectivity': '1.0',
   'goodness_fitting': '0.91',
   'snr': '34',
   'detection_counts': '15',
   'MT_20221018_093': '593828',
   'MT_20221018_013': '978794',
   'MT_20221018_021': '632486',
   'MT_20221018_063': '494116',
   'MT_20221018_065': '599926',
   'MT_20221018_073': '528746',
   'MT_20221018_081': '611566',
   'MT_20221018_085': '1450287',
   'MT_20221018_087': '578496',
   'MT_20221018_121': '704241',
   'MT_20221018_127': '1082292',
   'MT_20221018_131': '720674',
   'MT_20221018_133': '911761',
   'MT_20221018_135': '547266',
   'MT_20221018_143': '634420\n',
   'apex': 17.3,
   'representative_intensity': '10913252',
 

In [25]:
# Check the newly matched empirical compounds: keep those that carry a 'list_matches' field.
matched = [empcpd for empcpd in dict_empCpds.values() if 'list_matches' in empcpd]
print(len(matched), matched[55])

1240 {'interim_id': 'kp204_202.1317', 'neutral_formula_mass': 202.13169603323, 'neutral_formula': None, 'Database_referred': [], 'identity': ['MAM03375'], 'MS1_pseudo_Spectra': [{'id_number': 'F3684', 'mz': 203.1389, 'rtime': 23.04, 'rtime_left_base': '21.88', 'rtime_right_base': '25.37', 'parent_masstrack_id': '2495', 'peak_area': '402792917', 'cSelectivity': '0.86', 'goodness_fitting': '0.98', 'snr': '120', 'detection_counts': '15', 'MT_20221018_093': '171647657', 'MT_20221018_013': '44197511', 'MT_20221018_021': '22144605', 'MT_20221018_063': '54382616', 'MT_20221018_065': '90272901', 'MT_20221018_073': '73231696', 'MT_20221018_081': '139954513', 'MT_20221018_085': '98197377', 'MT_20221018_087': '58414834', 'MT_20221018_121': '110314351', 'MT_20221018_127': '62762335', 'MT_20221018_131': '37828795', 'MT_20221018_133': '90738788', 'MT_20221018_135': '95355869', 'MT_20221018_143': '106724805\n', 'apex': 23.04, 'representative_intensity': '402792917', 'id': 'F3684', 'isotope': 'M0', 'm

In [26]:
# Display one matched empirical compound (arbitrary position 630) with its 'list_matches' and 'identity' fields.
matched[630]

{'interim_id': 'epd_F7704',
 'MS1_pseudo_Spectra': [{'id_number': 'F7704',
   'mz': 286.2291,
   'rtime': 16.1,
   'rtime_left_base': '15.4',
   'rtime_right_base': '16.96',
   'parent_masstrack_id': '4248',
   'peak_area': '5126118',
   'cSelectivity': '1.0',
   'goodness_fitting': '0.96',
   'snr': '1043',
   'detection_counts': '6',
   'MT_20221018_093': '558723',
   'MT_20221018_013': '0',
   'MT_20221018_021': '0',
   'MT_20221018_063': '0',
   'MT_20221018_065': '0',
   'MT_20221018_073': '1138426',
   'MT_20221018_081': '0',
   'MT_20221018_085': '0',
   'MT_20221018_087': '916117',
   'MT_20221018_121': '0',
   'MT_20221018_127': '865752',
   'MT_20221018_131': '0',
   'MT_20221018_133': '1149923',
   'MT_20221018_135': '0',
   'MT_20221018_143': '346389\n',
   'apex': 16.1,
   'representative_intensity': '5126118',
   'id': 'F7704'}],
 'list_matches': [('C20H30O_286.229666', 'M[1+]', 1)],
 'identity': ['MAM00291', 'MAM00351', 'MAM01232', 'MAM02834']}

In [27]:
# Distinct database entry ids (first element of every match tuple) across the DataMeetModel matches.
all_match_cpds = {match[0] for empcpd in matched for match in empcpd['list_matches']}

print(len(all_match_cpds))

675


In [None]:
# NOTE: the triple-quoted string below is a disabled example, not executed code.
# It sketches a custom parameter dict for DataMeetModel.
# NOTE(review): how the dict is passed is not shown here (presumably a `parameters` argument) — confirm.
'''
#
# One can specify parameters to DataMeetModel
#

from khipu.utils import adduct_search_patterns, \
                            adduct_search_patterns_neg, \
                                isotope_search_patterns, \
                                    extended_adducts
para = {
    'mode': 'pos',
    'isotope_search_patterns': isotope_search_patterns[:2],
    'adduct_patterns': adduct_search_patterns,
    'extended_adducts': extended_adducts,
    'mz_tolerance_ppm': 5,
    'rt_tolerance': 2,
}
'''

In [30]:
def check_13C(empCpd):
    """Return True if the empirical compound contains a '13C/12C' isotope peak.

    Parameters
    ----------
    empCpd : dict
        Empirical compound with a 'MS1_pseudo_Spectra' list of peak dicts.
        Peaks without an 'isotope' key are ignored.

    Returns
    -------
    bool
        True if any peak has isotope == '13C/12C', otherwise False.

    Raises
    ------
    KeyError
        If empCpd has no 'MS1_pseudo_Spectra' field.
    """
    # any() stops at the first hit and yields a bool directly, so neither an
    # intermediate list nor an if/else around the membership test is needed.
    return any(
        peak.get('isotope') == '13C/12C'
        for peak in empCpd['MS1_pseudo_Spectra']
    )

In [31]:
# Count the empirical compounds that contain at least one 13C/12C isotope peak.
sum(1 for empcpd in dict_empCpds.values() if check_13C(empcpd))

1373

## Conclusion

We used `jms.modelConvert.convert_json_model` and `DataMeetModel` to reformat a metabolic model and match it to a user-supplied feature list.

The result of `DataMeetModel.match_all()` is a dictionary of empirical compounds; those matched to the model carry the matching information in the fields `list_matches` and `identity`. This collection is used in mummichog3 as the connection between user data and a metabolic model.