# JMS annotate

SL 2022-02-14

In [1]:
import sys

!pip install asari-metabolomics mass2chem  



In [2]:
sys.path.append('jms')

In [3]:
from jms.dbStructures import knownCompoundDatabase, ExperimentalEcpdDatabase

In [4]:
import json

## Database data (HMDB)

In [5]:
KCD = knownCompoundDatabase()


In [6]:
# Load the HMDB compound list; the context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open('jms/data/compounds/list_compounds_HMDB4.json') as compound_file:
    list_compounds = json.load(compound_file)


In [7]:
KCD.mass_index_list_compounds(list_compounds)

In [8]:

print(list_compounds[2])

{'primary_id': 'HMDB0042005', 'primary_db': 'HMDB', 'name': 'Quinaprilat', 'neutral_formula': 'C23H26N2O5', 'neutral_formula_mass': 410.184171952, 'SMILES': '[H][C@@](C)(N[C@@]([H])(CCC1=CC=CC=C1)C(O)=O)C(=O)N1CC2=CC=CC=C2C[C@@]1([H])C(O)=O', 'inchikey': 'FLSLEGPOVLMJMN-YSSFQJQWSA-N', 'other_ids': {'PubChem': '107994', 'KEGG': '', 'ChEBI': '140296'}}


In [9]:
print(dir(KCD))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'build_emp_cpds_index', 'emp_cpds_trees', 'export_mass_indexed_compounds', 'export_search_emp_cpd_batch', 'mass_index_list_compounds', 'mass_indexed_compounds', 'search_emp_cpd_batch', 'search_emp_cpd_single', 'search_mz_batch', 'search_mz_single', 'short_report_emp_cpd']


In [10]:
list(KCD.mass_indexed_compounds.values())[:2]

[{'interim_id': 'C29H32ClN5O2_517.224453',
  'neutral_formula': 'C29H32ClN5O2',
  'neutral_formula_mass': 517.224453,
  'compounds': [{'primary_id': 'HMDB0042003',
    'primary_db': 'HMDB',
    'name': 'Pyronaridine',
    'neutral_formula': 'C29H32ClN5O2',
    'neutral_formula_mass': 517.224453,
    'SMILES': 'COC1=CC=C2NC3=C(C=CC(Cl)=C3)C(N=C3C=C(CN4CCCC4)C(=O)C(CN4CCCC4)=C3)=C2N1',
    'inchikey': 'YFYLPWJKCSESGB-UHFFFAOYSA-N',
    'other_ids': {'PubChem': '5485198', 'KEGG': '', 'ChEBI': ''}}]},
 {'interim_id': 'C10H9N_143.073499',
  'neutral_formula': 'C10H9N',
  'neutral_formula_mass': 143.073499293,
  'compounds': [{'primary_id': 'HMDB0042004',
    'primary_db': 'HMDB',
    'name': 'Quinaldine',
    'neutral_formula': 'C10H9N',
    'neutral_formula_mass': 143.073499293,
    'SMILES': 'CC1=NC2=CC=CC=C2C=C1',
    'inchikey': 'SMUQFGGVLNAIOZ-UHFFFAOYSA-N',
    'other_ids': {'PubChem': '7060', 'KEGG': '', 'ChEBI': '132813'}},
   {'primary_id': 'HMDB0033115',
    'primary_db': 'HMDB',


In [11]:
KCD.mass_indexed_compounds['C45H54N4O8_778.394165']

{'interim_id': 'C45H54N4O8_778.394165',
 'neutral_formula': 'C45H54N4O8',
 'neutral_formula_mass': 778.394164724,
 'compounds': [{'primary_id': 'HMDB0014505',
   'primary_db': 'HMDB',
   'name': 'Vinorelbine',
   'neutral_formula': 'C45H54N4O8',
   'neutral_formula_mass': 778.394164724,
   'SMILES': '[H][C@@]12N(C)C3=CC(OC)=C(C=C3[C@@]11CCN3CC=C[C@@](CC)([C@@H](OC(C)=O)[C@@]2(O)C(=O)OC)[C@@]13[H])[C@]1(C[C@@]2([H])CN(CC(CC)=C2)CC2=C1NC1=CC=CC=C21)C(=O)OC',
   'inchikey': 'GBABOYUKABKIAF-GHYRFKGUSA-N',
   'other_ids': {'PubChem': '44424639', 'KEGG': '', 'ChEBI': '480999'}}]}

In [12]:
KCD.build_emp_cpds_index()

In [13]:
# query_mz, mode='pos', mz_tolerance_ppm=5
KCD.search_mz_single(130.0172)

[{'mz': 130.017306555,
  'parent_epd_id': 'C4H3FN2O2_130.017856',
  'ion_relation': 'M[1+]'}]

In [14]:
KCD.emp_cpds_trees['pos'][13001]

[{'mz': 130.017306555,
  'parent_epd_id': 'C4H3FN2O2_130.017856',
  'ion_relation': 'M[1+]'}]

In [15]:
KCD.search_mz_batch([115.633, 130.0172])

[[],
 [{'mz': 130.017306555,
   'parent_epd_id': 'C4H3FN2O2_130.017856',
   'ion_relation': 'M[1+]'}]]

## Experimental data

In [16]:
def read_table_to_peaks(infile, delimiter='\t'):
    '''
    Read a delimited feature table and return its rows as a list of peak dicts.

    The first line is treated as a header and skipped. Blank lines are ignored.
    Values are taken from fixed 0-based column positions:
    13 -> id_number (kept as str), 2 -> mz, 3 -> apex, 5 -> height,
    10 -> cSelectivity, 11 -> goodness_fitting, 12 -> snr (all cast to float).
    NOTE(review): this column layout presumably matches an asari feature
    table - confirm against the file being loaded.

    Parameters
    ----------
    infile : path of the text table to read.
    delimiter : field separator, tab by default.

    Returns
    -------
    list of peaks, e.g. [
        {
        'id_number': 'F1',
        'mz': 133.0970,
        'apex': 654.0,
        'height': 14388.0,
        'cSelectivity': 1.0,
        'goodness_fitting': 0.95,
        'snr': 35.0,
        }, ...
    ]
    The number of peaks is also printed.

    Raises
    ------
    IndexError if a data row has fewer than 14 fields;
    ValueError if a numeric field cannot be parsed as float.
    '''
    list_peaks = []
    # Context manager guarantees the file is closed even if parsing fails.
    with open(infile) as handle:
        next(handle, None)      # skip the header line (no-op on an empty file)
        for line in handle:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            a = line.rstrip().split(delimiter)
            list_peaks.append({
                'id_number': a[13],
                'mz': float(a[2]),
                'apex': float(a[3]),
                'height': float(a[5]),
                'cSelectivity': float(a[10]),
                'goodness_fitting': float(a[11]),
                'snr': float(a[12]),
            })

    print(len(list_peaks))
    return list_peaks


In [17]:
ipsc = read_table_to_peaks('pos_cmap_feature_table.csv', ',')
ipsc[0]

9786


{'id_number': 'F1',
 'mz': 81.04470247776916,
 'apex': 84.0,
 'height': 172960.79114030077,
 'cSelectivity': 1.0,
 'goodness_fitting': 0.9466921897715592,
 'snr': 35.0}

In [18]:
EED = ExperimentalEcpdDatabase()

In [19]:
EED.build_from_list_peaks(ipsc)



Annotating empirical compounds on 9786 features/peaks, ...
Round 1 - numbers of epds and included peaks:  (1613, 3668)
Round 2 - numbers of epds and included peaks:  (1613, 4260)
Round 3 - numbers of epds and included peaks:  (2181, 5480)


In [20]:
list(EED.dict_empCpds.values())[12]

{'interim_id': 12,
 'neutral_formula_mass': None,
 'neutral_formula': None,
 'Database_referred': [],
 'identity': [],
 'MS1_pseudo_Spectra': [{'feature_id': 'F94',
   'mz': 98.97531935309317,
   'rtime': 700.0,
   'charged_formula': '',
   'ion_relation': 'anchor',
   'parent_epd_id': 12},
  {'feature_id': 'F8161',
   'mz': 99.9786892845517,
   'rtime': 701.0,
   'charged_formula': '',
   'ion_relation': '13C/12C',
   'parent_epd_id': 12},
  {'feature_id': 'F317',
   'mz': 116.98587698692174,
   'rtime': 700.0,
   'charged_formula': '',
   'ion_relation': 'anchor,+H2O',
   'parent_epd_id': 12}],
 'MS2_Spectra': []}

In [21]:
EED.search_peaks_mz_single(143.1260)

[{'id_number': 'F723',
  'mz': 143.12603442721115,
  'apex': 28.0,
  'height': 341842.15591380745,
  'cSelectivity': 1.0,
  'goodness_fitting': 0.9114621608245448,
  'snr': 23.0},
 {'id_number': 'F724',
  'mz': 143.12603442721115,
  'apex': 34.0,
  'height': 348133.74506984063,
  'cSelectivity': 1.0,
  'goodness_fitting': 0.8897818529939553,
  'snr': 24.0},
 {'id_number': 'F725',
  'mz': 143.12603442721115,
  'apex': 387.0,
  'height': 65955.56503863656,
  'cSelectivity': 0.7613636363636364,
  'goodness_fitting': 0.8434391059138034,
  'snr': 4.0},
 {'id_number': 'F726',
  'mz': 143.12603442721115,
  'apex': 403.0,
  'height': 75066.65470082065,
  'cSelectivity': 0.8322147651006712,
  'goodness_fitting': 0.9484010353768128,
  'snr': 5.0},
 {'id_number': 'F727',
  'mz': 143.12603442721115,
  'apex': 115.0,
  'height': 160672.45227231205,
  'cSelectivity': 0.32786885245901637,
  'goodness_fitting': 0.7581664034528794,
  'snr': 11.0}]

In [22]:
EED.search_empCpds_mz_single(143.1260)

[{'feature_id': 'F725',
  'mz': 143.12603442721115,
  'rtime': 387.0,
  'charged_formula': '',
  'ion_relation': 'M(13C),M(34S)',
  'parent_epd_id': 112},
 {'feature_id': 'F726',
  'mz': 143.12603442721115,
  'rtime': 403.0,
  'charged_formula': '',
  'ion_relation': 'M(13C),M(34S)',
  'parent_epd_id': 113}]

In [23]:
EED.search_peaks_mz_batch([143.1260, 411.1841, 201.994])

[[{'id_number': 'F723',
   'mz': 143.12603442721115,
   'apex': 28.0,
   'height': 341842.15591380745,
   'cSelectivity': 1.0,
   'goodness_fitting': 0.9114621608245448,
   'snr': 23.0},
  {'id_number': 'F724',
   'mz': 143.12603442721115,
   'apex': 34.0,
   'height': 348133.74506984063,
   'cSelectivity': 1.0,
   'goodness_fitting': 0.8897818529939553,
   'snr': 24.0},
  {'id_number': 'F725',
   'mz': 143.12603442721115,
   'apex': 387.0,
   'height': 65955.56503863656,
   'cSelectivity': 0.7613636363636364,
   'goodness_fitting': 0.8434391059138034,
   'snr': 4.0},
  {'id_number': 'F726',
   'mz': 143.12603442721115,
   'apex': 403.0,
   'height': 75066.65470082065,
   'cSelectivity': 0.8322147651006712,
   'goodness_fitting': 0.9484010353768128,
   'snr': 5.0},
  {'id_number': 'F727',
   'mz': 143.12603442721115,
   'apex': 115.0,
   'height': 160672.45227231205,
   'cSelectivity': 0.32786885245901637,
   'goodness_fitting': 0.7581664034528794,
   'snr': 11.0}],
 [],
 [{'id_number'

In [24]:
EED.search_peaks_compound_single(list_compounds[199], mz_tolerance_ppm=5)

[('M[1+]', []),
 ('M+H[1+]', []),
 ('M+Na[1+]', []),
 ('M+H2O+H[1+]', []),
 ('M(C13)[1+]', []),
 ('M(C13)+H[1+]', []),
 ('M+2H[2+]', []),
 ('M+3H[3+]', []),
 ('M(C13)+2H[2+]', []),
 ('M(C13)+3H[3+]', []),
 ('M+H+Na[2+]', []),
 ('M+K[1+]', []),
 ('M+NaCl[1+]', []),
 ('M+NH4[1+]', []),
 ('M-H2O+H[1+]', []),
 ('M-H4O2+H[1+]', []),
 ('M-CO+H[1+]', []),
 ('M-CO2+H[1+]', []),
 ('M-HCOOH+H[1+]', []),
 ('M+HCOONa[1+]', []),
 ('M-C3H4O2+H[1+]', []),
 ('M+HCOOK[1+]', [])]

In [25]:
EED.search_peaks_compound_single(list_compounds[199], mz_tolerance_ppm=50)

[('M[1+]', []),
 ('M+H[1+]', []),
 ('M+Na[1+]', []),
 ('M+H2O+H[1+]', []),
 ('M(C13)[1+]', []),
 ('M(C13)+H[1+]', []),
 ('M+2H[2+]', []),
 ('M+3H[3+]', []),
 ('M(C13)+2H[2+]', []),
 ('M(C13)+3H[3+]',
  [{'id_number': 'F3270',
    'mz': 296.9461017847061,
    'apex': 370.0,
    'height': 354415.6666666666,
    'cSelectivity': 1.0,
    'goodness_fitting': 0.7441239970931866,
    'snr': 354415.0}]),
 ('M+H+Na[2+]',
  [{'id_number': 'F4509',
    'mz': 455.3928166802476,
    'apex': 212.0,
    'height': 2851107.61997203,
    'cSelectivity': 1.0,
    'goodness_fitting': 0.9094392761917989,
    'snr': 2851107.0}]),
 ('M+K[1+]', []),
 ('M+NaCl[1+]', []),
 ('M+NH4[1+]', []),
 ('M-H2O+H[1+]', []),
 ('M-H4O2+H[1+]', []),
 ('M-CO+H[1+]', []),
 ('M-CO2+H[1+]', []),
 ('M-HCOOH+H[1+]', []),
 ('M+HCOONa[1+]', []),
 ('M-C3H4O2+H[1+]', []),
 ('M+HCOOK[1+]', [])]

## Match empirical compounds (empCpds)

In [26]:
EED.dict_empCpds[15]

{'interim_id': 15,
 'neutral_formula_mass': None,
 'neutral_formula': None,
 'Database_referred': [],
 'identity': [],
 'MS1_pseudo_Spectra': [{'feature_id': 'F117',
   'mz': 100.11207049661286,
   'rtime': 221.0,
   'charged_formula': '',
   'ion_relation': 'anchor',
   'parent_epd_id': 15},
  {'feature_id': 'F132',
   'mz': 101.11543204162328,
   'rtime': 221.0,
   'charged_formula': '',
   'ion_relation': '13C/12C',
   'parent_epd_id': 15}],
 'MS2_Spectra': []}

In [27]:
KCD.search_emp_cpd_single( EED.dict_empCpds[15] )

[('C6H14N_100.112624', 'M[1+]', 2), ('C6H13N_99.104799', 'M+H[1+]', 2)]

In [28]:
KCD.search_emp_cpd_batch( [EED.dict_empCpds[ii] for ii in range(15,20)] )

[[('C6H14N_100.112624', 'M[1+]', 2), ('C6H13N_99.104799', 'M+H[1+]', 2)],
 [('C4H8N2O_100.063663', 'M+H[1+]', 2),
  ('C4H6N2_82.053098', 'M+H2O+H[1+]', 1)],
 [('C4H7NO2_101.047678', 'M+H[1+]', 2)],
 [('C4H7NO2_101.047678', 'M+H[1+]', 2)],
 []]

In [29]:
search_result = EED.annotate_against_KCD(KCD)

In [30]:
search_result[99]

[('C7H8N2O_136.063663', 'M+H[1+]', 2),
 ('C7H9N2O_137.071488', 'M[1+]', 2),
 ('C7H6N2_118.053098', 'M+H2O+H[1+]', 1)]

In [31]:
EED.list_peaks[2]

{'id_number': 'F3',
 'mz': 82.06515664524503,
 'apex': 624.0,
 'height': 31427.481827291795,
 'cSelectivity': 0.14285714285714285,
 'goodness_fitting': 0.741785986913415,
 'snr': 19.0}

In [32]:
list(EED.peak_to_empCpd.items())[:3]

[('F4', 0), ('F9', 0), ('F5', 1)]

In [33]:
EED.export_annotations(search_result, KCD, "test14")


Annotation of 2181 Empirical compounds was written to test14.tsv.


In [35]:
from jms.dbStructures import annotate_peaks_against_kcds

In [37]:
#
# Test wrapper function. One could run this single wrapper function 
# right after getting the lists of peaks and compounds, and skip most steps above.
#
annotate_peaks_against_kcds(ipsc, list_compounds, 
                                export_file_name_prefix='jms_annotated_',
                                mode='pos',  mz_tolerance_ppm=5)



Annotating empirical compounds on 9786 features/peaks, ...
Round 1 - numbers of epds and included peaks:  (1613, 3668)
Round 2 - numbers of epds and included peaks:  (1613, 4260)
Round 3 - numbers of epds and included peaks:  (2181, 5480)

Annotation of 2181 Empirical compounds was written to jms_annotated_.tsv.


## Summary

The JMS package handles empirical compound (empCpd) grouping in both database-derived and experiment-derived data.

We have implemented empCpd-based matching. For experimental data without empCpd matching, the search falls back to m/z-based matching.

The wrapper function `annotate_peaks_against_kcds` does all of the above in a single step.

Note the export `KCD.export_mass_indexed_compounds(export_file_name_prefix+"KCD_mass_indexed_compounds.json")` can be bulky.