# Annotation of SZ22 by khipu, using feature tables from asari vs XCMS

SL 2023-1-29

In [1]:
import json

In [18]:
# Load the khipu annotation results for each feature table.
# Context managers close the file handles, which bare open() calls left open.
with open('szasari.json') as f:
    asari = json.load(f)
with open('szxcmsh.json') as f:
    xcms = json.load(f)

In [19]:
len(asari), len(xcms)

(295, 268)

In [20]:
asari[0]

{'interim_id': 'kp1_128.0583',
 'neutral_formula_mass': 128.05827686656332,
 'neutral_formula': None,
 'Database_referred': [],
 'identity': [],
 'MS1_pseudo_Spectra': [{'id': 'F37',
   'mz': 134.0824,
   'rtime': 25.36,
   'intensities': [23.79, 28.07, 268.0],
   'representative_intensity': 106.62,
   'parent_masstrack_id': '134.0824',
   'isotope': '13C/12C*5',
   'modification': 'M+H+',
   'ion_relation': '13C/12C*5,M+H+'},
  {'id': 'F2',
   'mz': 132.0752,
   'rtime': 24.48,
   'intensities': [22.38, 26.91, 257.0],
   'representative_intensity': 102.09666666666668,
   'parent_masstrack_id': '132.0752',
   'isotope': '13C/12C*3',
   'modification': 'M+H+',
   'ion_relation': '13C/12C*3,M+H+'},
  {'id': 'F1378',
   'mz': 129.0659,
   'rtime': 25.58,
   'intensities': [22.86, 28.31, 1091.0],
   'representative_intensity': 380.72333333333336,
   'parent_masstrack_id': '129.0659',
   'isotope': 'M0',
   'modification': 'M+H+',
   'ion_relation': 'M0,M+H+'}],
 'MS2_Spectra': []}

In [21]:
# Get epds with both M0 and M2 or beyond

def is_good(p):
    '''Tell whether an empirical compound has M0 plus a higher 13C isotopologue.

    p is a dict whose 'MS1_pseudo_Spectra' entry is a list of peak dicts;
    peaks without an 'isotope' key are ignored.

    M1 is labelled '13C/12C' (no '*'), so searching for the substring
    '13C/12C*' only picks up labels that carry a multiplier.

    Returns True when an 'M0' peak and at least one '13C/12C*' peak are
    both present, otherwise False.
    '''
    labels = [peak['isotope'] for peak in p['MS1_pseudo_Spectra'] if 'isotope' in peak]

    if 'M0' not in labels:
        return False

    heavier = [label for label in labels if '13C/12C*' in label]
    return len(heavier) > 0
    

In [10]:
is_good(asari[0])

True

In [22]:
# Keep only the empirical compounds that pass is_good, for each tool.
asari_good = list(filter(is_good, asari))
xcms_good = list(filter(is_good, xcms))
len(asari_good), len(xcms_good)

(243, 197)

In [14]:
xcms_good[10]

{'interim_id': 'kp12_77.0989',
 'neutral_formula_mass': 77.09886421823,
 'neutral_formula': None,
 'Database_referred': [],
 'identity': [],
 'MS1_pseudo_Spectra': [{'id': 'FT0142',
   'mz': 82.11957365,
   'rtime': 27.05752945,
   'intensities': [60246.289, 63496.57029, 46331.53318],
   'representative_intensity': 56691.46415666666,
   'parent_masstrack_id': '82.11957365',
   'isotope': '13C/12C*4',
   'modification': 'M+H+',
   'ion_relation': '13C/12C*4,M+H+'},
  {'id': 'FT0101',
   'mz': 78.10612772,
   'rtime': 27.13708896,
   'intensities': [199027.7776, 209735.1529, 175678.5478],
   'representative_intensity': 194813.82610000003,
   'parent_masstrack_id': '78.10612772',
   'isotope': 'M0',
   'modification': 'M+H+',
   'ion_relation': 'M0,M+H+'}],
 'MS2_Spectra': []}

In [24]:
# Compare the two tools by neutral_formula_mass, rounded to 2 decimals and
# turned into strings so that equal rounded masses compare equal.

nms_asari = [str(round(epd['neutral_formula_mass'], 2)) for epd in asari_good]
nms_xcms = [str(round(epd['neutral_formula_mass'], 2)) for epd in xcms_good]

unique_asari, unique_xcms = set(nms_asari), set(nms_xcms)
# rounded masses seen by both tools
op = unique_asari & unique_xcms

print(len(unique_asari), len(unique_xcms), len(op))

196 180 159


In [31]:
# Collect every peak annotated as M+H+ from the good empirical compounds.
peaks_asari = [
    peak
    for epd in asari_good
    for peak in epd['MS1_pseudo_Spectra']
    if peak.get('modification') == 'M+H+'
]

peaks_xcms = [
    peak
    for epd in xcms_good
    for peak in epd['MS1_pseudo_Spectra']
    if peak.get('modification') == 'M+H+'
]

len(peaks_asari), len(peaks_xcms)



(591, 480)

In [28]:
!pip install --upgrade -q asari-metabolomics
from asari.tools import match_features as mf

In [32]:
# Pair the asari M+H+ peaks with the XCMS M+H+ peaks (mz_ppm=5, rt_tolerance=6).
# The call prints its own matching report. valid_matches holds (asari id, XCMS id)
# pairs; dict1/dict2 are presumably the per-direction id lookups -- confirm in asari.
valid_matches, dict1, dict2 = mf.bidirectional_best_match(peaks_asari, peaks_xcms, mz_ppm=5, rt_tolerance=6)


    ~~~ By best rtime matches ~~~     

Of 591 list1 features, number of uni-direction matched features is 466.
Of 480 list1 features, number of uni-direction matched features is 429.
~~~ Biodirectional, unique Number of matched feature pairs: ~~~
 429


########################################################################
    ~~~ By best m/z matches ~~~     

Of 591 list1 features, number of uni-direction matched features is 466.
Of 480 list1 features, number of uni-direction matched features is 429.
~~~ Biodirectional, unique Number of matched feature pairs: ~~~
     428
########################################################################




In [38]:
# Spot check: dict2 is keyed by XCMS feature id and returns the matched asari feature id.
dict2['FT0040']

'F115'

In [34]:
# Peaks matched between the two tools are treated as true features.
print(len(valid_matches))
valid_matches[0] # one (asari id, XCMS id) pair, considered as true

428


('F37', 'FT0577')

In [39]:
# create merged "EXTRA true feature list": M+H+ peaks that were not paired
# between the two tools, asari peaks first and XCMS peaks after.
have1, have2 = [x[0] for x in valid_matches], [x[1] for x in valid_matches]

# Sets give constant-time membership tests; scanning the lists for every peak
# was quadratic for no benefit. have1/have2 stay lists for any later use.
matched_asari_ids, matched_xcms_ids = set(have1), set(have2)

extra = [p for p in peaks_asari if p['id'] not in matched_asari_ids]
extra += [p for p in peaks_xcms if p['id'] not in matched_xcms_ids]

print(len(extra), extra[1])

215 {'id': 'F66', 'mz': 135.0667, 'rtime': 25.36, 'intensities': [22.62, 28.07, 273.0], 'representative_intensity': 107.89666666666666, 'parent_masstrack_id': '135.0667', 'isotope': '13C/12C*5', 'modification': 'M+H+', 'ion_relation': '13C/12C*5,M+H+'}


**True list = 643, i.e. (428 + 215)**

In [40]:
# Read the full feature table of each tool with the same column layout options.
table_options = dict(start_row=1, mz_col=1, rt_col=2, sep='\t')

asari_ = mf.get_featureList('SZ22_asari_full_Feature_table.tsv', **table_options)
print(len(asari_), '\n', asari_[3])

xcms_ = mf.get_featureList('SZ22_XCMS_featureTable.txt', **table_options)
print(len(xcms_), '\n', xcms_[3])

1399 
 {'id': 'row5', 'mz': 132.0752, 'rtime': 68.4}
1525 
 {'id': 'row5', 'mz': 69.04478012, 'rtime': 91.34202102}


In [44]:
def compare(list1, list2, mz_ppm=5, rt_tolerance=10):
    '''Print match reports for list1 vs list2, then the unmatched features of list1.

    list1, list2: lists of feature dicts; entries of list1 must carry
        'id', 'mz' and 'rtime'.
    mz_ppm, rt_tolerance: tolerances forwarded unchanged to both asari
        matching functions. The defaults are the values that used to be
        hard-coded here.

    Nothing is returned; all results are printed.
    '''
    print("\n  Best match comparisons:")
    # Called only for the report it prints; its return values are not used.
    mf.bidirectional_best_match(list1, list2, mz_ppm=mz_ppm, rt_tolerance=rt_tolerance)

    print("\n  List based inclusive comparisons:")
    # The first returned mapping is used as "ids of list1 that found a match"
    # (presumably keyed by list1 feature id -- confirm against asari).
    matched1, _ = mf.bidirectional_match(list1, list2, mz_ppm=mz_ppm, rt_tolerance=rt_tolerance)

    unmatched = [(p['mz'], p['rtime']) for p in list1 if p['id'] not in matched1]
    print("\n\nUnmatched features ****** ", len(unmatched), "*******\n")
    for x in unmatched:
        print(x)


In [45]:
compare(extra, asari_)


  Best match comparisons:

    ~~~ By best rtime matches ~~~     

Of 215 list1 features, number of uni-direction matched features is 193.
Of 1399 list1 features, number of uni-direction matched features is 245.
~~~ Biodirectional, unique Number of matched feature pairs: ~~~
 193


########################################################################
    ~~~ By best m/z matches ~~~     

Of 215 list1 features, number of uni-direction matched features is 193.
Of 1399 list1 features, number of uni-direction matched features is 245.
~~~ Biodirectional, unique Number of matched feature pairs: ~~~
     177
########################################################################



  List based inclusive comparisons:
Of 215 list1 features, number of uni-direction matched features is 193.
Of 1399 list1 features, number of uni-direction matched features is 245.
    ~~~ match_numbers ~~~     

Unique Number of matched features in table 1:  121
Unique Number of matched features in table 2:  

In [48]:
# Relative m/z difference between two feature m/z values (multiply by 1e6 for ppm).
# This pair differs by about -5.45 ppm, i.e. just beyond a 5 ppm tolerance.
# Presumably one of the features left unmatched above -- confirm which.
(140.994931-140.9957)/140.9957

-5.4540670388610286e-06

In [50]:
# Relative m/z difference between two feature m/z values (multiply by 1e6 for ppm).
# This pair differs by about 10.6 ppm, well beyond a 5 ppm tolerance.
# Presumably one of the features left unmatched above -- confirm which.
(316.1170551-316.1137)/316.1137

1.0613586187569832e-05

In [46]:
compare(extra, xcms_)


  Best match comparisons:

    ~~~ By best rtime matches ~~~     

Of 215 list1 features, number of uni-direction matched features is 153.
Of 1525 list1 features, number of uni-direction matched features is 155.
~~~ Biodirectional, unique Number of matched feature pairs: ~~~
 141


########################################################################
    ~~~ By best m/z matches ~~~     

Of 215 list1 features, number of uni-direction matched features is 153.
Of 1525 list1 features, number of uni-direction matched features is 155.
~~~ Biodirectional, unique Number of matched feature pairs: ~~~
     140
########################################################################



  List based inclusive comparisons:
Of 215 list1 features, number of uni-direction matched features is 153.
Of 1525 list1 features, number of uni-direction matched features is 155.
    ~~~ match_numbers ~~~     

Unique Number of matched features in table 1:  135
Unique Number of matched features in table 2:  

In [47]:
len(asari_), len(xcms_)

(1399, 1525)

## Conclusion

Using similar parameters (peak height > 1E5), asari extracted 1399 features, and XCMS 1525.

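Empirical compounds with an M0 peak plus a higher 13C isotopologue (`asari_good`, `xcms_good`):
243 from asari and 197 from XCMS.

The M+H+ features among them number 591 (asari) and 480 (XCMS).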

Combined, we have established a "True list" of 643 features.

asari missed 22, XCMS missed 62 of these 643 features.

Among the 22 missed by asari, two features were outside the 5 ppm m/z window;
two failed due to gaps or [TODO: this sentence is incomplete in the original notes].
The remaining 18 features were deemed to be of inadequate quality by asari.
