# Do targeted search in asari output
- This is an example to perform targeted search in asari output feature table
- search for PS species

In [4]:
# !pip install -q --upgrade mass2chem khipu-metabolomics jms-metabolite-services

In [5]:
import json
import numpy as np
import pandas as pd

from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

In [6]:
import os
output_dir = '../../input/HILICpos_mzML/output_asari_project_224113827_targeted_search/' # here I just used the original asari output
try:
    os.mkdir(output_dir)
except:
    None

In [7]:
epds = json.load(open('../../input/HILICpos_mzML/output_asari_project_224113827_targeted_search/Annotated_empricalCompounds.json'))
target_df = pd.read_excel("../../input/HILICpos_mzML/PS_target_list_to_Minghao.xlsx")

In [8]:
target_df.head()

Unnamed: 0,Common Name,Systematic Name,Species Shorthand,Formula,H,[M-H],Mass
0,PS(12:0/13:0),1-dodecanoyl-2-tridecanoyl-sn-glycero-3-phosph...,PS 25:0,637.395487,1.007825,636.387662,C31H60NO10P
1,"PS(17:0/20:4(5Z,8Z,11Z,14Z))","1-heptadecanoyl, 2-(5Z,8Z,11Z,14Z-eicosatetrae...",PS 37:4,797.520687,1.007825,796.512862,C43H76NO10P
2,"PS(21:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","1-heneicosanoyl-2-(4Z,7Z,10Z,13Z,16Z,19Z-docos...",PS 43:6,877.583287,1.007825,876.575462,C49H84NO10P
3,PS(17:0/14:1(9Z)),1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycer...,PS 31:1,719.473737,1.007825,718.465912,C37H70NO10P
4,PS(16:0/18:1(11Z)),1-hexadecanoyl-2-(11Z-octadecenoyl)-sn-glycero...,PS 34:1,761.520687,1.007825,760.512862,C40H76NO10P


In [9]:
# Some ids just include indices
[x for x in list(epds.items()) if x[0] == '19115']

[('19115',
  {'interim_id': 19115,
   'neutral_formula_mass': 789.551984778,
   'neutral_formula': 'C42H80NO10P',
   'MS1_pseudo_Spectra': [{'apex': 103,
     'peak_area': 14797245,
     'height': 4209606,
     'left_base': 100,
     'right_base': 108,
     'goodness_fitting': 0.9199423968322258,
     'cSelectivity': 1.0,
     'parent_masstrack_id': 14971,
     'mz': 790.5562896728516,
     'snr': 4209,
     'id_number': 'F18471',
     'rtime': 27.268622999999998,
     'rtime_left_base': 26.4803868,
     'rtime_right_base': 28.5822258,
     'representative_intensity': 14797245,
     'id': 'F18471'}]})]

In [10]:
# Some ids just include indices
[x for x in list(epds.items()) if x[0] == 'kp1253_789.551']

[('kp1253_789.551',
  {'interim_id': 'kp1253_789.551',
   'neutral_formula_mass': 789.5509837399358,
   'neutral_formula': None,
   'Database_referred': [],
   'identity': [],
   'MS1_pseudo_Spectra': [{'apex': 120,
     'peak_area': 11069601,
     'height': 1495130,
     'left_base': 111,
     'right_base': 128,
     'goodness_fitting': 0.5969813037105616,
     'cSelectivity': 1.0,
     'parent_masstrack_id': 14971,
     'mz': 790.5562896728516,
     'snr': 1495,
     'id_number': 'F18472',
     'rtime': 31.7351988,
     'rtime_left_base': 29.370471000000002,
     'rtime_right_base': 33.837181799999996,
     'representative_intensity': 11069601,
     'id': 'F18472',
     'isotope': 'M0',
     'modification': 'M+H+',
     'ion_relation': 'M0,M+H+',
     'parent_epd_id': 'kp1253_789.551'},
    {'apex': 118,
     'peak_area': 9052198,
     'height': 741044,
     'left_base': 107,
     'right_base': 130,
     'goodness_fitting': 0.7809889324932886,
     'cSelectivity': 1.0,
     'parent_m

In [11]:
[x for x in list(epds.items())][0]

('kp1_84.0705',
 {'interim_id': 'kp1_84.0705',
  'neutral_formula_mass': 84.0704689970972,
  'neutral_formula': None,
  'Database_referred': [],
  'identity': [],
  'MS1_pseudo_Spectra': [{'apex': 240,
    'peak_area': 12452356,
    'height': 662709,
    'left_base': 228,
    'right_base': 252,
    'goodness_fitting': 0.9395043495299495,
    'cSelectivity': 1.0,
    'parent_masstrack_id': 45,
    'mz': 87.08445358276367,
    'snr': 662,
    'id_number': 'F120',
    'rtime': 63.264846,
    'rtime_left_base': 60.111869999999996,
    'rtime_right_base': 66.417822,
    'representative_intensity': 12452356,
    'id': 'F120',
    'isotope': '13C/12C*2',
    'modification': 'M+H+',
    'ion_relation': '13C/12C*2,M+H+',
    'parent_epd_id': 'kp1_84.0705'},
   {'apex': 240,
    'peak_area': 693023365,
    'height': 36754666,
    'left_base': 229,
    'right_base': 250,
    'goodness_fitting': 0.5182421267290913,
    'cSelectivity': 0.6875,
    'parent_masstrack_id': 22,
    'mz': 86.08110046386

In [12]:
neutrals = []
for k,v in epds.items():
    p = {}
    p['id'] = k    #  k may differ from v['interim_id']
    if v['neutral_formula_mass']:
        # print(v['interim_id'])
        p['mz'] = v['neutral_formula_mass']
        p['rtime'] = np.mean([x['rtime'] for x in v['MS1_pseudo_Spectra']])
        
        max_area = np.max([x['peak_area'] for x in v['MS1_pseudo_Spectra']])
        for pseudo_spectrum in v['MS1_pseudo_Spectra']:
            if pseudo_spectrum['peak_area'] == max_area:
                p['id_numbers'] = pseudo_spectrum['id_number']
                if 'ion_relation' in pseudo_spectrum.keys():
                    p['ion_relation'] = pseudo_spectrum['ion_relation']
        
        neutrals.append(p)
    
print(neutrals[22:24])

[{'id': 'kp23_115.0633', 'mz': 115.0632522362573, 'rtime': 42.80346457499999, 'id_numbers': 'F291', 'ion_relation': 'M0,M+H+'}, {'id': 'kp24_137.0451', 'mz': 137.04507306237429, 'rtime': 52.980143828571435, 'id_numbers': 'F292', 'ion_relation': 'undetermined'}]


In [13]:
target_df.head()

Unnamed: 0,Common Name,Systematic Name,Species Shorthand,Formula,H,[M-H],Mass
0,PS(12:0/13:0),1-dodecanoyl-2-tridecanoyl-sn-glycero-3-phosph...,PS 25:0,637.395487,1.007825,636.387662,C31H60NO10P
1,"PS(17:0/20:4(5Z,8Z,11Z,14Z))","1-heptadecanoyl, 2-(5Z,8Z,11Z,14Z-eicosatetrae...",PS 37:4,797.520687,1.007825,796.512862,C43H76NO10P
2,"PS(21:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","1-heneicosanoyl-2-(4Z,7Z,10Z,13Z,16Z,19Z-docos...",PS 43:6,877.583287,1.007825,876.575462,C49H84NO10P
3,PS(17:0/14:1(9Z)),1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycer...,PS 31:1,719.473737,1.007825,718.465912,C37H70NO10P
4,PS(16:0/18:1(11Z)),1-hexadecanoyl-2-(11Z-octadecenoyl)-sn-glycero...,PS 34:1,761.520687,1.007825,760.512862,C40H76NO10P


In [14]:
sim_targeted_df = target_df.loc[:,['Species Shorthand','Formula','Mass']].drop_duplicates()
sim_targeted_df.columns = ['Species Shorthand','Mass','Formula']

In [15]:
sim_targeted_df.shape[0]

260

In [16]:
len(set(sim_targeted_df['Mass']))

252

-----

-----

-----

In [17]:
mztree = build_centurion_tree(neutrals)
sim_targeted_df.index = sim_targeted_df['Species Shorthand']

In [18]:
target_dict = sim_targeted_df.to_dict(orient="index")

In [19]:
list(target_dict.items())[0]

('PS 25:0',
 {'Species Shorthand': 'PS 25:0',
  'Mass': 637.395487,
  'Formula': 'C31H60NO10P'})

In [35]:
dict_PS = {}
for k,v in target_dict.items():
    match = find_all_matches_centurion_indexed_list(v['Mass'], mztree, 5)
    if match:
        v['matched'] = match
        v['id_numbers'] = [x['id_numbers'] for x in v['matched']]
        dict_PS.update({k:v})
        
print(len(dict_PS))

29


In [36]:
list(dict_PS.items())[0:2]

[('PS 36:1',
  {'Species Shorthand': 'PS 36:1',
   'Mass': 789.551987,
   'Formula': 'C42H80NO10P',
   'matched': [{'id': 'kp1253_789.551',
     'mz': 789.5509837399358,
     'rtime': 31.297289399999997,
     'id_numbers': 'F18893',
     'ion_relation': 'M0,Na/H'},
    {'id': '19115',
     'mz': 789.551984778,
     'rtime': 27.268622999999998,
     'id_numbers': 'F18471'},
    {'id': '19116',
     'mz': 789.551984778,
     'rtime': 36.727417800000005,
     'id_numbers': 'F18473'}],
   'id_numbers': ['F18893', 'F18471', 'F18473']}),
 ('PS 24:0',
  {'Species Shorthand': 'PS 24:0',
   'Mass': 623.379837,
   'Formula': 'C30H58NO10P',
   'matched': [{'id': '17538',
     'mz': 623.379834,
     'rtime': 25.692121800000002,
     'id_numbers': 'F14501'}],
   'id_numbers': ['F14501']})]

In [31]:
reshuffled_dict = {}

for k, v in dict_PS.items():
    for item in v['matched']:
        ion_relation = item.get('ion_relation') # .get() this can avoid empty of the ion_relation!
        reshuffled_dict[item['id_numbers']] = {
            'FTID': item['id_numbers'],
            'formula': v['Formula'],
            'neutral_mass': v['Mass'],
            'short_name': v['Species Shorthand'],
            'ion_relation': ion_relation,
            'interium_id': item['id']
        }

In [32]:
matched_df = pd.DataFrame(reshuffled_dict).transpose()

In [38]:
matched_df.head()

Unnamed: 0,FTID,formula,neutral_mass,short_name,ion_relation,interium_id
F18893,F18893,C42H80NO10P,789.551987,PS 36:1,"M0,Na/H",kp1253_789.551
F18471,F18471,C42H80NO10P,789.551987,PS 36:1,,19115
F18473,F18473,C42H80NO10P,789.551987,PS 36:1,,19116
F14501,F14501,C30H58NO10P,623.379837,PS 24:0,,17538
F17414,F17414,C46H78NO10P,835.536337,PS 40:6,"M0,M+H+",kp1134_835.5335


In [39]:
matched_df.to_csv(os.path.join(output_dir,"matched2targetPS.csv"),sep = ',',index = False)

-----

-----

-----

-----