# Targeted search in asari output
- This example performs a targeted search in an asari output feature table.
- It searches the features for PS species from a target list.

In [1]:
# !pip install -q --upgrade mass2chem khipu-metabolomics jms-metabolite-services

In [2]:
import json
import numpy as np
import pandas as pd

from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

In [3]:
import os

# Directory that receives the tables written by this notebook.
# (Author's note: here the original asari output was used.)
output_dir = '../../output/preproc_fultab_lab_targlist_79_rmTn750K/'

# makedirs also creates any missing parent directories, and exist_ok=True
# makes re-running the cell a no-op. Any other failure (e.g. permissions)
# is raised here instead of being silently swallowed by a bare `except`
# and only showing up later when the result file is written.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Feature annotation table from the asari run (tab-separated file).
annot_df = pd.read_csv(
    "../../../asari-output-RPneg/output_asari_project_31312361/Feature_annotation.tsv",
    sep="\t",
)

# Target list of PS species, read from an Excel workbook.
target_df = pd.read_excel(
    "../../../target_list/PS_target_list_to_Minghao_corr.xlsx"
)

In [5]:
# Preview the first five rows of the target list.
target_df.head(n=5)

Unnamed: 0,Common Name,Systematic Name,Species Shorthand,Neutral_Mass,proton,[M-H]-,Formula
0,PS(12:0/13:0),1-dodecanoyl-2-tridecanoyl-sn-glycero-3-phosph...,PS 25:0,637.395487,1.007276,636.388211,C31H60NO10P
1,"PS(17:0/20:4(5Z,8Z,11Z,14Z))","1-heptadecanoyl, 2-(5Z,8Z,11Z,14Z-eicosatetrae...",PS 37:4,797.520687,1.007276,796.513411,C43H76NO10P
2,"PS(21:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","1-heneicosanoyl-2-(4Z,7Z,10Z,13Z,16Z,19Z-docos...",PS 43:6,877.583287,1.007276,876.576011,C49H84NO10P
3,PS(17:0/14:1(9Z)),1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycer...,PS 31:1,719.473737,1.007276,718.466461,C37H70NO10P
4,PS(16:0/18:1(11Z)),1-hexadecanoyl-2-(11Z-octadecenoyl)-sn-glycero...,PS 34:1,761.520687,1.007276,760.513411,C40H76NO10P


In [6]:
# Label every row with its feature id, then keep only the first row seen
# for each id.
annot_df.index = annot_df['[peak]id_number']
annot_df = annot_df.loc[~annot_df.index.duplicated(keep='first')]

# Build one {'id', 'mz', 'rt'} record per remaining feature, in row order.
# The result is a list of dicts (not a dict keyed by the index).
# Iterating the three columns directly avoids iterrows(), which creates a
# Series per row and coerces each row to a single dtype.
features = [
    {'id': feature_id, 'mz': mz, 'rt': rt}
    for feature_id, mz, rt in zip(
        annot_df['[peak]id_number'], annot_df['mz'], annot_df['rtime']
    )
]

In [7]:
# Show the first two feature records.
features[:2]

[{'id': 'F22', 'mz': 505.1503296, 'rt': 140.6143289},
 {'id': 'F144', 'mz': 506.151741, 'rt': 140.6143289}]

In [8]:
# First five rows of the target list, shown before its columns are subset.
target_df.head(n=5)

Unnamed: 0,Common Name,Systematic Name,Species Shorthand,Neutral_Mass,proton,[M-H]-,Formula
0,PS(12:0/13:0),1-dodecanoyl-2-tridecanoyl-sn-glycero-3-phosph...,PS 25:0,637.395487,1.007276,636.388211,C31H60NO10P
1,"PS(17:0/20:4(5Z,8Z,11Z,14Z))","1-heptadecanoyl, 2-(5Z,8Z,11Z,14Z-eicosatetrae...",PS 37:4,797.520687,1.007276,796.513411,C43H76NO10P
2,"PS(21:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","1-heneicosanoyl-2-(4Z,7Z,10Z,13Z,16Z,19Z-docos...",PS 43:6,877.583287,1.007276,876.576011,C49H84NO10P
3,PS(17:0/14:1(9Z)),1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycer...,PS 31:1,719.473737,1.007276,718.466461,C37H70NO10P
4,PS(16:0/18:1(11Z)),1-hexadecanoyl-2-(11Z-octadecenoyl)-sn-glycero...,PS 34:1,761.520687,1.007276,760.513411,C40H76NO10P


In [9]:
# Keep only the columns needed for the search and drop exact duplicate rows.
sim_targeted_df = target_df[
    ['Species Shorthand', 'Formula', 'Neutral_Mass', '[M-H]-']
].drop_duplicates()

In [10]:
# Number of rows left after de-duplication.
len(sim_targeted_df.index)

260

In [11]:
# Number of distinct [M-H]- values among the targets.
len({mz for mz in sim_targeted_df['[M-H]-']})

252

-----

-----

-----

In [12]:
# Index the feature records with mass2chem so they can be searched by m/z.
mztree = build_centurion_tree(features)

# Use the species shorthand as the row label while keeping it as a column.
sim_targeted_df = sim_targeted_df.set_index('Species Shorthand', drop=False)

In [13]:
# One entry per row label: {label: {column name: value}}.
target_dict = sim_targeted_df.to_dict("index")

In [14]:
# Show the first (label, record) pair of the target dictionary.
[*target_dict.items()][0]

('PS 25:0',
 {'Species Shorthand': 'PS 25:0',
  'Formula': 'C31H60NO10P',
  'Neutral_Mass': 637.395487,
  '[M-H]-': 636.38821053323})

In [15]:
# Look up every target's [M-H]- m/z in the feature tree and keep only the
# targets that have at least one matching feature.
# NOTE(review): the trailing 5 is presumably the ppm tolerance of the
# mass2chem search; confirm against the library signature.
dict_PS = {}
for species, target in target_dict.items():
    hits = find_all_matches_centurion_indexed_list(target['[M-H]-'], mztree, 5)
    if not hits:
        continue
    # The target record itself is extended in place with its matches.
    target['matched'] = hits
    target['id_numbers'] = [hit['id'] for hit in hits]
    dict_PS[species] = target

print(len(dict_PS))

105


In [16]:
# Benchmarked PS species: show every matched target whose name contains 'PS 40:6'.
[{name: record} for name, record in dict_PS.items() if 'PS 40:6' in name]

[{'PS 40:6': {'Species Shorthand': 'PS 40:6',
   'Formula': 'C46H78NO10P',
   'Neutral_Mass': 835.536337,
   '[M-H]-': 834.52906053323,
   'matched': [{'id': 'F13468', 'mz': 834.5291901, 'rt': 262.3327382},
    {'id': 'F13469', 'mz': 834.5291901, 'rt': 301.5134942},
    {'id': 'F13470', 'mz': 834.5291901, 'rt': 308.1190798}],
   'id_numbers': ['F13468', 'F13469', 'F13470']}}]

In [17]:
# Flatten the per-target matches into one record per matched feature,
# keyed by the feature id.
# NOTE(review): because the feature id is the key, a feature matched by
# more than one target keeps only the last target visited. The target
# table has fewer distinct [M-H]- values than rows, so such collisions can
# occur; confirm whether one row per (feature, target) pair is wanted.
reshuffled_dict = {}

for v in dict_PS.values():
    for item in v['matched']:
        reshuffled_dict[item['id']] = {
            'FTID': item['id'],
            'formula': v['Formula'],
            'neutral_mass': v['Neutral_Mass'],
            'short_name': v['Species Shorthand']
        }

In [18]:
# The dict keys (feature ids) become columns first; .T turns them into rows.
matched_df = pd.DataFrame(reshuffled_dict).T

In [19]:
# Preview the first five matched features.
matched_df.head(n=5)

Unnamed: 0,FTID,formula,neutral_mass,short_name
F9996,F9996,C43H76NO10P,797.520687,PS 37:4
F9994,F9994,C43H76NO10P,797.520687,PS 37:4
F9995,F9995,C43H76NO10P,797.520687,PS 37:4
F9997,F9997,C43H76NO10P,797.520687,PS 37:4
F14243,F14243,C49H84NO10P,877.583287,PS 43:6


In [20]:
# Write the matched-feature table to the output directory as a
# comma-separated file, without the index column.
matched_df.to_csv(
    os.path.join(output_dir, "features_matched2targetPS_M-Honly.csv"),
    sep=',',
    index=False,
)

-----

-----