# Do targeted search in asari output
- This example performs a targeted search in the asari output feature annotation table.
- It searches for PS species by matching their [M-H]- m/z values against the asari features.

In [3]:
# !pip install -q --upgrade mass2chem khipu-metabolomics jms-metabolite-services

In [4]:
import json
import numpy as np
import pandas as pd

from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

In [5]:
import os

# Directory that receives the CSV written at the end of this notebook.
output_dir = '../../output/asari_wt_stem_cell/' # here I just used the original asari output

# makedirs(exist_ok=True) creates any missing parent directories and is a
# no-op when the directory already exists; genuine failures (e.g. permission
# errors) are raised here instead of being silently swallowed.
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Input files: the asari feature annotation table (tab-separated) and the PS target list (Excel).
annot_path = "../../../raw_mzML/RPneg_mzML_wt_KlofCells/output_asari_project_32322133/Feature_annotation.tsv"
target_path = "../../../target_list/PS_target_list_to_Minghao_corr.xlsx"

annot_df = pd.read_csv(annot_path, sep='\t')
target_df = pd.read_excel(target_path)

In [7]:
# Preview the first five rows of the PS target list
target_df.head(n=5)

Unnamed: 0,Common Name,Systematic Name,Species Shorthand,Neutral_Mass,proton,[M-H]-,Formula
0,PS(12:0/13:0),1-dodecanoyl-2-tridecanoyl-sn-glycero-3-phosph...,PS 25:0,637.395487,1.007276,636.388211,C31H60NO10P
1,"PS(17:0/20:4(5Z,8Z,11Z,14Z))","1-heptadecanoyl, 2-(5Z,8Z,11Z,14Z-eicosatetrae...",PS 37:4,797.520687,1.007276,796.513411,C43H76NO10P
2,"PS(21:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","1-heneicosanoyl-2-(4Z,7Z,10Z,13Z,16Z,19Z-docos...",PS 43:6,877.583287,1.007276,876.576011,C49H84NO10P
3,PS(17:0/14:1(9Z)),1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycer...,PS 31:1,719.473737,1.007276,718.466461,C37H70NO10P
4,PS(16:0/18:1(11Z)),1-hexadecanoyl-2-(11Z-octadecenoyl)-sn-glycero...,PS 34:1,761.520687,1.007276,760.513411,C40H76NO10P


In [8]:
# Index the annotation table by feature id; the id column is also kept as a regular column.
annot_df = annot_df.set_index('[peak]id_number', drop=False)

# Keep only the first row for each feature id
annot_df = annot_df.loc[~annot_df.index.duplicated(keep='first')]

# Build one {'id', 'mz', 'rt'} record per feature (a list of dicts, in table order).
# Zipping the three columns avoids the per-row Series construction of iterrows().
features = [
    {'id': feature_id, 'mz': mz, 'rt': rtime}
    for feature_id, mz, rtime in zip(
        annot_df['[peak]id_number'], annot_df['mz'], annot_df['rtime']
    )
]

In [9]:
# Show the first two feature records
features[:2]

[{'id': 'F1590', 'mz': 371.02978515625, 'rt': 21.611541534},
 {'id': 'F4755', 'mz': 486.027961730957, 'rt': 19.48260084799998}]

In [10]:
# Preview the first five rows of the PS target list
target_df.head(n=5)

Unnamed: 0,Common Name,Systematic Name,Species Shorthand,Neutral_Mass,proton,[M-H]-,Formula
0,PS(12:0/13:0),1-dodecanoyl-2-tridecanoyl-sn-glycero-3-phosph...,PS 25:0,637.395487,1.007276,636.388211,C31H60NO10P
1,"PS(17:0/20:4(5Z,8Z,11Z,14Z))","1-heptadecanoyl, 2-(5Z,8Z,11Z,14Z-eicosatetrae...",PS 37:4,797.520687,1.007276,796.513411,C43H76NO10P
2,"PS(21:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z))","1-heneicosanoyl-2-(4Z,7Z,10Z,13Z,16Z,19Z-docos...",PS 43:6,877.583287,1.007276,876.576011,C49H84NO10P
3,PS(17:0/14:1(9Z)),1-heptadecanoyl-2-(9Z-tetradecenoyl)-sn-glycer...,PS 31:1,719.473737,1.007276,718.466461,C37H70NO10P
4,PS(16:0/18:1(11Z)),1-hexadecanoyl-2-(11Z-octadecenoyl)-sn-glycero...,PS 34:1,761.520687,1.007276,760.513411,C40H76NO10P


In [11]:
# Reduce the target list to the columns needed for matching and drop repeated rows
target_columns = ['Species Shorthand', 'Formula', 'Neutral_Mass', '[M-H]-']
sim_targeted_df = target_df[target_columns].drop_duplicates()

In [12]:
# Number of rows left after de-duplication
len(sim_targeted_df)

260

In [13]:
# Number of distinct [M-H]- m/z values in the reduced target list
distinct_mz = set(sim_targeted_df['[M-H]-'])
len(distinct_mz)

252

-----

-----

-----

In [14]:
# Label each target row by its species shorthand (the column itself is kept)
sim_targeted_df = sim_targeted_df.set_index('Species Shorthand', drop=False)

# Build the mass2chem centurion tree from the feature records for m/z lookup
mztree = build_centurion_tree(features)

In [15]:
# Map each species shorthand (the index) to a dict of that row's column values
target_dict = sim_targeted_df.to_dict("index")

In [16]:
# Show the first (species, record) pair of the target dictionary
[*target_dict.items()][0]

('PS 25:0',
 {'Species Shorthand': 'PS 25:0',
  'Formula': 'C31H60NO10P',
  'Neutral_Mass': 637.395487,
  '[M-H]-': 636.38821053323})

In [17]:
# Collect every target species whose [M-H]- m/z has at least one matching feature.
# The third argument (5) is the match tolerance passed to mass2chem
# (presumably ppm — TODO confirm against the mass2chem documentation).
dict_PS = {}
for species, record in target_dict.items():
    hits = find_all_matches_centurion_indexed_list(record['[M-H]-'], mztree, 5)
    if not hits:
        continue
    # The record is annotated in place, so target_dict also sees these keys.
    record['matched'] = hits
    record['id_numbers'] = [hit['id'] for hit in hits]
    dict_PS[species] = record

print(len(dict_PS))

113


In [18]:
# Benchmarked PS species: show every matched target whose shorthand contains 'PS 40:6'
[{name: record} for name, record in dict_PS.items() if 'PS 40:6' in name]

[{'PS 40:6': {'Species Shorthand': 'PS 40:6',
   'Formula': 'C46H78NO10P',
   'Neutral_Mass': 835.536337,
   '[M-H]-': 834.52906053323,
   'matched': [{'id': 'F15159',
     'mz': 834.5291442871094,
     'rt': 308.1190798069998},
    {'id': 'F15158', 'mz': 834.5291442871094, 'rt': 301.513494207},
    {'id': 'F15157', 'mz': 834.5291442871094, 'rt': 262.332738222},
    {'id': 'F15160', 'mz': 834.5291442871094, 'rt': 341.13437214299995},
    {'id': 'F15161', 'mz': 834.5291442871094, 'rt': 342.167908254}],
   'id_numbers': ['F15159', 'F15158', 'F15157', 'F15160', 'F15161']}}]

In [19]:
# Re-key the matches by feature id: one record per matched feature, carrying
# the formula, neutral mass and shorthand of the PS species it matched.
# NOTE(review): a feature matched by more than one PS species is stored only
# once; the species processed last overwrites the earlier ones.
reshuffled_dict = {}

for target in dict_PS.values():
    for item in target['matched']:
        reshuffled_dict[item['id']] = {
            'FTID': item['id'],
            'formula': target['Formula'],
            'neutral_mass': target['Neutral_Mass'],
            'short_name': target['Species Shorthand']
        }

In [20]:
# One row per matched feature id; the inner dict keys become the columns
matched_df = pd.DataFrame(reshuffled_dict).T

In [21]:
# Preview the first five matched features
matched_df.head(n=5)

Unnamed: 0,FTID,formula,neutral_mass,short_name
F10854,F10854,C43H76NO10P,797.520687,PS 37:4
F10852,F10852,C43H76NO10P,797.520687,PS 37:4
F10853,F10853,C43H76NO10P,797.520687,PS 37:4
F10855,F10855,C43H76NO10P,797.520687,PS 37:4
F14786,F14786,C49H84NO10P,877.583287,PS 43:6


In [22]:
# Write the matched-feature table as a comma-separated file (index not written) into output_dir
matched_csv_path = os.path.join(output_dir, "stemCell_features_matched2targetPS_M-Honly.csv")
matched_df.to_csv(matched_csv_path, index=False)

-----

-----

In [37]:
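# Disabled script: merges the matched PS features with the full asari feature table
# and renames the sample columns from file names to sample IDs.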
# ful_feat_df = pd.read_csv("../../../raw_mzML/RPneg_mzML_wt_KlofCells/output_asari_project_32322133/export/full_Feature_table.tsv", 
#                           sep = '\t',
#                           index_col = 0)

# meta_df = pd.read_csv("../../../seq/RPneg_addgrp.csv")

# name2id_dict ={row['File Name']:row['Sample ID'] for i,row in meta_df.iterrows()}

# new_col = [x if x not in name2id_dict else name2id_dict[x] for x in ful_feat_df.columns ]

# ful_feat_df.columns = new_col

# m_df = pd.merge(matched_df,ful_feat_df,left_index=True, right_index=True, how = "left")

# m_df.to_csv("../../output/asari_wt_stem_cell/PS_sel_renamed_stemCell_features_table.csv")