# MS1 annotations
- The script for in-silico prediction & identifying isotopologues was kindly provided by Joshua Mitchell
- 03272023

## Run the search

In [126]:
# Run the PS lipid generator/search script; its two arguments are the asari
# full feature table and the PS target list (paths relative to this notebook).
!python3  PS_Lipid_Generator_and_Search.py ../../../asari-output-RPneg/output_asari_project_32322133_wtStemCell/export/full_Feature_table.tsv \
../../../target_list/PS-target-list-singular-M-H_corr.csv

##### Summary: This generates JSON output containing the in-silico prediction results and the isotopologue search results

## Incorporate In-silico prediction & isotopologue search

In [127]:
!pip install -q --upgrade mass2chem khipu-metabolomics jms-metabolite-services

In [128]:
import json
import numpy as np
import pandas as pd

from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

In [129]:
## missing the rtime range parameters
# Read the tab-separated asari full feature table; the first column is used
# as the row index.
df_feat = pd.read_csv(
    "../../../asari-output-RPneg/output_asari_project_32322133_wtStemCell/export/full_Feature_table.tsv",
    sep='\t',
    index_col=0,
)

In [130]:
# Build one annotation record per feature, keyed by the feature-table index.
# Each record copies the selected peak columns and starts with an empty
# 'annotation_evidence' dict that later cells fill in.
_copied_fields = (
    'mz',
    'rtime',
    'rtime_left_base',
    'rtime_right_base',
    'cSelectivity',
    'goodness_fitting',
    'snr',
)

annot_dict = {}
for ftID, row in df_feat.iterrows():
    record = {field: row[field] for field in _copied_fields}
    record['annotation_evidence'] = {}
    annot_dict[ftID] = record

In [131]:
# Load the search results written as JSON into PS2feature.
with open("./theoretical_PS_search_results_MG_3_27_2023_v2.json") as results_file:
    PS2feature = json.load(results_file)

In [132]:
# Candidates with at least one m/z-only hit.
new_feat_list = [
    candidate for candidate in PS2feature
    if len(candidate['mz_only_hits']) > 0
]
# Of those, the candidates that also carry an isotopologue chain.
iso_feat_list = [
    candidate for candidate in new_feat_list
    if 'hits_isotopologue_chain' in candidate
]

In [133]:
# Inspect the first candidate that has at least one m/z-only hit.
new_feat_list[0]

{'name': 'PS 11:6',
 'neutral_mass': 429.08248284315,
 'formula': 'C17H20NO10P',
 'formula_dict': {'C': 17, 'H': 20, 'O': 10, 'N': 1, 'P': 1},
 'isomers': [],
 '[M-H+e]': 428.07520639082907,
 'mz_only_hits': ['F1376']}

In [134]:
# Invert the search results: map each hit feature ID to the candidate's fields
# (everything except the 'mz_only_hits' list) and tag it with in-silico evidence.
# A feature ID hit by several candidates keeps only the last candidate written.
feat_wt_matches_dict = {}
for candidate in new_feat_list:
    shared_fields = {
        key: value for key, value in candidate.items()
        if 'mz_only_hits' not in key
    }
    for featID in candidate['mz_only_hits']:
        # Fresh shallow copy per feature so each gets its own evidence dict.
        match_record = dict(shared_fields)
        match_record['annotation_evidence'] = {'in-silico': "1"}
        feat_wt_matches_dict[featID] = match_record

In [135]:
# Inspect the second (feature ID, match record) pair.
list(feat_wt_matches_dict.items())[1]

('F3794',
 {'name': 'PS 12:3',
  'neutral_mass': 449.14508310099,
  'formula': 'C18H28NO10P',
  'formula_dict': {'C': 18, 'H': 28, 'O': 10, 'N': 1, 'P': 1},
  'isomers': [],
  '[M-H+e]': 448.1378066486691,
  'annotation_evidence': {'in-silico': '1'}})

In [136]:
# Number of feature IDs with an in-silico match.
print(len(feat_wt_matches_dict))

1699


In [137]:
# For every feature ID that keys an isotopologue chain, store the chain size
# minus one under annotation_evidence['isotopes'].
# NOTE(review): the "-1" presumably excludes the chain's '0' entry (the feature
# itself) — confirm against the search script's output format.
feat_wt_isotopes_found = []  # initialised here; this cell never appends to it
for candidate in iso_feat_list:
    for featID, chain in candidate['hits_isotopologue_chain'].items():
        evidence = feat_wt_matches_dict[featID]['annotation_evidence']
        evidence['isotopes'] = len(chain) - 1

In [138]:
# Merge each in-silico match record into the annotation record of the same
# feature ID; features without a match are left unchanged.
for feature_id, match_record in feat_wt_matches_dict.items():
    if feature_id in annot_dict:
        annot_dict[feature_id].update(match_record)

## Incorporate LIPID MAPS experimental database

In [139]:
# asari feature annotation table (tab-separated).
annot_df = pd.read_csv(
    "../../../asari-output-RPneg/output_asari_project_32322133_wtStemCell/Feature_annotation.tsv",
    sep='\t',
)
# PS target list (Excel workbook).
target_df = pd.read_excel(
    "../../../target_list/PS_target_list_to_Minghao_corr.xlsx"
)

In [140]:
# Index the annotation table by peak ID (keeping the column itself).
annot_df = annot_df.set_index('[peak]id_number', drop=False)

# Keep only the first row for each peak ID.
annot_df = annot_df[~annot_df.index.duplicated(keep='first')]

# Build a list of {'id', 'mz', 'rt'} records, one per remaining row.
features = [
    {'id': peak_id, 'mz': peak_mz, 'rt': peak_rt}
    for peak_id, peak_mz, peak_rt in zip(
        annot_df['[peak]id_number'], annot_df['mz'], annot_df['rtime']
    )
]

In [141]:
# Preview the first two feature records (id, mz, rt).
features[0:2]

[{'id': 'F1590', 'mz': 371.02978515625, 'rt': 21.611541534},
 {'id': 'F4755', 'mz': 486.027961730957, 'rt': 19.48260084799998}]

In [142]:
# Reduce the target list to the four identifying columns and drop repeated rows.
_target_columns = ['Species Shorthand', 'Formula', 'Neutral_Mass', '[M-H]-']
sim_targeted_df = target_df[_target_columns].drop_duplicates()

In [143]:
# Number of target rows left after drop_duplicates().
sim_targeted_df.shape[0]

260

In [144]:
# Number of distinct [M-H]- values among the targets; a count lower than the
# row count means some target rows share the same m/z.
len(set(sim_targeted_df['[M-H]-']))

252

In [145]:
# Index the feature records by m/z for the matching step.
mztree = build_centurion_tree(features)
# Label target rows by species shorthand while keeping the column itself.
sim_targeted_df = sim_targeted_df.set_index('Species Shorthand', drop=False)

In [146]:
# Convert the target table to {index label: {column name: value}}.
target_dict = sim_targeted_df.to_dict(orient="index")

In [147]:
# Inspect the first (index label, row fields) pair of the target dictionary.
list(target_dict.items())[0]

('PS 25:0',
 {'Species Shorthand': 'PS 25:0',
  'Formula': 'C31H60NO10P',
  'Neutral_Mass': 637.395487,
  '[M-H]-': 636.38821053323})

In [148]:
# Look up every target's [M-H]- value in the feature m/z tree and keep the
# targets with at least one hit. The hit list and the hit feature IDs are
# stored on the target entry itself (target_dict entries are mutated in place).
# NOTE(review): the third argument (5) is presumably the ppm tolerance — confirm
# against the mass2chem signature.
dict_PS = {}
for species, target in target_dict.items():
    hits = find_all_matches_centurion_indexed_list(target['[M-H]-'], mztree, 5)
    if not hits:
        continue
    target['matched'] = hits
    target['id_numbers'] = [hit['id'] for hit in hits]
    dict_PS[species] = target

print(len(dict_PS))

113


In [149]:
# benchmarked PS species
# benchmarked PS species
# Note: this is a substring test, so any key containing 'PS 40:6' is listed.
[{k:v} for k,v in dict_PS.items() if 'PS 40:6' in k]

[{'PS 40:6': {'Species Shorthand': 'PS 40:6',
   'Formula': 'C46H78NO10P',
   'Neutral_Mass': 835.536337,
   '[M-H]-': 834.52906053323,
   'matched': [{'id': 'F15159',
     'mz': 834.5291442871094,
     'rt': 308.1190798069998},
    {'id': 'F15158', 'mz': 834.5291442871094, 'rt': 301.513494207},
    {'id': 'F15157', 'mz': 834.5291442871094, 'rt': 262.332738222},
    {'id': 'F15160', 'mz': 834.5291442871094, 'rt': 341.13437214299995},
    {'id': 'F15161', 'mz': 834.5291442871094, 'rt': 342.167908254}],
   'id_numbers': ['F15159', 'F15158', 'F15157', 'F15160', 'F15161']}}]

In [150]:
# Re-key the database matches by feature ID: each matched feature maps to the
# formula, neutral mass and shorthand name of the target species it matched.
# The unused `ion_relation` lookup from the earlier version was dropped: its
# value was never stored or read.
# NOTE: a feature matched by more than one target species keeps only the
# species written last (later assignments overwrite earlier ones).
reshuffled_dict = {}

for species_record in dict_PS.values():
    for hit in species_record['matched']:
        reshuffled_dict[hit['id']] = {
            'FTID': hit['id'],
            'formula': species_record['Formula'],
            'neutral_mass': species_record['Neutral_Mass'],
            'short_name': species_record['Species Shorthand']
        }

In [151]:
# One row per matched feature ID (the dict keys become columns, then are
# flipped into rows).
lipid_db_matched_df = pd.DataFrame(reshuffled_dict).T

In [171]:
# (rows, columns): one row per matched feature ID; the columns are FTID,
# formula, neutral_mass and short_name.
lipid_db_matched_df.shape

(277, 4)

In [152]:
# Inspect one merged annotation record, picked by its position in annot_dict.
list(annot_dict.items())[10003]

('F10004',
 {'mz': 788.5237,
  'rtime': 300.27,
  'rtime_left_base': 297.68,
  'rtime_right_base': 302.55,
  'cSelectivity': 1.0,
  'goodness_fitting': 0.96,
  'snr': 1060.0,
  'annotation_evidence': {'in-silico': '1', 'isotopes': 2},
  'name': 'LPS O-39:8',
  'neutral_mass': 789.53085540889,
  'formula': 'C45H76NO8P',
  'formula_dict': {'C': 45, 'H': 76, 'O': 8, 'N': 1, 'P': 1},
  'isomers': [],
  '[M-H+e]': 788.523578956569,
  'hits_isotopologue_chain': {'F10004': {'0': ['F10004'],
    '1': ['F10115'],
    '2': ['F10183']}}})

In [153]:
# Overlay the database match onto each matched feature's annotation record:
# name, neutral mass and formula are replaced (other fields are left as they
# were) and an 'LMSD' flag is added to the evidence.
for ftID, matched_row in lipid_db_matched_df.iterrows():
    feature_record = annot_dict[ftID]
    feature_record.update({
        'name': matched_row['short_name'],
        'neutral_mass': matched_row['neutral_mass'],
        'formula': matched_row['formula'],
    })
    feature_record['annotation_evidence']['LMSD'] = "1"

In [154]:
# Keep only the features that carry at least one kind of annotation evidence.
PSonly_annot_dict = {
    feature_id: record
    for feature_id, record in annot_dict.items()
    if len(record['annotation_evidence']) > 0
}

In [155]:
# Number of features with any annotation evidence.
len(PSonly_annot_dict)

1699

In [156]:
# check for 3-level annotations
len([k for k,v in PSonly_annot_dict.items() if len(v['annotation_evidence'].keys()) == 3 ])

33

In [157]:
# Write the annotated features to disk as indented JSON.
with open("./chainable_annotations_v2.json", "w") as out_file:
    json.dump(PSonly_annot_dict, out_file, indent=2)

-----

-----

-----

# Check level 1 annotation statistics

In [186]:
# Feature IDs whose evidence includes an in-silico match.
comp_feats = [
    feature_id
    for feature_id, record in PSonly_annot_dict.items()
    if "in-silico" in record['annotation_evidence']
]

In [187]:
# Feature IDs whose evidence includes the 'LMSD' flag.
lipidmaps_feats = [
    feature_id
    for feature_id, record in PSonly_annot_dict.items()
    if "LMSD" in record['annotation_evidence']
]

In [188]:
# Feature IDs whose evidence includes an isotopologue count.
iso_feats = [
    feature_id
    for feature_id, record in PSonly_annot_dict.items()
    if "isotopes" in record['annotation_evidence']
]

In [189]:
# Number of features with in-silico evidence.
len(comp_feats)

1699

In [190]:
# Number of features with the 'LMSD' evidence flag.
len(lipidmaps_feats)

277

In [191]:
# Number of features with isotopologue evidence.
len(iso_feats)

332

In [192]:
# Features that have both isotopologue and LMSD evidence.
len(set(iso_feats) & set(lipidmaps_feats))

33

In [193]:
# Features that have both in-silico and LMSD evidence.
len(set(comp_feats) & set(lipidmaps_feats))

277

In [194]:
# Features that have both isotopologue and in-silico evidence.
len(set(iso_feats) & set(comp_feats))

332