## Incorporate In-silico prediction & isotopologue search

In [1]:
import json
import pandas as pd

In [2]:
# Load the asari full feature table (tab-separated); the first column is used
# as the row index, which later cells iterate as the feature ID.
# NOTE(review): the original note on this cell read "missing the rtime range
# parameters" -- it is not clear from this cell what is missing; confirm.
feature_table_path = "../../../asari-output-RPneg/output_asari_project_31312361/export/full_Feature_table.tsv"
df_feat = pd.read_csv(feature_table_path, sep="\t", index_col=0)

In [3]:
# Seed the annotation table: one entry per feature-table row, keyed by the
# row index, holding the peak descriptors copied from the table plus an
# (initially empty) dict that later cells fill with annotation evidence.
peak_columns = (
    'mz',
    'rtime',
    'rtime_left_base',
    'rtime_right_base',
    'cSelectivity',
    'goodness_fitting',
    'snr',
)

annot_dict = {}
for feature_id, feature_row in df_feat.iterrows():
    entry = {column: feature_row[column] for column in peak_columns}
    entry['annotation_evidence'] = {}
    annot_dict[feature_id] = entry

In [4]:
# Load the theoretical PS search results. Later cells iterate this object as a
# sequence of per-lipid records that carry an 'mz_only_hits' entry and,
# optionally, a 'hits_isotopologue_chain' entry.
# The encoding is given explicitly because open() otherwise falls back to the
# platform locale, which is not guaranteed to be UTF-8.
with open("../../reference/theoretical_PS_search_results_MG_3_24_2023.json", encoding="utf-8") as f:
    PS2feature = json.load(f)

In [5]:
# Keep only the search records that matched at least one feature by m/z.
new_feat_list = [record for record in PS2feature if len(record['mz_only_hits']) > 0]

# Of those, separately collect the records that also carry an isotopologue chain.
iso_feat_list = [record for record in new_feat_list if 'hits_isotopologue_chain' in record]

In [6]:
# Inspect the first record that has at least one m/z-only hit.
new_feat_list[0]

{'name': 'PS 11:6',
 'neutral_mass': 429.08248284315,
 'formula': 'C17H20NO10P',
 'formula_dict': {'C': 17, 'H': 20, 'O': 10, 'N': 1, 'P': 1},
 'isomers': [],
 '[M-H+e]': 428.07520639082907,
 'mz_only_hits': ['F4086']}

In [7]:
# Invert the search results: map each matched feature ID to a copy of the
# record that hit it, tagged with in-silico evidence.
# Any record key containing 'mz_only_hits' is left out of the copy.
# A feature hit by more than one record keeps only the last record seen,
# because each assignment replaces the previous value for that feature ID.
feat_wt_matches_dict = {}
for record in new_feat_list:
    shared_fields = {
        key: value
        for key, value in record.items()
        if 'mz_only_hits' not in key
    }
    for hit_id in record['mz_only_hits']:
        # Fresh outer dict and fresh evidence dict per feature, so evidence
        # added later for one feature does not leak into another.
        feat_wt_matches_dict[hit_id] = {
            **shared_fields,
            'annotation_evidence': {'in-silico': "1"},
        }

In [8]:
# Inspect the second (feature ID, annotation) pair in insertion order.
list(feat_wt_matches_dict.items())[1]

('F1406',
 {'name': 'PS 12:3',
  'neutral_mass': 449.14508310099,
  'formula': 'C18H28NO10P',
  'formula_dict': {'C': 18, 'H': 28, 'O': 10, 'N': 1, 'P': 1},
  'isomers': [],
  '[M-H+e]': 448.1378066486691,
  'annotation_evidence': {'in-silico': '1'}})

In [9]:
# Number of distinct features that received an in-silico match.
print(len(feat_wt_matches_dict))

1144


In [10]:
# Gather the feature IDs listed under key '0' of every isotopologue chain.
# NOTE(review): '0' presumably denotes the first member of the chain --
# confirm against the schema of the search-result file.
feat_wt_isotopes_found = []
for record in iso_feat_list:
    for chain in record['hits_isotopologue_chain'].values():
        feat_wt_isotopes_found += chain['0']

In [11]:
# Add isotope evidence to every matched feature found in an isotopologue
# chain; feature IDs without an in-silico match are printed for inspection.
for iso_id in feat_wt_isotopes_found:
    if iso_id not in feat_wt_matches_dict:
        print(iso_id)
        continue
    feat_wt_matches_dict[iso_id]['annotation_evidence']['isotopes'] = "1"

In [12]:
# Fold the in-silico match fields into the per-feature annotation entries.
# The update also replaces each entry's 'annotation_evidence' with the
# evidence dict carried by the match.
for feature_id, entry in annot_dict.items():
    if feature_id not in feat_wt_matches_dict:
        continue
    entry.update(feat_wt_matches_dict[feature_id])

## Incorporate LIPID MAPS experimental database

In [13]:
# Load the table of features matched to target PS lipids; the first column
# is used as the row index, which a later cell iterates as the feature ID.
lipid_db_matched_path = "../../../RPneg-lipidomics/output/preproc_fultab_lab_targlist_79_rmTn750K/features_matched2targetPS_M-Honly.csv"
lipid_db_matched_df = pd.read_csv(lipid_db_matched_path, index_col=0)

In [14]:
# Spot-check one merged entry (position 10003 in insertion order).
list(annot_dict.items())[10003]

('F10004',
 {'mz': 796.5467,
  'rtime': 373.01,
  'rtime_left_base': 370.53,
  'rtime_right_base': 375.49,
  'cSelectivity': 0.7,
  'goodness_fitting': 0.89,
  'snr': 22.0,
  'annotation_evidence': {'in-silico': '1'},
  'name': 'LPS 38:4',
  'neutral_mass': 797.55707015738,
  'formula': 'C44H80NO9P',
  'formula_dict': {'C': 44, 'H': 80, 'O': 9, 'N': 1, 'P': 1},
  'isomers': ['PS O-38:4', 'LPS O-38:5;O'],
  '[M-H+e]': 796.5497937050591})

In [15]:
# Overlay the database-matched annotation on each listed feature and record
# 'LMSD' evidence for it. A feature ID absent from annot_dict raises KeyError.
# NOTE(review): only name, neutral_mass and formula are overwritten here; any
# 'formula_dict', 'isomers' or '[M-H+e]' set by the in-silico match is left
# as-is -- confirm those still agree with the database annotation.
lmsd_field_sources = (
    ('name', 'short_name'),
    ('neutral_mass', 'neutral_mass'),
    ('formula', 'formula'),
)

for matched_id, matched_row in lipid_db_matched_df.iterrows():
    entry = annot_dict[matched_id]
    for field, source_column in lmsd_field_sources:
        entry[field] = matched_row[source_column]
    entry['annotation_evidence']['LMSD'] = "1"

In [16]:
# Keep only the features that carry at least one kind of annotation evidence.
PSonly_annot_dict = {
    feature_id: entry
    for feature_id, entry in annot_dict.items()
    if len(entry['annotation_evidence']) > 0
}

In [17]:
# Number of features with at least one kind of annotation evidence.
len(PSonly_annot_dict)

1147

In [18]:
# Number of annotated features that carry exactly three evidence keys.
sum(1 for entry in PSonly_annot_dict.values() if len(entry['annotation_evidence']) == 3)

99

In [19]:
# Persist every feature that has at least one kind of annotation evidence.
with open("./chainable_annotations.json", mode="w") as out_handle:
    json.dump(PSonly_annot_dict, fp=out_handle, indent=2)

In [20]:
# High-confidence subset: features that carry exactly three evidence keys and
# whose 'rtime' lies strictly between 200 and 500 (same units as the feature
# table's 'rtime' column).
# Both conditions are plain scalars, so they are combined with logical `and`
# rather than bitwise `&`.
threeLevelConf_dict = {
    feature_id: entry
    for feature_id, entry in PSonly_annot_dict.items()
    if len(entry['annotation_evidence']) == 3 and 200 < entry['rtime'] < 500
}

In [21]:
# Persist the high-confidence subset selected in the previous cell.
with open("./highConf_level4_chainable_annotations.json", mode="w") as out_handle:
    json.dump(threeLevelConf_dict, fp=out_handle, indent=2)