# Summary and Pruning annotations

In [1]:
import pandas as pd 
import json

In [2]:
# Read the pooled regDDA chainable-annotation JSON into a dict.
with open("./MS_MS_signatureSearch/data/output_pooled_regDDA_in-silico/chainable_annotation_pooled_regDDA.json", "r") as annot_file:
    annot_dict = json.load(annot_file)

In [3]:
# Number of entries in the loaded annotation dictionary.
len(annot_dict)

1699

In [4]:
# Keys of every entry whose annotation evidence includes "in-silico".
comp_feats = [
    feat_id
    for feat_id, annot in annot_dict.items()
    if "in-silico" in annot['annotation_evidence'].keys()
]

In [5]:
# Keys of every entry whose annotation evidence includes "LMSD".
lipidmaps_feats = [
    feat_id
    for feat_id, annot in annot_dict.items()
    if "LMSD" in annot['annotation_evidence'].keys()
]

In [6]:
# Keys of every entry whose annotation evidence includes "isotopes".
iso_feats = [
    feat_id
    for feat_id, annot in annot_dict.items()
    if "isotopes" in annot['annotation_evidence'].keys()
]

In [7]:
# Keys of every entry that has "MS2" evidence AND whose first MS2 element is
# non-empty (an "MS2" key alone is not enough to count as a match).
ms2_feats = [
    feat_id
    for feat_id, annot in annot_dict.items()
    if "MS2" in annot['annotation_evidence'].keys()
    and len(annot['annotation_evidence']['MS2'][0]) > 0
]

In [8]:
# Count of entries with "in-silico" evidence.
len(comp_feats)

1699

In [9]:
# Count of entries with "LMSD" evidence.
len(lipidmaps_feats)

277

In [10]:
# Count of entries with "isotopes" evidence.
len(iso_feats)

332

In [11]:
# Count of entries with a non-empty first MS2 evidence element.
len(ms2_feats)

8

In [12]:
# Relatively few features have both isotopologue and LIPID MAPS (LMSD) evidence.
len(set(iso_feats) & set(lipidmaps_feats))

33

In [13]:
# Every LMSD-annotated feature is also present among the in-silico features.
len(set(comp_feats) & set(lipidmaps_feats))

277

In [14]:
# Overlap between features with isotope evidence and features with in-silico evidence.
len(set(iso_feats) & set(comp_feats))

332

# Get the final results into the output folder

In [15]:
import os

# Create the output folder (and any missing parent folders). `exist_ok=True`
# makes re-running this cell a no-op, while real failures (e.g. permissions)
# are raised here instead of being silently swallowed and resurfacing later
# as a confusing "No such file or directory" when the results are written.
os.makedirs("../../output/annotations_MS1_MS2", exist_ok=True)

In [16]:
# Persist the complete annotation dictionary as pretty-printed JSON.
with open("../../output/annotations_MS1_MS2/chainable_annotation_final.json", "w") as out_file:
    json.dump(annot_dict, out_file, indent=2)

### MS2 annotations involve multiple levels of annotation; thus, create a JSON file containing only the features with MS2 annotations

In [17]:
# Restrict the annotation dictionary to the MS2-matched features and save it.
annot_dict_ms2 = {
    feat_id: annot
    for feat_id, annot in annot_dict.items()
    if feat_id in ms2_feats
}
with open("../../output/annotations_MS1_MS2/chainable_annotation_withMS2matched.json", "w") as out_file:
    json.dump(annot_dict_ms2, out_file, indent=2)

In [32]:
import copy

def construct_simplified_table(annot_dict,
                               feats2keep,
                               columns2exclude = ['formula_dict','hits_isotopologue_chain']):
    """Build a flat, CSV-friendly table for a subset of annotated features.

    For every key of ``annot_dict`` that is also in ``feats2keep``, the entry is
    copied, its ``'annotation_evidence'`` mapping is collapsed to a
    ``"|"``-joined string of its keys, and its ``'isomers'`` sequence is
    collapsed to a ``"|"``-joined string. The result has one row per kept
    feature (in ``annot_dict`` order) and one column per entry field, minus
    ``columns2exclude``.

    Parameters
    ----------
    annot_dict : dict
        Mapping of feature id -> annotation record (a dict). Not modified.
    feats2keep : iterable
        Feature ids to include (list, set, ...). Ids absent from
        ``annot_dict`` are ignored.
    columns2exclude : label or list of labels
        Columns dropped from the final table; passed unchanged to
        ``DataFrame.drop``.

    Returns
    -------
    pandas.DataFrame
        Table indexed by feature id.

    Raises
    ------
    KeyError
        If a kept entry lacks ``'annotation_evidence'`` or ``'isomers'``, or if
        a column in ``columns2exclude`` is not present in the table.
    """
    # Set membership is O(1); the original tested against a list per entry.
    wanted = set(feats2keep)

    simplified = {}
    for feat_id, record in annot_dict.items():
        if feat_id not in wanted:
            # Filter BEFORE copying/transforming: unrequested entries are never
            # touched, so a malformed record outside the selection cannot
            # break the call, and only the kept records are deep-copied.
            continue
        row = copy.deepcopy(record)  # keep the caller's dict untouched
        row['annotation_evidence'] = "|".join(row['annotation_evidence'].keys())
        row['isomers'] = "|".join(row['isomers'])
        simplified[feat_id] = row

    # Dict-of-dicts gives features as columns; transpose to one row per feature.
    new_df = pd.DataFrame(simplified).transpose()
    new_df = new_df.drop(columns2exclude, axis = 1)
    return new_df

In [33]:
# The MS2-matched features are exported in full (no simplification) as one table.
ms2_df = pd.DataFrame(
    {feat_id: annot for feat_id, annot in annot_dict.items() if feat_id in ms2_feats}
).transpose()
ms2_df.to_csv("../../output/annotations_MS1_MS2/features_wtMS2_annotations.csv")

# Simplified tables, one per evidence type.
comp_sim_df = construct_simplified_table(annot_dict, comp_feats)
comp_sim_df.to_csv("../../output/annotations_MS1_MS2/features_wtInSilicoPS_predicted_annot_sim.csv")

ms2_sim_df = construct_simplified_table(annot_dict, ms2_feats)
ms2_sim_df.to_csv("../../output/annotations_MS1_MS2/features_wtMS2_annot_sim.csv")

iso_sim_df = construct_simplified_table(annot_dict, iso_feats)
iso_sim_df.to_csv("../../output/annotations_MS1_MS2/features_wtC13isotope_annot_sim.csv")

lmsd_sim_df = construct_simplified_table(annot_dict, lipidmaps_feats)
lmsd_sim_df.to_csv("../../output/annotations_MS1_MS2/features_wtLMSD_annot_sim.csv")

# Features that have both isotope and LMSD evidence.
lmsd_wt_iso_df = construct_simplified_table(
    annot_dict,
    set(iso_feats).intersection(set(lipidmaps_feats)),
)
lmsd_wt_iso_df.to_csv("../../output/annotations_MS1_MS2/features_wtLMSD_C13isotope_annot_sim.csv")

-----

-----

-----

# Annotate PS into the feature table
- Unfortunately, in R there is no easy way to load these tables
- I will directly merge the annotation tables with the feature table
- Let's just go with `comp_sim_df`, which is the most inclusive table (all 1699 annotated features carry in-silico evidence)

In [37]:
# Load the one-way ANOVA full report; the first CSV column becomes the index
# (presumably the feature ids used as keys in the annotation tables — confirm).
df = pd.read_csv(
    "../../output/hMinImp_TICnorm_groupFil0.3_RPneg_1wayANOVA/one_wayANOVA_hMinImp_TICnorm_groupFil0.3_RPneg_fullreport.csv",
    index_col=0,
)

In [42]:
# (rows, columns) of the loaded ANOVA report.
df.shape

(8862, 36)

In [43]:
# Inner join on the index: keep only rows present in both the annotation table
# and the ANOVA report.
PS_select_df = comp_sim_df.merge(
    df,
    left_index=True,
    right_index=True,
    how="inner",
)

In [44]:
# Right join on the index: keep every row of the ANOVA report and attach the
# annotation columns where a matching index exists.
PS_label_df = comp_sim_df.merge(
    df,
    left_index=True,
    right_index=True,
    how="right",
)

In [45]:
# Write both merged reports (annotated-only and fully labelled) to CSV.
for report_df, report_path in (
    (PS_select_df, "../../output/annotations_MS1_MS2/PS_selected_full_report.csv"),
    (PS_label_df, "../../output/annotations_MS1_MS2/PS_labeled_full_report.csv"),
):
    report_df.to_csv(report_path)