In [10]:
import json
import intervaltree
import csv
import pandas as pd
import numpy as np

In [11]:
def read_CD(path):
    """Collect annotation records from a CD result table stored as CSV.

    The table is read with ``pd.read_csv`` and must contain the columns
    "Name", "m/z", "RT [min]" and "Annot. Source: mzVault Search".

    Rows are handled in file order:

    * a "Name" of "No results" or "Tags" is ignored;
    * a string "Name" starts a new primary entry, which is recorded with that
      row's own m/z and retention time;
    * a non-string "Name" (e.g. an empty cell, read as NaN) is treated as a
      sub-row: if its mzVault value is a string other than "No results", it
      is recorded under that value together with the m/z and retention time
      of the most recent primary entry (``None`` if there has been none yet).

    Returns a list of dicts with keys "Name", "m/z" and "RT [min]".
    """
    table = pd.read_csv(path)
    rows = zip(
        table["Name"],
        table["m/z"],
        table["RT [min]"],
        table["Annot. Source: mzVault Search"],
    )

    records = []
    parent_mz = parent_rt = None
    for name, mz, rtime, sub_name in rows:
        # Placeholder rows carry no annotation of their own.
        if name == "No results" or name == "Tags":
            continue

        if isinstance(name, str):
            # Primary row: remember its coordinates for the sub-rows that follow.
            parent_mz, parent_rt = mz, rtime
            records.append({"Name": name, "m/z": mz, "RT [min]": rtime})
            continue

        # Sub-row: only usable when the mzVault column holds a real name.
        if not isinstance(sub_name, str) or sub_name == "No results":
            continue
        records.append({"Name": sub_name, "m/z": parent_mz, "RT [min]": parent_rt})
    return records

In [12]:
def read_pcpfm(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes (or fails), instead of being left for the
    garbage collector.

    Raises whatever ``open`` raises for an unreadable path and
    ``json.JSONDecodeError`` for malformed content.
    """
    with open(path) as handle:
        return json.load(handle)

In [13]:
def map_CD_annots(cd_annots, pcpfm_annots, mz_ppm_tol=10, rt_err=30):
    """Attach matching pcpfm feature ids to every CD annotation.

    Parameters
    ----------
    cd_annots : list of dict
        Records with keys "Name", "m/z" and "RT [min]". Each record is
        modified in place: a "mapped_to" set is added to it.
    pcpfm_annots : dict
        Mapping of khipu id -> entry whose "MS1_pseudo_Spectra" list holds
        peaks with "id_number", "mz" and "rtime".
    mz_ppm_tol : number
        Half-width of the m/z window around each peak, in ppm of its m/z.
    rt_err : number
        Half-width of the retention-time window around each peak, in the
        same unit as the peaks' "rtime".

    Returns
    -------
    The same ``cd_annots`` list. For a record with a truthy "Name", its
    "mapped_to" set holds the ids of all peaks belonging to any khipu that
    has at least one peak whose m/z window and retention-time window both
    contain the record. Other records get an empty set.
    """
    mz_index = intervaltree.IntervalTree()
    rt_index = intervaltree.IntervalTree()
    members_by_khipu = {}
    khipu_by_feature = {}

    # Index every MS1 peak by its tolerance windows, and remember which khipu
    # each peak belongs to (and the reverse).
    for khipu_id, khipu in pcpfm_annots.items():
        for feature in khipu["MS1_pseudo_Spectra"]:
            feature_id = feature['id_number']
            khipu_by_feature[feature_id] = khipu_id
            members_by_khipu.setdefault(khipu_id, []).append(feature_id)

            feature_mz = feature['mz']
            feature_rt = feature['rtime']
            half_width = feature_mz / 1e6 * mz_ppm_tol
            mz_index.addi(feature_mz - half_width, feature_mz + half_width, feature_id)
            rt_index.addi(feature_rt - rt_err, feature_rt + rt_err, feature_id)

    for annot in cd_annots:
        hits = set()
        annot['mapped_to'] = hits
        if not annot['Name']:
            continue

        query_mz = float(annot['m/z'])
        # "RT [min]" is multiplied by 60 before the lookup (minutes -> seconds).
        query_rt = float(annot['RT [min]']) * 60
        mz_hits = {interval.data for interval in mz_index.at(query_mz)}
        rt_hits = {interval.data for interval in rt_index.at(query_rt)}

        # A peak must match on both axes; a hit then pulls in every peak of
        # the khipu it belongs to.
        for feature_id in mz_hits & rt_hits:
            hits.update(members_by_khipu[khipu_by_feature[feature_id]])
    return cd_annots

In [16]:
def eval_overlap(cd_path, pcpfm_path):
    """Print how many (compound, feature) annotations CD and pcpfm share.

    ``cd_path`` is read with ``read_CD`` and ``pcpfm_path`` with
    ``read_pcpfm``; the CD records are then mapped onto pcpfm features with
    ``map_CD_annots`` using a 10 ppm m/z tolerance.

    Each annotation is reduced to the string "<compound>_<feature id>":

    * CD side: the record's "Name" joined with every id in its "mapped_to".
    * pcpfm side: the "reference_id" of every "Level_2" annotation on any of
      a khipu's MS2 spectra, joined with every MS1 peak id of that khipu.

    Prints the size of both sets, the size of their intersection, the two
    exclusive counts, and finally the intersection itself. Returns ``None``.
    """
    cd_records = read_CD(cd_path)
    pcpfm_khipus = read_pcpfm(pcpfm_path)
    cd_records = map_CD_annots(cd_records, pcpfm_khipus, mz_ppm_tol=10)

    cd_pairs = {
        record["Name"] + "_" + feature_id
        for record in cd_records
        for feature_id in record["mapped_to"]
    }

    pcpfm_pairs = set()
    for khipu in pcpfm_khipus.values():
        if "MS2_Spectra" not in khipu:
            continue
        for spectrum in khipu["MS2_Spectra"]:
            scored = [
                (candidate["msms_score"], candidate)
                for candidate in spectrum["annotations"]
                if candidate["annotation_level"] == "Level_2"
            ]
            # Highest score first. NOTE(review): every candidate ends up in a
            # set, so this ordering does not affect the reported counts.
            scored.sort(key=lambda pair: -pair[0])
            for _, candidate in scored:
                for feature in khipu["MS1_pseudo_Spectra"]:
                    pcpfm_pairs.add(candidate['reference_id'] + "_" + feature['id_number'])

    shared = cd_pairs & pcpfm_pairs
    n_shared = len(shared)
    print("CD All:", len(cd_pairs), "PCPFM All:", len(pcpfm_pairs))
    print("Shared Annotations: ", n_shared)
    print("CD Only:", len(cd_pairs) - n_shared)
    print("PCPFM Only:", len(pcpfm_pairs) - n_shared)
    print(shared)

In [17]:
# RP NEG: compare the CD export (./CD_MoNA_RP_neg.csv) against the pcpfm
# empCpds JSON for the RP_neg analysis and print the overlap counts.
# NOTE(review): the pcpfm path is an absolute path under /Users/mitchjo, so
# this cell will not run on another machine without editing it.
eval_overlap("./CD_MoNA_RP_neg.csv", "/Users/mitchjo/Analyses/HZV029_plasma_RP_neg/output/RP_neg_with_auth_stds_MS2_empCpds.json") # 30 sec (presumably the run time)
# Copy the printed counts into a CSV for the Venn diagram.

CD All: 2515 PCPFM All: 3070
Shared Annotations:  1016
CD Only: 1499
PCPFM Only: 2054
{'D-(-)-Quinic acid_F3238_227.0324@28.91', '3-METHYLSALICYLIC ACID_F8909_151.0401@91.93', '3-HYDROXYBUTYRIC ACID_F2775_104.0434@57.01', 'GLUTAMINE_F8031_145.0619@26.59', 'trans-Aconitic acid_F1392_173.0092@36.83', '2-HYDROXYBUTYRIC ACID_F2725_103.0405@73.72', 'L-Tryptophan_F8490_203.0826@80.04', 'Succinic acid; LC-tDDA; CE40_F4001_118.0227@33.15', '3-Hydroxyphenylalanine_F3078_180.0667@54.36', 'Tauroallocholic acid_F24438_514.287@106.74', 'Succinic acid; LC-tDDA; CE10_F3882_117.0193@33.54', 'LPE 18:1_F24032_479.2994@179.26', 'Acesulfame_F10474_161.9868@79.21', 'ALA-GLY_F3406_181.0376@26.21', 'Salicyluric acid; LC-tDDA; CE10_F5589_236.0762@83.36', 'ARABINOSE_F8940_151.0521@28.91', '5-OXO-L-PROLINE_F5287_129.0387@51.3', 'D-Pantothenic acid_F1189_218.1034@76.69', 'cis-Aconitic Acid_F9776_209.9893@28.91', 'RIBOSE_F4421_185.0226@28.13', '(1S,3R,4S,5R)-1,3,4,5-tetrahydroxycyclohexane-1-carboxylic acid_F5918

In [18]:
# HILIC POS: compare the CD export (./CD_MoNA_HILIC_pos.csv) against the pcpfm
# empCpds JSON for the HILIC_pos analysis and print the overlap counts.
# NOTE(review): the pcpfm path is an absolute path under /Users/mitchjo, so
# this cell will not run on another machine without editing it.
eval_overlap("./CD_MoNA_HILIC_pos.csv", "/Users/mitchjo/Analyses/HZV029_plasma_HILIC_pos/output/HILIC_pos_with_auth_stds_MS2_empCpds.json")
# Copy the printed counts for the Venn diagram.

CD All: 4000 PCPFM All: 5437
Shared Annotations:  1800
CD Only: 2200
PCPFM Only: 3637
{'Oleoyl-carnitine; AIF; CE30; MS2Dec_F25640_427.3616@79.61', 'INOSINE_F14569_291.0703@103.6', 'Paracetamol_F3618_153.074@21.16', 'Trazodone_F20207_372.1589@17.0', '4-Pyridoxate_F9418_206.0425@23.45', 'ADMA_F8359_203.1504@218.37', 'N_N-DIMETHYLARGININE_F8359_203.1504@218.37', 'Piperidine_F700_86.0968@98.14', 'D-TRYPTOPHAN_F3295_227.08@112.91', 'Allopurinol_F12277_138.0488@103.6', '7-Methylguanine; AIF; CE10; CorrDec_F8672_167.0758@76.56', 'Palmitoyl-carnitine; LC-tDDA; CE10_F28867_403.3534@81.14', 'Piperine_F13915_289.1551@17.0', '3-METHYLHISTIDINE_F9728_170.0925@207.21', '3-AMINO-5-HYDROXYBENZOIC ACID_F4479_156.057@24.94', '4-Hydroxypyridine; AIF; CE10; CorrDec_F1721_96.0448@31.5', 'Pyroglutamic acid_F3535_153.0357@173.12', '5-HYDROXYINDOLE_F11584_135.0634@18.5', '2-Oxindole; LC-tDDA; CE20_F10174_172.0157@20.02', 'Tryptophan_F9859_207.1039@113.66', '4-Aminophenol_F5136_111.0635@17.38', 'cis-1,2,3,6-T