In [1]:
import json
import intervaltree
import csv
import pandas as pd
import numpy as np

In [2]:
def read_CD(path):
    """Collect named annotations from a CSV annotation table.

    The table is presumably a Compound Discoverer ("CD") export -- confirm
    with whoever produced the file. Four columns are read: "Name", "m/z",
    "RT [min]" and "Annot. Source: mzVault Search".

    Rows are handled in file order:

    * rows whose "Name" is exactly "No results" or "Tags" are skipped;
    * a row with a string "Name" is a primary row: it is recorded with its
      own m/z and retention time, which are also remembered;
    * a row without a string "Name" but with a string mzVault entry other
      than "No results" is recorded under that entry, using the m/z and
      retention time of the most recent primary row (``None`` when no
      primary row has been seen yet).

    Parameters
    ----------
    path : anything accepted by ``pandas.read_csv``

    Returns
    -------
    list of dict
        One ``{"Name", "m/z", "RT [min]"}`` dict per recorded annotation.
    """
    table = pd.read_csv(path)
    columns = (
        table["Name"],
        table["m/z"],
        table["RT [min]"],
        table["Annot. Source: mzVault Search"],
    )

    records = []
    parent_mz = None
    parent_rt = None
    for name, mz, rtime, vault_name in zip(*columns):
        # Placeholder rows carry no annotation of their own.
        if name == "No results" or name == "Tags":
            continue

        if isinstance(name, str):
            # Primary row: remember its coordinates for the sub-rows below it.
            parent_mz, parent_rt = mz, rtime
            records.append({"Name": name, "m/z": mz, "RT [min]": rtime})
            continue

        # Missing names are read by pandas as non-string values (NaN), so
        # this branch handles rows that only carry an mzVault entry.
        if isinstance(vault_name, str) and vault_name != "No results":
            records.append(
                {"Name": vault_name, "m/z": parent_mz, "RT [min]": parent_rt}
            )
    return records

In [3]:
def read_pcpfm(path):
    """Load a JSON annotation file and return the decoded object.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file (presumably written by pcpfm -- confirm).

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(path) as handle:
        return json.load(handle)

In [4]:
def map_CD_annots(cd_annots, pcpfm_annots, mz_ppm_tol=10, rt_err=30):
    """Attach matching feature ids to every CD annotation.

    Each peak in every entry's "MS1_pseudo_Spectra" list is indexed twice:
    by an m/z window of +/- ``mz_ppm_tol`` parts-per-million of the peak's
    own m/z, and by a retention-time window of +/- ``rt_err`` around the
    peak's ``rtime``. A CD annotation matches a peak when its m/z and its
    retention time both fall inside that peak's windows. Every feature that
    belongs to the same entry as a matched peak is then added to the
    annotation's ``'mapped_to'`` set.

    Parameters
    ----------
    cd_annots : list of dict
        Annotations with "Name", "m/z" and "RT [min]" keys. Each dict is
        modified in place: a ``'mapped_to'`` set is created or replaced.
    pcpfm_annots : dict
        Maps an entry id to a dict with an "MS1_pseudo_Spectra" list of
        peaks; each peak has 'id_number', 'mz' and 'rtime'.
    mz_ppm_tol : number
        Half-width of the m/z window, in ppm of the peak m/z.
    rt_err : number
        Half-width of the retention-time window, in the units of
        ``peak['rtime']`` (presumably seconds, since the CD retention time
        in minutes is multiplied by 60 before the lookup -- confirm).

    Returns
    -------
    list of dict
        The same ``cd_annots`` list that was passed in.
    """
    mz_index = intervaltree.IntervalTree()
    rt_index = intervaltree.IntervalTree()
    members_by_khipu = {}
    khipu_by_feature = {}

    for khipu_id, khipu in pcpfm_annots.items():
        for peak in khipu["MS1_pseudo_Spectra"]:
            feature_id = peak['id_number']
            khipu_by_feature[feature_id] = khipu_id
            members_by_khipu.setdefault(khipu_id, []).append(feature_id)

            peak_mz, peak_rt = peak['mz'], peak['rtime']
            half_width = peak_mz / 1e6 * mz_ppm_tol
            mz_index.addi(peak_mz - half_width, peak_mz + half_width, feature_id)
            rt_index.addi(peak_rt - rt_err, peak_rt + rt_err, feature_id)

    for cd_annot in cd_annots:
        mapped = set()
        cd_annot['mapped_to'] = mapped
        if not cd_annot['Name']:
            continue

        query_mz = float(cd_annot['m/z'])
        # "RT [min]" is scaled by 60 before the retention-time lookup.
        query_rt = float(cd_annot['RT [min]']) * 60

        mz_hits = {interval.data for interval in mz_index.at(query_mz)}
        rt_hits = {interval.data for interval in rt_index.at(query_rt)}

        # A feature must satisfy both windows; a hit then pulls in every
        # feature recorded for the entry that owns it.
        for hit in mz_hits & rt_hits:
            mapped.update(members_by_khipu[khipu_by_feature[hit]])
    return cd_annots

In [5]:
def eval_overlap(cd_path, pcpfm_path, mz_ppm_tol=10, rt_err=30, top_n=10):
    """Print how many (name, feature) annotations two sources share.

    The CD table at ``cd_path`` and the JSON annotations at ``pcpfm_path``
    are loaded, and CD annotations are mapped onto features with
    ``map_CD_annots``. Two sets of ``"<name>_<feature id>"`` strings are
    then built:

    * from CD: each annotation name paired with each feature it mapped to;
    * from the JSON file: for every MS2 spectrum of every entry that has an
      "MS2_Spectra" key, the ``top_n`` annotations with the highest
      "msms_score", each ``reference_id`` paired with every peak id in that
      entry's "MS1_pseudo_Spectra".

    The sizes of both sets, the size of their intersection and the
    intersection itself are printed. Names are compared as exact strings.

    Parameters
    ----------
    cd_path : str
        Path of the CD CSV table.
    pcpfm_path : str
        Path of the JSON annotation file.
    mz_ppm_tol : number, default 10
        m/z tolerance forwarded to ``map_CD_annots``.
    rt_err : number, default 30
        Retention-time tolerance forwarded to ``map_CD_annots``.
    top_n : int or None, default 10
        Number of best-scoring annotations kept per MS2 spectrum; ``None``
        keeps all of them.

    Returns
    -------
    None
    """
    cd_annots = read_CD(cd_path)
    pcpfm_annots = read_pcpfm(pcpfm_path)
    cd_annots = map_CD_annots(
        cd_annots, pcpfm_annots, mz_ppm_tol=mz_ppm_tol, rt_err=rt_err
    )

    cd_pairs = {
        cd_annot["Name"] + "_" + feature
        for cd_annot in cd_annots
        for feature in cd_annot["mapped_to"]
    }

    pcpfm_pairs = set()
    for kp in pcpfm_annots.values():
        if "MS2_Spectra" not in kp:
            continue
        for ms2 in kp["MS2_Spectra"]:
            # Best score first; sorted() is stable, so ties keep file order.
            ranked = sorted(ms2["annotations"], key=lambda a: -a["msms_score"])
            for annotation in ranked[:top_n]:
                for peak in kp["MS1_pseudo_Spectra"]:
                    pcpfm_pairs.add(
                        annotation['reference_id'] + "_" + peak['id_number']
                    )

    print(len(cd_pairs), len(pcpfm_pairs))
    shared = cd_pairs.intersection(pcpfm_pairs)
    print("Shared Annotations: ", len(shared))
    print(shared)

In [8]:
# RP NEG: compare the CD table with the JSON annotations for the RP_neg files
# (presumably reverse-phase chromatography, negative ionization -- confirm).
# NOTE(review): the trailing "30 sec" is ambiguous -- it may be the cell's
# runtime or the retention-time tolerance (rt_err defaults to 30); confirm.
eval_overlap("./CD_MoNA_RP_neg.csv", "./HZV029_RP_neg_MoNA_annotations.json") # 30 sec


2501 1540
Shared Annotations:  710
{'Glutamic acid_F8787', '1,3-Benzenedicarboxylic acid_F11193', 'Glycochenodeoxycholic acid; LC-tDDA; CE30_F22415', 'GLYCOCHENODEOXYCHOLIC ACID_F19836', 'Glycochenodeoxycholic acid; LC-tDDA; CE40_F24121', 'LPE 18:1_F23582', '3-phenyllactic acid_F8880', 'Glycochenodeoxycholic acid; LC-tDDA; CE30_F23858', 'DL-Glyceric acid_F2877', '(2-{[2-hexadecanamido-3-hydroxyoctadec-4-en-1-yl phosphono]oxy}ethyl)trimethylazanium_F32078', 'Glycochenodeoxycholic acid; LC-tDDA; CE30_F19836', 'Succinic acid_F3870', 'Quinic acid_F3742', 'Bilirubin_F28120', 'Xylose_F8849', 'Quinic acid_F5865', 'Xylose_F4415', 'Terephthalic acid_F11193', 'Aconitic Acid_F1382', '5-OXO-D-PROLINE_F5271', 'Acetaminophen-glucuronide; LC-tDDA; CE40_F16952', '(R)-2-hydroxystearic acid_F12929', 'Methylmalonic acid; LC-tDDA; CE20_F3974', 'LACTIC ACID_F661', '5-OXO-D-PROLINE_F5277', '(R)-2-hydroxystearic acid_F16936', 'TAUROCHOLIC ACID_F24523', 'Acetaminophen glucuronide_F13957', 'TRANS-ACONITATE_F16

In [7]:
# HILIC POS: the same comparison for the HILIC_pos files
# (presumably HILIC chromatography, positive ionization -- confirm).
eval_overlap("./CD_MoNA_HILIC_pos.csv", "./HZV029_HILIC_pos_MoNA_annotations.json")


3998 3490
Shared Annotations:  1455
{'Linoleyl-carnitine; AIF; CE30; CorrDec_F24748', 'Palmitoyl-carnitine; LC-tDDA; CE20_F28290', 'PROLINE_F6805', 'Omeprazole; AIF; CE10; CorrDec_F18004', 'cis-1,2,3,6-Tetrahydrophthalimide_F3584', 'L-Tryptophan_F9538', 'Pyroglutamic acid_F10702', '2-Aminophenol_F3360', 'Oleoyl-L-Carnitine_F25461', '4-Hydroxymandelonitrile_F2943', '2-HYDROXYPYRIDINE_F1699', 'N-Methylproline_F9946', 'Phenylacetylglutamine; AIF; CE30; CorrDec_F13541', 'Oleoyl-carnitine; LC-tDDA; CE30_F26294', 'THEOPHYLLINE_F607', 'PHENYLALANINE_F3793', '3-Methylhistidine (v7)_F10239', 'Palmitoyl-L-Carnitine_F24739', 'N-Methylproline_F9964', 'BILIVERDIN_F38987', 'Biliverdin_F38987', 'CREATININE_F3492', 'N_N-DIMETHYLARGININE_F8397', 'Glycoursodeoxycholic acid_F23390', 'GUANIDINEACETIC ACID_F7684', 'Phenylacetylglutamine; AIF; CE30; CorrDec_F15743', 'lenticin_F10309', '3-Pyridinol_F1691', 'N6-Methyl-lysine; AIF; CE30; MS2Dec_F6231', '3-HYDROXYANTHRANILIC ACID_F4443', '3-Methylhistidine_F986