# Get stats on ToF datasets

This notebook explains the delta values within an RT window via isotopologues, adducts and fragments.

- How many are explained by probable fragments (number of features and khipus).
- How many are explained by MS2 (top 5 spectra per compound in MoNA).


In [4]:
import io
import os
import sys
import json
import contextlib
from matchms.importing import load_from_msp
from khipu.extended import peaklist_to_khipu_list, export_empCpd_khipu_list
from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

sys.path.insert(0, '..')
from mining import * 
from isf_helper import (extract_ms2_spectrum, 
                        get_comprehensive_stats_per_dataset, 
                        explain_a_dataset_by_mz_deltas, 
                        explain_a_dataset_byMS2)

## Isotopologue and Adducts from Step2

In [5]:
# pos
# Isotope entries are (m/z difference, label, (low, high)); the trailing pair is
# presumably the accepted intensity-ratio window -- TODO confirm against khipu.
isp_pos = [
    (1.003355, '13C/12C', (0, 0.8)),
    (2.00671, '13C/12C*2', (0, 0.8)),
    # disabled candidate: (3.9948, '44Ca/40Ca', (0, 0.1)),  # 2%
    (1.9970, '37Cl/35Cl', (0.1, 0.8)),  # 24.24%
]

# Adduct entries are (m/z difference, label).
# Initial patterns are relative to M+H+.
asp_pos = [
    (21.98194, 'Na/H'),
    (17.0265, 'NH3'),
    (41.026549, 'ACN'),  # Acetonitrile
    (67.987424, 'NaCOOH'),
    (32.026215, 'MeOH'),
]

In [6]:
# neg
# Isotope entries are (m/z difference, label, (low, high)); the trailing pair is
# presumably the accepted intensity-ratio window -- TODO confirm against khipu.
isp_neg = [
    (1.003355, '13C/12C', (0, 0.8)),
    (2.00671, '13C/12C*2', (0, 0.8)),
    (1.9970, '37Cl/35Cl', (0.1, 0.8)),  # 24.24%
    # NOTE(review): label reads light/heavy ('32S/34S') while the other labels
    # read heavy/light; kept as-is so labels stay consistent with Step2 output.
    (1.9958, '32S/34S', (0, 0.1)),  # 4%
]

# Adduct entries are (m/z difference, label).
# NOTE(review): the original comment said "initial patterns are relative to M+H+";
# for negative mode the base ion is presumably M-H- -- confirm.
asp_neg = [
    (21.98194, 'Na/H'),
    (67.987424, 'NaCOOH'),
    (135.974848, 'NaCOOH*2'),
    (82.0030, 'NaCH2COOH'),
    # disabled candidate: (1.99566, 'F <-> OH'),
]

## Loading MoNA MS2

In [7]:
import tqdm
import logging
# Raise the "matchms" logger threshold to ERROR so lower-level messages are not
# shown while the .msp libraries are parsed below.
logging.getLogger("matchms").setLevel(logging.ERROR)

In [36]:
def load_from_mona(path):
    '''Read a MoNA MS2 library (.msp) and group its spectra by InChIKey.

    path: the path to the MoNA .msp file

    return: dict mapping each InChIKey to the list of spectra that carry it.

    Spectra whose metadata cannot be read, or whose InChIKey is missing/empty,
    are skipped (best effort); the number skipped is printed when non-zero.
    '''
    # reused from JM
    spectral_registry = {}
    total = 0
    skipped = 0
    for x in tqdm.tqdm(load_from_msp(path)):
        try:
            inchikey = x.metadata_dict()['inchikey']
        except Exception:
            # Keep the best-effort behaviour, but do not use a bare `except`:
            # that would also swallow KeyboardInterrupt/SystemExit and make a
            # long-running load impossible to interrupt.
            skipped += 1
            continue
        if not inchikey:
            skipped += 1
            continue
        spectral_registry.setdefault(inchikey, []).append(x)
        total += 1

    print("MS2 #: ", str(len(spectral_registry)), " Compounds with ", str(total), " MS2 Spectra")
    if skipped:
        print("Skipped ", str(skipped), " spectra without a usable inchikey")
    return spectral_registry


In [37]:
# Load the filtered MoNA LC-MS/MS exports, one registry per ionization mode.
mona_msp_pos = '../MoNA_MS2/filtered_MoNA-export-LC-MS-MS_Positive_Mode.msp'
mona_msp_neg = '../MoNA_MS2/filtered_MoNA-export-LC-MS-MS_Negative_Mode.msp'

spectral_registry_pos = load_from_mona(mona_msp_pos)
spectral_registry_neg = load_from_mona(mona_msp_neg)

13973it [00:05, 2530.72it/s]


MS2 #:  13973  Compounds with  13973  MS2 Spectra


9184it [00:03, 2848.84it/s]

MS2 #:  9184  Compounds with  9184  MS2 Spectra





In [38]:
# ms2List is usable MoNA MS/MS compounds
def _split_by_precursor(registry):
    '''Run extract_ms2_spectrum on the first spectrum of every registry entry.

    return: (extracted, failed) where `failed` holds the first spectra for which
    extract_ms2_spectrum raised KeyError (collected as "no precursor").
    '''
    extracted, failed = [], []
    for spectra in registry.values():
        first_spectrum = spectra[0]
        try:
            extracted.append(extract_ms2_spectrum(first_spectrum))
        except KeyError:
            failed.append(first_spectrum)
    return extracted, failed


ms2List_pos, no_precursor_pos = _split_by_precursor(spectral_registry_pos)
print(f'{len(ms2List_pos)} spectra are found with precursors.')

ms2List_neg, no_precursor_neg = _split_by_precursor(spectral_registry_neg)
print(f'{len(ms2List_neg)} spectra are found with precursors.')

13670 spectra are found with precursors.
9074 spectra are found with precursors.


In [39]:
def _to_centurion_entries(ms2_list):
    '''Turn each ms2List item into the dict passed to build_centurion_tree.

    Items are read by position: [0] -> 'mz', [1] -> 'name', [2] -> 'peaks';
    'rtime' is always set to 0.
    '''
    entries = []
    for item in ms2_list:
        entries.append({'mz': item[0], 'rtime': 0, 'name': item[1], 'peaks': item[2]})
    return entries


ms2_tree_pos = build_centurion_tree(_to_centurion_entries(ms2List_pos))
ms2_tree_neg = build_centurion_tree(_to_centurion_entries(ms2List_neg))

## Running Batches

### Preparation

In [None]:
# Read the selected dataset names (one per line) and split them by ionization
# mode using the 'pos' / 'neg' substring of each name.
# The file is opened in a `with` block so the handle is closed deterministically.
with open('selected_16_tof_datasets.txt') as dataset_file:
    tof_datasets = [line.rstrip() for line in dataset_file]
pos_tof_datasets = [x for x in tof_datasets if 'pos' in x]
neg_tof_datasets = [x for x in tof_datasets if 'neg' in x]

In [None]:
# Map the first column of the elution-parameter table to the float in column
# index 5. That value is passed below as the RT window / rt_stdev of each study.
# NOTE(review): the header name of column 5 is not visible here -- confirm.
dict_rtwindow = {}
with open('elution_parameters_16studies_tof.tsv') as elution_file:
    next(elution_file, None)  # skip the header row (no error on an empty file)
    for line in elution_file:
        if not line.strip():
            continue  # a blank line would otherwise raise IndexError on a[5]
        a = line.rstrip().split('\t')
        dict_rtwindow[a[0]] = float(a[5])

### Positive

In [2]:
# Raw tab-separated table of candidate mass deltas (positive mode). Only the
# first whitespace-delimited field of each row is parsed below; the remaining
# columns are presumably count, reference mass, description and formula change
# -- TODO confirm.
pos_candidate_fragments_table = '''18.01	113	18.010565	water	{'H': 2, 'O': 1}
14.015	83	14.015649	addition of acetic acid and loss of CO2. Reaction: (+C2H2O2) and (-CO2)	{'C': 1, 'H': 2}
2.015	80	2.014552	2H	{'H': 2}
28.0305	59	28.0313	± C2H4, natural alkane chains such as fatty acids	{'C': 2, 'H': 4}
46.0055	54	46.00548	± CO+H2O (carboxylic acid)	{'C': 1, 'O': 2, 'H': 2}
17.0265	53	17.0265	addition of ammonia. Reaction: (+NH3)	{'N': 1, 'H': 3}
11.9995	45	12.0	methylation and reduction	{'C': 1}
44.0255	41	44.0262	hydroxyethylation	{'C': 2, 'H': 4, 'O': 1}
26.015	41	26.01565	acetylation and loss of oxygen. Reaction: (+C2H2O) and (-O)	{'C': 2, 'H': 2}
15.995	40	15.99492	± O, e.g. oxidation/reduction	{'O': 1}
16.031	39	16.0313	Methylation + reduction	{'C': 1, 'H': 4}
32.026	37	32.026215	MeOH	{'C': 1, 'H': 4, 'O': 1}
39.993	37	39.9925	extra OH sodium adduct	{'H': 1, 'O': 1}
27.9945	37	27.9949	addition of CO. Reaction: (+CO)	{'C': 1, 'O': 1}
23.999	36	24.0	acetylation and loss of water. Reaction: (+C2H2O) and (-H2O)	{'C': 2}
42.0465	35	42.04695	± C3H6, propylation	{'C': 3, 'H': 6}
9.984	32	9.98435	addition of CO and loss of water. Reaction: (+CO) and (-H2O)	{'C': 1, 'H': -2}
30.0105	31	30.010564	addition of acetic acid and loss of CO. Reaction: (+C2H2O2) and (-CO)	{'C': 1, 'H': 2, 'O': 1}
56.0625	30	56.0626	± C4H8, butylation	{'C': 4, 'H': 8}
13.979	30	13.979264	nitrification and loss of oxygen. Reaction: (NH2 -> NO2) and (-O)	{'H': -2, 'O': 1}
'''

# Pair every row with its leading delta value: (delta as float, full row text).
pos_candidate_fragments = []
for table_row in pos_candidate_fragments_table.splitlines():
    leading_field = table_row.split()[0]
    pos_candidate_fragments.append((float(leading_field), table_row))

# Just the delta values, in table order.
pos_isf_candidate_fragments = [delta for delta, _row in pos_candidate_fragments]
len(pos_candidate_fragments), pos_candidate_fragments[3]

(14,
 (28.0312,
  '28.0312\t550\t28.0312\t[\'28.0313\', \'± C2H4, natural alkane chains such as fatty acids\', "{\'C\': 2, \'H\': 4}"]'))

In [None]:
# Build one summary record per positive-mode dataset.
tally_pos = []
for study in pos_tof_datasets:
    rt_window = dict_rtwindow[study]
    feature_table_path = f'../input_data_tof/{study}/full_feature_table.tsv'

    list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
        feature_table_path, rt_window, isp_pos, asp_pos, 'pos')

    # features whose id is not among the assigned feature ids
    remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]

    # sort to make sure we are getting in-source fragments
    # (khipus ordered by neutral_formula_mass, largest first)
    list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)

    # by isf_candidate_fragments
    explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset_by_mz_deltas(
        list_khipus,
        remaining_features,
        pos_isf_candidate_fragments,
        rt_stdev=rt_window,
    )

    # by MoNA MS2
    have_precursors, matched2 = explain_a_dataset_byMS2(
        list_features, ms2_tree_pos, rt_stdev=rt_window)
    # flatten the third element of every match into one list
    delta_values_ms2 = [delta for match in matched2 for delta in match[2]]

    study_summary = {
        'study': study,
        'num_khipus': len(list_khipus),
        'num_features':  len(list_features),
        'mzdelta_explained_khipus': len(set(explained_khipu_ids)),
        'mzdelta_explained_features': len(set(explained_feature_ids)),
        'freq_delta_values_used': delta_values_used,
        'have_precursors': len(have_precursors),
        'ms2_explained_features': len(matched2),
        'delta_values_ms2': delta_values_ms2,
    }
    tally_pos.append(study_summary)

In [46]:
# Persist the positive-mode tally as pretty-printed UTF-8 JSON.
tally_pos_json = json.dumps(tally_pos, ensure_ascii=False, indent=2)
with open('isfExplained_result_tally_pos.json', 'w', encoding='utf-8') as out_file:
    out_file.write(tally_pos_json)

The above code is a bit slow when run in a Jupyter notebook. It was moved to run_stats_by_batches_pos.py; the per-dataset results are assembled below.

In [8]:
# Assemble the per-dataset JSON results written by the batch script into one list.
pos_result_dir = 'output/tof/01292025/pos'
tally_pos = []
# os.listdir returns entries in arbitrary order; sort so the assembled tally
# (and everything printed from it) is reproducible across machines and runs.
for pos_res in sorted(os.listdir(pos_result_dir)):
    # `with` closes each result file instead of leaking one handle per dataset.
    with open(os.path.join(pos_result_dir, pos_res), 'r') as result_file:
        tally_pos.append(json.load(result_file))
print(len(tally_pos))
with open('isfExplained_result_tally_pos_tof_01292025.json', 'w', encoding='utf-8') as f:
    json.dump(tally_pos, f,  ensure_ascii=False, indent=2)

10


### Negative

In [3]:
# selected from top 20
# Raw tab-separated table of candidate mass deltas (negative mode). Only the
# first whitespace-delimited field of each row is parsed below; the remaining
# columns are presumably count, reference mass, description and formula change
# -- TODO confirm.
neg_candidate_fragments_table = '''67.9875	140	67.987424	NaCOOH	"{'C': 1, 'O': 2, 'Na': 1, 'H': 1}"
2.015	70	2.014552	2H	{'H': 2}
135.974	49	135.974848	2X NaCOOH	"{'C': 2, 'O': 4, 'H': 2, 'Na': 2}"
82.002	48	82.003035	methylation and addition of trifluoromethyl. Reaction: (+CH2) and (+CF3-H)	"{'C': 2, 'H': 1, 'F': 3}"
1.011	41	1.007276467	1H	{'H':1 }
0.996	39	0.996585	addition of Guanine and loss of D-ribose. Reaction: (+C5H3N5) and (-C5H8O4)	"{'H': -5, 'N': 5, 'O': -4}"
43.989	35	43.9898	addition of CO2. Reaction: (+CO2)	"{'C': 1, 'O': 2}"
14.015	32	14.015649	addition of acetic acid and loss of CO2. Reaction: (+C2H2O2) and (-CO2)	"{'C': 1, 'H': 2}"
46.005	31	46.005305	addition of Phosphate and dechlorination. Reaction: (+HPO3) and (-Cl+H)	"{'H': 2, 'O': 3, 'P': 1, 'Cl': -1}"
26.015	29	26.01565	acetylation and loss of oxygen. Reaction: (+C2H2O) and (-O)	"{'C': 2, 'H': 2}"
61.97	29	61.975755	addition of Phosphate and defluorination. Reaction: (+HPO3) and (-F+H)	"{'H': 2, 'O': 3, 'P': 1, 'F': -1}"
129.957	29	129.958482	addition of di-phosphate and denitrification. Reaction: (+H2P2O6) and (NO2 -> NH2)	"{'H': 4, 'O': 4, 'P': 2}"
60.021	29	-60.0211	desmolysis	"{'C': -2, 'H': -4, 'O': -2}"
74.0365	28	74.03678	propionylation	"{'C': 3, 'H': 6, 'O': 2}"
23.9995	27	24	acetylation and loss of water. Reaction: (+C2H2O) and (-H2O)	{'C': 2}
10.029	27	10.028802	addition of CO2 and dechlorination. Reaction: (+CO2) and (-Cl+H)	"{'C': 1, 'H': 1, 'O': 2, 'Cl': -1}"
44.026	26	44.0262	hydroxyethylation	"{'C': 2, 'H': 4, 'O': 1}"
18.0105	26	18.010565	water	"{'H': 2, 'O': 1}"
6.0165	24	6.010565	addition of tiglyl and loss of phenyl. Reaction: (+C5H6O) and (-C6H5+H)	"{'C': -1, 'H': 2, 'O': 1}"
15.994	23	15.9949	oxidation	{'O': 1}
'''

# Pair every row with its leading delta value: (delta as float, full row text).
neg_candidate_fragments = []
for table_row in neg_candidate_fragments_table.splitlines():
    leading_field = table_row.split()[0]
    neg_candidate_fragments.append((float(leading_field), table_row))

# Just the delta values, in table order.
neg_isf_candidate_fragments = [delta for delta, _row in neg_candidate_fragments]
len(neg_candidate_fragments), neg_candidate_fragments[3]

(13, (82.0029, "82.0029\t401\t82.0029\t['82.0030 C2HF3']"))

In [None]:
# Build one summary record per negative-mode dataset.
tally_neg = []
for study in neg_tof_datasets:
    rt_window = dict_rtwindow[study]
    feature_table_path = f'../input_data_tof/{study}/full_feature_table.tsv'

    list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
        feature_table_path, rt_window, isp_neg, asp_neg, 'neg')

    # features whose id is not among the assigned feature ids
    remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]

    # sort to make sure we are getting in-source fragments
    # (khipus ordered by neutral_formula_mass, largest first)
    list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)

    # by isf_candidate_fragments
    explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset_by_mz_deltas(
        list_khipus,
        remaining_features,
        neg_isf_candidate_fragments,
        rt_stdev=rt_window,
    )

    # by MoNA MS2
    have_precursors, matched2 = explain_a_dataset_byMS2(
        list_features, ms2_tree_neg, rt_stdev=rt_window)
    # flatten the third element of every match into one list
    delta_values_ms2 = [delta for match in matched2 for delta in match[2]]

    study_summary = {
        'study': study,
        'num_khipus': len(list_khipus),
        'num_features':  len(list_features),
        'mzdelta_explained_khipus': len(set(explained_khipu_ids)),
        'mzdelta_explained_features': len(set(explained_feature_ids)),
        'freq_delta_values_used': delta_values_used,
        'have_precursors': len(have_precursors),
        'ms2_explained_features': len(matched2),
        'delta_values_ms2': delta_values_ms2,
    }
    tally_neg.append(study_summary)

In [64]:
# Persist the negative-mode tally as pretty-printed UTF-8 JSON.
tally_neg_json = json.dumps(tally_neg, ensure_ascii=False, indent=2)
with open('isfExplained_result_tally_neg.json', 'w', encoding='utf-8') as out_file:
    out_file.write(tally_neg_json)

The above code is a bit slow when run in a Jupyter notebook. It was moved to run_stats_by_batches_neg.py; the per-dataset results are assembled below.

In [9]:
# Assemble the per-dataset JSON results written by the batch script into one list.
neg_result_dir = 'output/tof/01292025/neg'
tally_neg = []
# os.listdir returns entries in arbitrary order; sort so the assembled tally
# (and everything printed from it) is reproducible across machines and runs.
for neg_res in sorted(os.listdir(neg_result_dir)):
    # `with` closes each result file instead of leaking one handle per dataset.
    with open(os.path.join(neg_result_dir, neg_res), 'r') as result_file:
        tally_neg.append(json.load(result_file))
print(len(tally_neg))
with open('isfExplained_result_tally_neg_tof_01292025.json', 'w', encoding='utf-8') as f:
    json.dump(tally_neg, f,  ensure_ascii=False, indent=2)

6


## Summary

In [4]:
# Per study: name on one line, then total features, features explained by m/z
# deltas, and features explained by MS2 on the next.
summary_fields = ('num_features', 'mzdelta_explained_features', 'ms2_explained_features')
for record in tally_pos + tally_neg:
    counts = [record[field] for field in summary_fields]
    print(record['study'], '\n', *counts)

ST002826_HILICpos__ppm25_1031121621 
 4400 28 149
MTBLS1133_RPpos_B2_ppm25_103113407 
 48168 108 174
ST000726_RPpos__ppm25_103113299 
 37840 120 333
ST001217_HILICpos__ppm25_103111503 
 7489 27 70
MTBLS718_RPpos_LPOS_B3_ppm25_1031121232 
 20974 46 69
ST002711_HILICpos_467535 Lipkin posHILIC 6530b_B7_ppm25_1031114314 
 8015 41 84
MTBLS718_HILICpos_HPOS_B1_ppm25_1031115857 
 41783 210 407
ST000046_HILICpos__ppm25_1031113822 
 4173 19 82
ST001828_RPpos_POS_ppm25_1031131529 
 133502 1107 1209
ST001217_RPpos__ppm25_1031115140 
 52472 333 368
ST002700_RPneg_466506 Lipkin negCSH 6550_B8_ppm25_103112379 
 26092 231 336
ST000046_RPneg__ppm25_1031114022 
 2171 9 65
ST002711_HILICneg_467535 Lipkin negHILIC 6550_B4_ppm25_1031115431 
 20303 191 316
MTBLS718_RPneg_LNEG_B2_ppm25_1031122629 
 5751 47 154
ST001828_RPneg_NEG_ppm25_1031133743 
 84791 496 454
ST000726_RPneg__ppm25_1031133359 
 22350 92 119
