# Get stats on Orbitrap datasets

This notebook explains the delta values within an RT window via isotopologues, adducts and fragments.

- How many features and khipus are explained by probable in-source fragments (candidate m/z deltas).
- How many features are explained by MS2 spectra (top 5 spectra per compound in MoNA).


In [1]:
import io
import os
import sys
import json
import contextlib

from matchms.importing import load_from_msp
from khipu.extended import peaklist_to_khipu_list, export_empCpd_khipu_list
from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

sys.path.insert(0, '..')
from mining import * 
from isf_helper import (extract_ms2_spectrum, 
                        get_comprehensive_stats_per_dataset, 
                        explain_a_dataset_by_mz_deltas, 
                        explain_a_dataset_byMS2)

## Isotopologues and Adducts from Step 2

In [3]:
# Positive ionization mode.
#
# Isotope search patterns, one tuple per pattern:
#   (m/z difference, label, (lower, upper))
# The trailing pair is presumably the accepted intensity-ratio range used by
# khipu -- TODO confirm against the khipu documentation.
isp_pos = [
    (1.003355, '13C/12C', (0, 0.8)),
    (2.00671, '13C/12C*2', (0, 0.8)),
    # (3.9948, '44Ca/40Ca', (0, 0.1)),  # 2% -- disabled
    (1.9970, '37Cl/35Cl', (0.1, 0.8)),  # 24.24%
]

# Adduct search patterns, one (m/z difference, label) tuple per pattern.
# Initial patterns are relative to M+H+.
asp_pos = [
    (21.98194, 'Na/H'),
    (41.026549, 'ACN'),  # Acetonitrile
    (67.987424, 'NaCOOH'),
    (37.955882, 'K/H'),
    (32.026215, 'MeOH'),
]

In [4]:
# Negative ionization mode.
#
# Isotope search patterns, one tuple per pattern:
#   (m/z difference, label, (lower, upper))
# The trailing pair is presumably the accepted intensity-ratio range used by
# khipu -- TODO confirm against the khipu documentation.
# NOTE(review): the sulfur label reads '32S/34S' whereas the other labels are
# written heavy/light ('13C/12C', '37Cl/35Cl'); left unchanged because the
# label is a runtime string -- confirm whether '34S/32S' was intended.
isp_neg = [
    (1.003355, '13C/12C', (0, 0.8)),
    (2.00671, '13C/12C*2', (0, 0.8)),
    (1.9970, '37Cl/35Cl', (0.1, 0.8)),  # 24.24%
    (1.9958, '32S/34S', (0, 0.1)),  # 4%
]

# Adduct search patterns, one (m/z difference, label) tuple per pattern.
# NOTE(review): the original comment here said "initial patterns are relative
# to M+H+", apparently copied from the positive-mode cell; in negative mode the
# reference ion is presumably M-H- -- confirm.
asp_neg = [
    (21.98194, 'Na/H'),
    (67.987424, 'NaCOOH'),
    (82.0030, 'NaCH2COOH'),
    # (1.99566, 'F <-> OH'),  # disabled
]

## Loading MoNA MS2

In [5]:
import tqdm
import logging
# Only let ERROR-level (and above) records from the "matchms" logger through,
# so that lower-severity matchms messages do not appear in the notebook output.
matchms_logger = logging.getLogger("matchms")
matchms_logger.setLevel(logging.ERROR)

In [6]:
def load_from_mona(path):
    '''Read a MoNA MS2 library file and group its spectra by InChIKey.
    
    path: the path to mona .msp file
    
    return: dict mapping each InChIKey to the list of spectra carrying it.

    Loading is best effort: a record with no (or an empty) InChIKey, or whose
    metadata cannot be read, is left out of the result instead of aborting the
    whole load. Records dropped because of an error are counted and reported.
    '''
    # reused from JM
    spectral_registry = {}
    total = 0
    skipped = 0
    for spectrum in tqdm.tqdm(load_from_msp(path)):
        try:
            inchikey = spectrum.metadata_dict()['inchikey']
            if inchikey:
                spectral_registry.setdefault(inchikey, []).append(spectrum)
                total += 1
        except Exception:
            # Deliberately `Exception`, not a bare `except`: a bare clause also
            # swallows KeyboardInterrupt/SystemExit, which made it impossible
            # to interrupt a long-running load from the notebook.
            skipped += 1

    print("MS2 #: ", str(len(spectral_registry)), " Compounds with ", str(total), " MS2 Spectra")
    if skipped:
        # Surface what used to be dropped silently.
        print("Skipped ", str(skipped), " records whose InChIKey could not be read")
    return spectral_registry


In [7]:
# MoNA LC-MS/MS exports, one .msp file per ionization mode.
_mona_msp_paths = {
    'pos': '../MoNA_MS2/filtered_MoNA-export-LC-MS-MS_Positive_Mode.msp',
    'neg': '../MoNA_MS2/filtered_MoNA-export-LC-MS-MS_Negative_Mode.msp',
}

# InChIKey -> list of spectra, loaded positive mode first, then negative.
spectral_registry_pos = load_from_mona(_mona_msp_paths['pos'])
spectral_registry_neg = load_from_mona(_mona_msp_paths['neg'])

13973it [00:05, 2499.77it/s]


MS2 #:  13973  Compounds with  13973  MS2 Spectra


9184it [00:03, 2775.01it/s]

MS2 #:  9184  Compounds with  9184  MS2 Spectra





In [8]:
def _split_by_precursor(spectral_registry):
    '''Extract one MS2 record per compound from an InChIKey -> spectra registry.

    Only the first spectrum stored for each InChIKey is considered.

    return: (extracted, failed) where `extracted` holds the results of
    extract_ms2_spectrum and `failed` holds the spectra for which it raised
    KeyError (treated as "no precursor" by this notebook).
    '''
    extracted, failed = [], []
    for spectra in spectral_registry.values():
        first_spectrum = spectra[0]
        try:
            extracted.append(extract_ms2_spectrum(first_spectrum))
        except KeyError:
            failed.append(first_spectrum)
    return extracted, failed


# ms2List is usable MoNA MS/MS compounds
ms2List_pos, no_precursor_pos = _split_by_precursor(spectral_registry_pos)
print(f'{len(ms2List_pos)} spectra are found with precursors.')

ms2List_neg, no_precursor_neg = _split_by_precursor(spectral_registry_neg)
print(f'{len(ms2List_neg)} spectra are found with precursors.')

13670 spectra are found with precursors.
9074 spectra are found with precursors.


In [9]:
# Build one centurion tree per ionization mode from the extracted MS2 records.
# Each record is reshaped into a dict with the first three tuple positions
# mapped to 'mz', 'name' and 'peaks'; 'rtime' is always set to 0.
ms2_tree_pos = build_centurion_tree([
    dict(mz=entry[0], rtime=0, name=entry[1], peaks=entry[2])
    for entry in ms2List_pos
])
ms2_tree_neg = build_centurion_tree([
    dict(mz=entry[0], rtime=0, name=entry[1], peaks=entry[2])
    for entry in ms2List_neg
])

## Running Batches

### Preparation

In [10]:
# One dataset name per line; trailing whitespace (including the newline) is
# stripped. The `with` block closes the file, which the previous
# open(...).readlines() one-liner never did.
with open('selected_45_orbi_datasets.txt') as dataset_file:
    orbi_datasets = [line.rstrip() for line in dataset_file]

# Split by ionization mode using a substring match on the dataset name.
pos_orbi_datasets = [name for name in orbi_datasets if 'pos' in name]
neg_orbi_datasets = [name for name in orbi_datasets if 'neg' in name]

In [11]:
# Map the first column of each row to the float in the sixth column (index 5).
# Presumably study name -> retention-time window; the commented-out batch
# cells pass this value as `rt_stdev` -- TODO confirm column meaning.
dict_rtwindow = {}
with open('elution_parameters_45studies_orbi.tsv') as param_file:
    next(param_file, None)  # skip the header row
    for line in param_file:
        if not line.strip():
            # A blank (e.g. trailing) line has no columns; indexing it would
            # raise IndexError.
            continue
        fields = line.rstrip().split('\t')
        dict_rtwindow[fields[0]] = float(fields[5])

### Positive

In [12]:
# Candidate mass deltas for positive mode, one tab-separated row per delta.
# Only the first column (the delta value) is parsed below; the remaining
# columns look like a count, a reference mass, an annotation and a formula
# change -- TODO confirm their exact meaning.
_pos_fragment_table = '''14.0155	900	14.015649	addition of acetic acid and loss of CO2. Reaction: (+C2H2O2) and (-CO2)	{'C': 1, 'H': 2}
18.0104	885	18.010565	water	{'H': 2, 'O': 1}
2.0155	717	2.01565	± 2H, opening or forming of double bond	{'H': 2}
44.0261	652	44.0262	hydroxyethylation	{'C': 2, 'H': 4, 'O': 1}
28.0312	621	28.0313	± C2H4, natural alkane chains such as fatty acids	{'C': 2, 'H': 4}
15.9948	479	15.9949	oxidation	{'O': 1}
17.0264	451	17.0265	addition of ammonia. Reaction: (+NH3)	{'N': 1, 'H': 3}
26.0155	440	26.01565	acetylation and loss of oxygen. Reaction: (+C2H2O) and (-O)	{'C': 2, 'H': 2}
27.9947	433	27.9949	addition of CO. Reaction: (+CO)	{'C': 1, 'O': 1}
11.9999	426	12.0	methylation and reduction	{'C': 1}
42.0104	340	42.010564	malonylation and loss of CO2. Reaction: (+C3H2O3) and (-CO2)	{'C': 2, 'H': 2, 'O': 1}
67.9872	325	67.987424	NaCOOH	{'C': 1, 'O': 2, 'Na': 1, 'H': 1}
13.9791	321	13.979264	nitrification and loss of oxygen. Reaction: (NH2 -> NO2) and (-O)	{'H': -2, 'O': 1}
23.9998	317	24.0	acetylation and loss of water. Reaction: (+C2H2O) and (-H2O)	{'C': 2}
16.0312	314	16.0313	Methylation + reduction	{'C': 1, 'H': 4}
42.0468	314	42.04695	± C3H6, propylation	{'C': 3, 'H': 6}
46.0053	313	46.005305	formic acid adduct	{'C': 1, 'H': 2, 'O': 2}
88.0522	304	88.052429	butanoic acid	{'C': 4, 'H': 8, 'O': 2}
41.0263	295	41.026549	Acetonitrile	{'C': 2, 'H': 3, 'N': 1}
30.0468	267	30.04695	addition of C2H4 and hydrogenation. Reaction: (+C2H4) and (+H2)	{'C': 2, 'H': 6}
'''

# (delta value, full original row) for every row of the table.
pos_candidate_fragments = []
for row in _pos_fragment_table.splitlines():
    delta_mz = float(row.split()[0])
    pos_candidate_fragments.append((delta_mz, row))

# Just the delta values, in table order.
pos_isf_candidate_fragments = [delta_mz for delta_mz, _ in pos_candidate_fragments]
len(pos_candidate_fragments), pos_candidate_fragments[3]

(20,
 (44.0261,
  "44.0261\t652\t44.0262\thydroxyethylation\t{'C': 2, 'H': 4, 'O': 1}"))

In [None]:
# tally_pos = []
# for study in pos_orbi_datasets:
#     list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
#         f'../input_data_orbi/{study}/full_feature_table.tsv', 
#         dict_rtwindow[study], 
#         isp_pos,
#         asp_pos,
#         'pos')
    
#     # sort to make sure we are getting in-source fragments
#     remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]
        
#     list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)
    
#     # by isf_candidate_fragments
#     explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset_by_mz_deltas(
#         list_khipus, remaining_features, pos_isf_candidate_fragments, 
#         rt_stdev=dict_rtwindow[study]
#         )
    
#     # # by MoNA MS2
#     have_precursors, matched2 = explain_a_dataset_byMS2(
#         list_features, ms2_tree_pos, rt_stdev=dict_rtwindow[study])
#     delta_values_ms2 = []
#     for x in matched2:
#         delta_values_ms2 += x[2]

#     tally_pos.append(
#         {
#             'study': study,
#             'num_khipus': len(list_khipus),
#             'num_features':  len(list_features),
#             'mzdelta_explained_khipus': len(set(explained_khipu_ids)), 
#             'mzdelta_explained_features': len(set(explained_feature_ids)),
#             'freq_delta_values_used': delta_values_used,
#             'have_precursors': len(have_precursors),
#             'ms2_explained_features': len(matched2),
#             'delta_values_ms2': delta_values_ms2,
#         }
#     )

# with open('isfExplained_result_tally_pos.json', 'w', encoding='utf-8') as f:
#     json.dump(tally_pos, f,  ensure_ascii=False, indent=2) 

The code above is slow when run inside a Jupyter notebook. As an alternative, run `run_stats_by_batches_orbi.py` and then assemble its per-study results in the cell below.

In [14]:
# Assemble the per-study JSON results for positive mode into a single list and
# export it as one JSON file.
pos_result_dir = 'output/orbi/01292025/pos'
tally_pos = []
# os.listdir returns entries in arbitrary order; sorting keeps the assembled
# list (and the exported file) reproducible between runs.
for pos_res in sorted(os.listdir(pos_result_dir)):
    # `with` closes each result file; JSON is read as UTF-8 to match the
    # encoding used for the export below.
    with open(os.path.join(pos_result_dir, pos_res), 'r', encoding='utf-8') as result_file:
        tally_pos.append(json.load(result_file))
print(len(tally_pos))
with open('isfExplained_result_tally_pos_orbi_01292025.json', 'w', encoding='utf-8') as f:
    json.dump(tally_pos, f,  ensure_ascii=False, indent=2) 

22


### Negative

In [13]:
# selected from top 25
#
# Candidate mass deltas for negative mode, one tab-separated row per delta.
# Only the first column (the delta value) is parsed below; the remaining
# columns look like a count, a reference mass, an annotation and a formula
# change -- TODO confirm their exact meaning.
# NOTE(review): the 82.0029 row is annotated as succinylation with reference
# mass 82.005479, while asp_neg labels 82.0030 as 'NaCH2COOH' -- confirm which
# annotation is intended.
_neg_fragment_table = '''67.9874	819	67.987424	NaCOOH	{'C': 1, 'O': 2, 'Na': 1, 'H': 1}
14.0156	693	14.015649	addition of acetic acid and loss of CO2. Reaction: (+C2H2O2) and (-CO2)	{'C': 1, 'H': 2}
2.0155	570	2.01565	"± 2H, opening or forming of double bond"	{'H': 2}
82.0029	431	82.005479	succinylation and loss of water. Reaction: (+C4H4O3) and (-H2O)	{'C': 4, 'H': 2, 'O': 2}
15.9948	415	15.9949	oxidation	{'O': 1}
43.9898	394	43.9898	addition of CO2. Reaction: (+CO2)	{'C': 1, 'O': 2}
18.0105	374	18.010565	water	{'H': 2, 'O': 1}
11.9999	359	12	methylation and reduction	{'C': 1}
30.0105	352	30.010564	addition of acetic acid and loss of CO. Reaction: (+C2H2O2) and (-CO)	{'C': 1, 'H': 2, 'O': 1}
26.0156	346	26.01565	acetylation and loss of oxygen. Reaction: (+C2H2O) and (-O)	{'C': 2, 'H': 2}
46.0054	339	46.005479	formic acid adduct	{'C': 1, 'H': 2, 'O': 2}
28.0312	327	28.0313	± C2H4, natural alkane chains such as fatty acids	{'C': 2, 'H': 4}
44.0261	303	44.0262	hydroxyethylation	{'C': 2, 'H': 4, 'O': 1}
27.9949	297	27.9949	addition of CO. Reaction: (+CO)	{'C': 1, 'O': 1}
23.9999	265	24	acetylation and loss of water. Reaction: (+C2H2O) and (-H2O)	{'C': 2}
13.9792	256	13.979264	nitrification and loss of oxygen. Reaction: (NH2 -> NO2) and (-O)	{'H': -2, 'O': 1}
42.0105	250	42.010564	malonylation and loss of CO2. Reaction: (+C3H2O3) and (-CO2)	{'C': 2, 'H': 2, 'O': 1}
16.0312	239	16.0313	Methylation + reduction	{'C': 1, 'H': 4}
60.021	229	60.02113	acetylation and addition of water. Reaction: (+C2H2O) and (+H2O)	{'C': 2, 'H': 4, 'O': 2}
135.9748	222	135.974848	2X NaCOOH	{'C': 2, 'O': 4, 'H': 2, 'Na': 2}
'''

# (delta value, full original row) for every row of the table.
neg_candidate_fragments = []
for row in _neg_fragment_table.splitlines():
    delta_mz = float(row.split()[0])
    neg_candidate_fragments.append((delta_mz, row))

# Just the delta values, in table order.
neg_isf_candidate_fragments = [delta_mz for delta_mz, _ in neg_candidate_fragments]
len(neg_candidate_fragments), neg_candidate_fragments[3]

(20,
 (82.0029,
  "82.0029\t431\t82.005479\tsuccinylation and loss of water. Reaction: (+C4H4O3) and (-H2O)\t{'C': 4, 'H': 2, 'O': 2}"))

In [None]:
# tally_neg = []
# for study in neg_orbi_datasets:
#     list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
#         f'../input_data_orbi/{study}/full_feature_table.tsv', 
#         dict_rtwindow[study], 
#         isp_neg,
#         asp_neg,
#         'neg')
    
#     # sort to make sure we are getting in-source fragments
#     remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]

#     list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)
    
#     # by isf_candidate_fragments
#     explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset_by_mz_deltas(
#         list_khipus, remaining_features, neg_isf_candidate_fragments, 
#         rt_stdev=dict_rtwindow[study]
#         )
    
#     # # by MoNA MS2
#     have_precursors, matched2 = explain_a_dataset_byMS2(
#         list_features, ms2_tree_neg, rt_stdev=dict_rtwindow[study])
#     delta_values_ms2 = []
#     for x in matched2:
#         delta_values_ms2 += x[2]

#     tally_neg.append(
#         {
#             'study': study,
#             'num_khipus': len(list_khipus),
#             'num_features':  len(list_features),
#             'mzdelta_explained_khipus': len(set(explained_khipu_ids)), 
#             'mzdelta_explained_features': len(set(explained_feature_ids)),
#             'freq_delta_values_used': delta_values_used,
#             'have_precursors': len(have_precursors),
#             'ms2_explained_features': len(matched2),
#             'delta_values_ms2': delta_values_ms2,
#         }
#     )
    
# with open('isfExplained_result_tally_neg.json', 'w', encoding='utf-8') as f:
#     json.dump(tally_neg, f,  ensure_ascii=False, indent=2) 

The code above is slow when run inside a Jupyter notebook. As an alternative, run `run_stats_by_batches_orbi.py` and then assemble its per-study results in the cell below.

In [15]:
# Assemble the per-study JSON results for negative mode into a single list and
# export it as one JSON file.
neg_result_dir = 'output/orbi/01292025/neg'
tally_neg = []
# os.listdir returns entries in arbitrary order; sorting keeps the assembled
# list (and the exported file) reproducible between runs.
for neg_res in sorted(os.listdir(neg_result_dir)):
    # `with` closes each result file; JSON is read as UTF-8 to match the
    # encoding used for the export below.
    with open(os.path.join(neg_result_dir, neg_res), 'r', encoding='utf-8') as result_file:
        tally_neg.append(json.load(result_file))
print(len(tally_neg))
with open('isfExplained_result_tally_neg_orbi_01292025.json', 'w', encoding='utf-8') as f:
    json.dump(tally_neg, f,  ensure_ascii=False, indent=2) 

23


## Summary

In [5]:
# Per study (positive first, then negative): the study name on one line, then
# total features, features explained by m/z deltas, features explained by MS2.
for record in [*tally_pos, *tally_neg]:
    counts = (
        record['num_features'],
        record['mzdelta_explained_features'],
        record['ms2_explained_features'],
    )
    print(record['study'], '\n', *counts)

ST002112_HILICpos_B2_ppm5_3545123 
 107975 1542 1419
ST002112_RPpos_B3_ppm5_356194 
 121340 2601 3013
ST002049_RPpos_B1_ppm5_352018 
 50895 673 506
MTBLS4187_RPpos__ppm5_3522135 
 147760 2285 1499
ST001736_RPpos__ppm5_34232333 
 27515 933 1208
ST002443_RPpos_B3_ppm5_81311338 
 27364 225 303
ST003315_RPpos__ppm5_813163714 
 22843 426 324
ST002118_RPpos_B1_ppm5_36115624 
 49322 648 438
MTBLS2245_RPpos_Method 2_B1_ppm5_81318250 
 23835 717 711
ST001430_RPpos_B1_ppm5_3423140 
 76745 804 791
ST001237_HILICpos_B2_ppm5_3524314 
 57875 1222 1505
MTBLS1465_HILICpos__ppm5_3505731 
 76457 1054 1191
ST001335_HILICpos__ppm5_35132731 
 46855 897 608
MTBLS4187_HILICpos_HILIC_ppm5_3521045 
 68156 1133 1501
ST001243_RPpos__ppm5_35185010 
 30113 713 559
ST002200_RPpos_17min_B3_ppm5_3422144 
 50647 901 600
ST002521_RPpos_B3_ppm5_3662420 
 147506 2291 421
MTBLS205_RPpos_B1_ppm5_3531210 
 21387 897 792
MTBLS136_RPpos_B17_ppm5_3583756 
 28969 865 644
ST002937_HILICpos_ST002937_pos_hilic_batch_1_zip_B2_ppm5_