# Get stats on Orbitrap datasets

This notebook explains the delta values within an RT window via isotopologues, adducts and fragments.

**How many features do they explain?**

Redo khipus using updated grid.

1. Get number of khipus per dataset, using updated grid.
2. Num total features within a RT khipu window.
3. How many are explained by probable fragments; num of features and khipus.
4. Compare MS2 for applicable khipus (neutral mass in MONA).
5. Check RT distribution for ions and fragments.


In [1]:
import sys
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import find_peaks 
from scipy.ndimage import uniform_filter1d
import statsmodels.api as sm
from khipu.extended import peaklist_to_khipu_list, export_empCpd_khipu_list

sys.path.append("/Users/lish/li.github/consensus_serum_metabolome/utils")
from mining import * 

In [2]:
# pos ionization
isotope_search_patterns = [ (1.003355, '13C/12C', (0, 0.8)),
                            (2.00671, '13C/12C*2', (0, 0.8)),
                            (3.9948, '44Ca/40Ca', (0, 0.1)), # 2%
                            (1.9970, '37Cl/35Cl', (0.1, 0.8)), # 24.24%
                            ]

adduct_search_patterns = [  # initial patterns are relative to M+H+
                            (21.98194, 'Na/H'),
                            (41.026549, 'ACN'),     # Acetonitrile
                            (67.987424, 'NaCOOH'),
                            (37.955882, 'K/H'),
                            ]


In [3]:
orbi_datasets = [x.rstrip() for x in open('selected_45_orbi_datasets.txt').readlines()]
pos_orbi_datasets = [x for x in orbi_datasets if 'pos' in x]
neg_orbi_datasets = [x for x in orbi_datasets if 'neg' in x]

dict_tablefiles = {}
for line in open('list_input_files.tsv').readlines():
    a = line.rstrip().split('\t')
    dict_tablefiles[a[0]] = a[1]

dict_tablefiles[orbi_datasets[0]]

'/Users/lish/li.proj/serum_consensus_metabolome/orbitrap_v2/ST001237/study/HILICpos/B2/asari_ST001237_HILICpos_B2_ppm5_3524314/export/full_Feature_table.tsv'

In [4]:
dict_rtwindow = {}
for line in open('elution_parameters_45studies.tsv').readlines()[1:]:
    a = line.rstrip().split('\t')
    dict_rtwindow[a[0]] = float(a[5])
    
dict_rtwindow[orbi_datasets[0]]

0.613

In [None]:
def get_comprehensive_stats_per_dataset(file, dict_tablefiles, dict_rtwindow, ion_mode):
    '''
    returns list_khipus, all_assigned_fids, remaining_features
    all_assigned_fids are for features in khipus.
    '''
    _n, list_features = read_features_from_asari_table(
        open(dict_tablefiles[file]).read()
        )
    for f in list_features:
        f['representative_intensity'] = f['peak_area']
    list_khipus, all_assigned_fids = peaklist_to_khipu_list(
                            list_features, 
                            isotope_search_patterns=isotope_search_patterns, 
                            adduct_search_patterns=adduct_search_patterns,
                            extended_adducts=[],    # not to confuse later analysis of ISF
                            mz_tolerance_ppm=5,
                            rt_tolerance=dict_rtwindow[file],
                            mode=ion_mode,
                            charges=[1, 2, 3],
                            )
    # remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]
    return export_empCpd_khipu_list(list_khipus), all_assigned_fids, list_features
    
def get_features_in_rtwindow(list_features, rt_ref, rt_stdev):
    '''
    returns features in list_features that are within rt_stdev
    '''
    return [f for f in list_features if abs(f['rtime']-rt_ref) <= rt_stdev]   

def get_khipus_in_rtwindow(list_khipus, rt_ref, rt_stdev):
    '''
    returns features in list_features that are within rt_stdev
    '''
    return [f for f in list_khipus if abs(f['MS1_pseudo_Spectra'][0]['rtime']-rt_ref) <= rt_stdev]  

In [6]:
dict_rtwindow[orbi_datasets[0]]

0.613

In [75]:
list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
    orbi_datasets[0], dict_tablefiles, dict_rtwindow, 'pos'
)

table header looks like: 
   ['id_number', 'mz', 'rtime', 'rtime_left_base', 'rtime_right_base', 'parent_masstrack_id', 'peak_area', 'cSelectivity', 'goodness_fitting', 'snr', 'detection_counts', '1120a_Marios_PD1_Inhib2_HP-PREFA58', '1040_Marios_PD1_Inhib2_HP-EA717425-8', '1040a_Marios_PD1_Inhib2_HP-PREFA54', '1040b_Marios_PD1_Inhib2_HP-PREFB54', '1041_Marios_PD1_Inhib2_HP-E9133655-7', '1042_Marios_PD1_Inhib2_HP-L1376088-6', '1043_Marios_PD1_Inhib2_HP-L1305560-7', '1044_Marios_PD1_Inhib2_HP-L1370589-6', '1045_Marios_PD1_Inhib2_HP-E9123619-6']
Read 57875 feature lines


Multiple charges considered: [1, 2, 3]


Khipu search grid: 
               M+H+       Na/H        K/H        ACN     NaCOOH
M0         1.007276  22.989216  38.963158  42.033825  68.994700
13C/12C    2.010631  23.992571  39.966513  43.037180  69.998055
37Cl/35Cl  3.004276  24.986216  40.960158  44.030825  70.991700
13C/12C*2  3.013986  24.995926  40.969868  44.040535  71.001410
44Ca/40Ca  5.002076  26.984016  42.957958 

In [None]:
remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]
len(list_khipus), len(all_assigned_fids), len(remaining_features)

(6264, 15077, 42798)

In [77]:
list_khipus[1999]

{'interim_id': 'kp2000_320.235',
 'neutral_formula_mass': 320.23504603323,
 'neutral_formula': None,
 'Database_referred': [],
 'identity': [],
 'MS1_pseudo_Spectra': [{'id_number': 'F33277',
   'id': 'F33277',
   'mz': 321.2422,
   'rtime': 101.28,
   'apex': 101.28,
   'left_base': 98.07,
   'right_base': 103.96,
   'parent_masstrack_id': '321.2422',
   'peak_area': '118276008',
   'cSelectivity': '0.9',
   'goodness_fitting': '0.95',
   'snr': '105',
   'detection_counts': '96',
   'representative_intensity': '118276008',
   'isotope': 'M0',
   'modification': 'M+H+',
   'ion_relation': 'M0,M+H+'},
  {'id_number': 'F34809',
   'id': 'F34809',
   'mz': 322.2458,
   'rtime': 100.75,
   'apex': 100.75,
   'left_base': 97.81,
   'right_base': 103.96,
   'parent_masstrack_id': '322.2458',
   'peak_area': '8702103',
   'cSelectivity': '0.97',
   'goodness_fitting': '0.81',
   'snr': '713',
   'detection_counts': '9',
   'representative_intensity': '8702103',
   'isotope': '13C/12C',
   'm

In [78]:
_bins = []
for k in list_khipus:
    M0 = get_M0(k['MS1_pseudo_Spectra'])
    _bins += [f['id'] for f in get_features_in_rtwindow(remaining_features, M0['rtime'], 0.613)]
len(_bins), len(set(_bins))

(807191, 36657)

In [79]:
all_features = [] + remaining_features
for k in list_khipus:
    all_features.append(get_M0(k['MS1_pseudo_Spectra']))
len(all_features)

49062

**Not many features are out of khipu+rt windows.**

Now search against each khipu, to see how many ISFs may exist.


In [11]:
candidate_fragments = '''14.0155	792	14.0155	['14.01565', "± CH2, alkane chains, waxes, fatty acids, methylation; or '-[C3H6ON] <-> -[C2H4ON], acrylamide versus iodoacetamide in cysteine alkylation (gels)", "{'C': 1, 'H': 2}"]
18.0104	780	18.0104	['-18.010565', 'H2O', "{'H': -2, 'O': -1}"]
2.0156	634	2.0156	['2.01565', '± 2H, opening or forming of double bond', "{'H': 2}"]
28.0312	550	28.0312	['28.0313', '± C2H4, natural alkane chains such as fatty acids', "{'C': 2, 'H': 4}"]
15.9948	420	15.9948	['15.99492', '± O, e.g. oxidation/reduction', "{'O': 1}"]
17.0264	404	17.0264	['-17.026549', 'NH3', "{'N': -1, 'H': -3}"]
26.0155	392	26.0155	[' C2H2']
27.9948	385	27.9948	['27.99492', '± CO', "{'C': 1, 'O': 1}"]
32.026	311	32.0261	['32.026215', 'MeOH', "{'C': 1, 'H': 4, 'O': 1}"]
42.0104	301	42.0104	['42.01057', '± COCH2', "{'C': 2, 'O': 1, 'H': 2}"]
67.9872	295	67.9873	['67.987424', 'NaCOOH', "{'C': 1, 'O': 2, 'Na': 1, 'H': 1}"]
13.9791	287	13.9791	['13.97927', 'O <-> 2H, e.g. Oxidation follwed by H2O elimination', "{'H': -2, 'O': 1}"]
42.0468	278	42.0468	['42.04695', '± C3H6, propylation', "{'C': 3, 'H': 6}"]
46.0053	277	46.0053	['-46.005479', 'H2O+CO', "{'C': -1, 'H': -2, 'O': -2}"]
'''
candidate_fragments = [
    (float(x.split()[0]), x) for x in candidate_fragments.splitlines()
]
len(candidate_fragments), candidate_fragments[3]

(14,
 (28.0312,
  '28.0312\t550\t28.0312\t[\'28.0313\', \'± C2H4, natural alkane chains such as fatty acids\', "{\'C\': 2, \'H\': 4}"]'))

In [12]:
# sort khipus by mass
list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)

In [81]:
def mz_in_list(mz, mlist, max_diff=0.0005, ppm=5):
    '''
    mz can be zero, which is False match.
    '''
    r = False
    if mz > max_diff:
        deltas = sorted([abs(x-mz) for x in mlist])
        if deltas[0] <= max_diff or deltas[0]/mz < ppm*1e-6:
            r = True
    return r 
    
def explain_a_dataset(list_khipus, remaining_features, isf_candidate_fragments, rt_stdev=0.613):
    '''
    by isf_candidate_fragments
    '''
    explained_khipu_ids, explained_feature_ids = [], []
    delta_values_used = []
    
    for ii in range(len(list_khipus)-1):
        rt_ref = list_khipus[ii]['MS1_pseudo_Spectra'][0]['rtime']
        base_mz = get_M0(list_khipus[ii]['MS1_pseudo_Spectra'])['mz']
        khipus_in_rtwindow = get_khipus_in_rtwindow(
            list_khipus[ii+1:], 
            rt_ref, 
            rt_stdev)
        for k in khipus_in_rtwindow:
            _d = list_khipus[ii]['neutral_formula_mass']-k['neutral_formula_mass']
            if mz_in_list(_d, isf_candidate_fragments):
                explained_khipu_ids.append(k['interim_id'])
                delta_values_used.append(_d)
            
        features_in_rtwindow = get_features_in_rtwindow(
            remaining_features, 
            rt_ref, 
            rt_stdev)
        for f in features_in_rtwindow:
            _d = base_mz - f['mz']
            if mz_in_list(
                _d, isf_candidate_fragments
            ):
                explained_feature_ids.append(f['id'])
                delta_values_used.append(_d)
                
    return explained_khipu_ids, explained_feature_ids, delta_values_used

In [82]:
isf_candidate_fragments = [x[0] for x in candidate_fragments]

explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset(list_khipus, 
                                                               remaining_features,
                                                               isf_candidate_fragments)
len(set(explained_khipu_ids)), len(set(explained_feature_ids))
    

(355, 1061)

In [85]:
sorted(delta_values_used)[600:606]

[15.994853099999943,
 15.994855833333318,
 15.994892499999992,
 15.994899999999973,
 15.994899999999973,
 15.994899999999973]

**Common fragments do not explain many additional features**

Above are not big numbers, given (5590, 44895) to start with.


Try MoNA MS2 search next. 

In [54]:
from matchms.importing import load_from_msp
from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

from asari.tools import match_features as mf
from asari.mass_functions import mass_paired_mapping, all_mass_paired_mapping

In [None]:
# from JM
path = '../annotation_sources/filtered_MoNA-export-LC-MS-MS_Positive_Mode.msp'

spectral_registry = {}
total = 0
for x in load_from_msp(path):
    try:
        inchikey = x.metadata_dict()['inchikey']
        if inchikey:
            if inchikey not in spectral_registry:
                spectral_registry[inchikey] = []
            spectral_registry[inchikey].append(x)
            total += 1
    except:
        pass

print("filtered MS2 Size: ", str(len(spectral_registry)), " Compounds with ", str(total), " MS2 Spectra")


Unfiltered MS2 Size:  13972  Compounds with  13972  MS2 Spectra


In [17]:
spectral_registry['AACVPYUISGWNOU-WYMPLXKRSA-N']

[Spectrum(precursor m/z=1063.54, 413 fragments between 76.0 and 1806.6)]

In [46]:
def extract_ms2_spectrum(sp, N=5):
    '''
    returns _precursor, _name, top N (intensity, mz)
    '''
    _d = sp.metadata_dict()
    _precursor, _name = _d['precursor_mz'], _d['compound_name']
    imz = zip(sp.peaks.intensities, sp.peaks.mz)
    imz = [x for x in imz if x[1] < _precursor - 0.01] # excluding _precursor
    return _precursor, _name, sorted(imz, reverse=True)[:N]

extract_ms2_spectrum(spectral_registry['AACVPYUISGWNOU-WYMPLXKRSA-N'][0])

(1063.54,
 'NCGC00385453-01_C53H84O20_(3beta,12beta,14beta)-3-{[beta-D-Glucopyranosyl-(1->4)-6-deoxy-3-O-methyl-beta-D-glucopyranosyl-(1->4)-2,6-dideoxy-3-O-methyl-beta-D-ribo-hexopyranosyl-(1->4)-2,6-dideoxy-3-O-methyl-beta-D-ribo-hexopyranosyl]oxy}-14-hydroxy-20-oxopregn-5-en-12-yl (2E)-2-methyl-2-butenoate',
 [(26.896237, 489.196625),
  (24.475206, 963.50415),
  (15.594292, 964.493103),
  (9.966106, 965.499939),
  (4.45185, 490.196716)])

In [None]:
# ms2List is usable MoNA MS/MS compounds

ms2List, no_precursor = [], []
for sps in spectral_registry.values(): 
    try:
        ms2List.append(extract_ms2_spectrum(sps[0])) 
    except KeyError:
        no_precursor.append(sps[0])
        
print(len(ms2List))

mz_ms2List = [x[0] for x in ms2List]
mz_khipu_features = [get_M0(k['MS1_pseudo_Spectra'])['mz'] for k in list_khipus]
mz_remaining_features = [f['mz'] for f in remaining_features]

# How many precursors match
found_ms2list = all_mass_paired_mapping(mz_ms2List, mz_khipu_features, std_ppm=5)[0]
found_2_ms2list = all_mass_paired_mapping(mz_ms2List, mz_remaining_features, std_ppm=5)[0]

print(len(found_ms2list), len(found_2_ms2list))

13672
3217 14884


In [48]:
found_ms2list[33]

(6218, 443)

In [49]:
mz_khipu_features[443], ms2List[6218]

(448.0956,
 (448.096631171168,
  'MMV671636',
  [(1.06, 270.0732),
   (0.81, 214.8437),
   (0.3, 361.0909),
   (0.17, 430.9724),
   (0.17, 72.0323)]))

In [53]:
MS2flist = []
for x in ms2List:
    MS2flist.append({
        'mz': x[0],
        'rtime': 0,
        'name': x[1],
        'peaks': x[2]
    })
    
tree_MS2flist = build_centurion_tree(MS2flist)

In [None]:
def test_match_ms2(mzs_in_rtbin, ms2_fragments, limit_ppm=5):
    '''
    returns fragments that are matched in mzs_in_rtbin
    
    ms2_precursor and mz should already match before this.
    '''
    found = []
    if mzs_in_rtbin and ms2_fragments:
        for mz in ms2_fragments:
            if min([abs(mz-x) for x in mzs_in_rtbin]) < 0.000001*limit_ppm*mz:
                found.append(mz)
    return found

In [61]:
# 
# 
matched = []
for k in list_khipus:
    m0 = get_M0(k['MS1_pseudo_Spectra'])
    in_rtwindow = get_features_in_rtwindow(all_features, m0['rtime'], 0.613)
    in_rtwindow = [f['mz'] for f in in_rtwindow if f['mz'] < m0['mz']]
    # find matched precursor
    precursors = find_all_matches_centurion_indexed_list(m0['mz'], tree_MS2flist, limit_ppm=5)
    for sp in precursors:
        tested = test_match_ms2(in_rtwindow, [x[1] for x in sp['peaks']], 5)
        if tested:
            matched.append((k['interim_id'], sp, tested))
            


In [64]:
_more = []
for m in matched:
    _more += m[2]
    
len(_more), len(set(_more))

(509, 448)

In [None]:

matched2 = []
for ff in remaining_features:
    in_rtwindow = get_features_in_rtwindow(all_features, ff['rtime'], 0.613)
    in_rtwindow = [f['mz'] for f in in_rtwindow if f['mz'] < ff['mz']]
    # find matched precursor
    precursors = find_all_matches_centurion_indexed_list(ff['mz'], tree_MS2flist, limit_ppm=5)
    for sp in precursors:
        tested = test_match_ms2(in_rtwindow, [x[1] for x in sp['peaks']], 5)
        if tested:
            matched2.append((ff['id_number'], sp, tested))

In [68]:
_more = []
for m in matched+matched2:
    _more += m[2]
    
len(matched), len(matched2), len(_more), len(set(_more))

(377, 616, 1225, 1016)

In [87]:
matched[99]

('kp2742_359.1429',
 {'mz': 360.1485,
  'rtime': 0,
  'name': 'Rabeprazole',
  'peaks': [(100.0, 210.12346),
   (47.886888, 167.1178),
   (11.076144, 193.09705),
   (8.398393, 168.10135),
   (8.348258, 150.09135)]},
 [167.1178])

**1016 features are matched to any of the MS2 peaks, out of 993 precursors**

Using 13672 cpds from MoNA.

This dataset: 6264 khipus, (15077, 42798) features

**Next, do all datasets. Pos data first.**

In [92]:
def explain_a_dataset_byMS2(all_features, ms2tree, rt_stdev, limit_ppm=5):
    '''
    tree_MS2flist = build_centurion_tree(MS2flist)
    example MS2:  {'mz': 209.081,
        'rtime': 0,
        'name': 'Pyrenocin A - NCGC00169582-02_C11H12O4_1H-2-Benzopyran-1-one, 3,4-dihydro-6-hydroxy-8-methoxy-3-methyl-',
        'peaks': [(100.0, 163.075516),
        (96.239927, 191.070282),
        (26.604602, 103.053802),
        (19.662868, 135.080414),
        (18.420266, 148.05191)]},
    example result from test_match_ms2: [191.070282]
    '''   
    matched2, _have_precursors = [], []
    for ff in all_features:
        in_rtwindow = get_features_in_rtwindow(all_features, ff['rtime'], rt_stdev)
        in_rtwindow = [x['mz'] for x in in_rtwindow if x['mz'] < ff['mz']+0.0005]
        # find matched precursor
        precursors = find_all_matches_centurion_indexed_list(
            ff['mz'], ms2tree, limit_ppm=limit_ppm)
        if precursors:
            _have_precursors.append(ff['id_number'])
        for sp in precursors:
            tested = test_match_ms2(in_rtwindow, [x[1] for x in sp['peaks']], limit_ppm)
            if tested:
                _deltas = [sp['mz']-x for x in tested]
                matched2.append((ff['id_number'], sp['name'], _deltas))
                
    return _have_precursors, matched2
                

In [None]:
# Run took 3 minutes for this dataset
matched2 = explain_a_dataset_byMS2(all_features, tree_MS2flist, 0.613, 5)

In [91]:
len(matched2), matched2[88], len(set([x[0] for x in matched2]))

(993, ('F12751', 'Amino adipic acid dimethyl ester', [60.02099999999999]), 688)

In [None]:
# Collect both explanation of 
# isf_candidate_fragments : frequent detal
# and MoNA MS2 peaks
# 110 minutes

tally_pos = []
for f in pos_orbi_datasets:
    list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
        f, dict_tablefiles, dict_rtwindow, 'pos'
        )

    list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)
    remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]
    
    # by isf_candidate_fragments
    explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset(
        list_khipus, remaining_features, isf_candidate_fragments, 
        rt_stdev=dict_rtwindow[f]
        )
    
    # by MoNA MS2
    have_precursors, matched2 = explain_a_dataset_byMS2(
        list_khipus, tree_MS2flist, rt_stdev=dict_rtwindow[f])
    delta_values_ms2 = []
    for x in matched2:
        delta_values_ms2 += x[2]
    
    tally_pos.append(
        {
            'study': f,
            'num_khipus': len(list_khipus),
            'num_features':  len(list_features),
            'mzdelta_explained_khipus': len(set(explained_khipu_ids)), 
            'mzdelta_explained_features': len(set(explained_feature_ids)),
            'freq_delta_values_used': delta_values_used,
            'have_precursors': len(have_precursors),
            'ms2_explained_features': len(matched2),
            'delta_values_ms2': delta_values_ms2,
        }
    )

table header looks like: 
   ['id_number', 'mz', 'rtime', 'rtime_left_base', 'rtime_right_base', 'parent_masstrack_id', 'peak_area', 'cSelectivity', 'goodness_fitting', 'snr', 'detection_counts', '1120a_Marios_PD1_Inhib2_HP-PREFA58', '1040_Marios_PD1_Inhib2_HP-EA717425-8', '1040a_Marios_PD1_Inhib2_HP-PREFA54', '1040b_Marios_PD1_Inhib2_HP-PREFB54', '1041_Marios_PD1_Inhib2_HP-E9133655-7', '1042_Marios_PD1_Inhib2_HP-L1376088-6', '1043_Marios_PD1_Inhib2_HP-L1305560-7', '1044_Marios_PD1_Inhib2_HP-L1370589-6', '1045_Marios_PD1_Inhib2_HP-E9123619-6']
Read 57875 feature lines


Multiple charges considered: [1, 2, 3]


Khipu search grid: 
               M+H+       Na/H        K/H        ACN     NaCOOH
M0         1.007276  22.989216  38.963158  42.033825  68.994700
13C/12C    2.010631  23.992571  39.966513  43.037180  69.998055
37Cl/35Cl  3.004276  24.986216  40.960158  44.030825  70.991700
13C/12C*2  3.013986  24.995926  40.969868  44.040535  71.001410
44Ca/40Ca  5.002076  26.984016  42.957958 

In [95]:
len(tally_pos), tally_pos[8]['ms2_explained_features'], tally_pos[8]['mzdelta_explained_features']

(22, 844, 650)

In [96]:
len(tally_pos[8]['freq_delta_values_used']), len(tally_pos[8]['delta_values_ms2'])

(1360, 1030)

In [97]:
with open('isfExplained_result_tally_pos.json', 'w', encoding='utf-8') as f:
    json.dump(tally_pos, f,  ensure_ascii=False, indent=2) 

**Neg data**



In [None]:
# neg ionization
isotope_search_patterns = [ (1.003355, '13C/12C', (0, 0.8)),
                            (2.00671, '13C/12C*2', (0, 0.8)),
                            (1.9970, '37Cl/35Cl', (0.1, 0.8)), # 24.24%
                            ]

adduct_search_patterns = [  # initial patterns are relative to M+H+
                            (21.98194, 'Na/H'), (67.987424, 'NaCOOH'),
                            (82.0030, 'C2HF3'),
                            (1.99566, 'F <-> OH'), 
                            ]

In [None]:
# reused from JM
path = '../annotation_sources/filtered_MoNA-export-LC-MS-MS_Negative_Mode.msp'

spectral_registry = {}
total = 0
for x in load_from_msp(path):
    try:
        inchikey = x.metadata_dict()['inchikey']
        if inchikey:
            if inchikey not in spectral_registry:
                spectral_registry[inchikey] = []
            spectral_registry[inchikey].append(x)
            total += 1
    except:
        pass

print("MS2 #: ", str(len(spectral_registry)), " Compounds with ", str(total), " MS2 Spectra")

ms2List, no_precursor = [], []
for sps in spectral_registry.values(): 
    try:
        ms2List.append(extract_ms2_spectrum(sps[0])) 
    except KeyError:
        no_precursor.append(sps[0])
        
print(len(ms2List))

MS2flist = []
for x in ms2List:
    MS2flist.append({
        'mz': x[0],
        'rtime': 0,
        'name': x[1],
        'peaks': x[2]
    })
    
tree_MS2flist = build_centurion_tree(MS2flist)

Unfiltered MS2 Size:  9184  Compounds with  9184  MS2 Spectra
9078


In [None]:
# 134 minutes
tally_neg = []
for f in neg_orbi_datasets:
    list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
        f, dict_tablefiles, dict_rtwindow, 'neg'
        )

    list_khipus = sorted(list_khipus, key=lambda x: x['neutral_formula_mass'], reverse=True)
    remaining_features = [f for f in list_features if f['id'] not in all_assigned_fids]
    
    # by isf_candidate_fragments
    explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset(
        list_khipus, remaining_features, isf_candidate_fragments, 
        rt_stdev=dict_rtwindow[f]
        )
    
    # by MoNA MS2
    have_precursors, matched2 = explain_a_dataset_byMS2(
        list_khipus, tree_MS2flist, rt_stdev=dict_rtwindow[f])
    delta_values_ms2 = []
    for x in matched2:
        delta_values_ms2 += x[2]
    
    tally_neg.append(
        {
            'study': f,
            'num_khipus': len(list_khipus),
            'num_features':  len(list_features),
            'mzdelta_explained_khipus': len(set(explained_khipu_ids)), 
            'mzdelta_explained_features': len(set(explained_feature_ids)),
            'freq_delta_values_used': delta_values_used,
            'have_precursors': len(have_precursors),
            'ms2_explained_features': len(matched2),
            'delta_values_ms2': delta_values_ms2,
        }
    )

table header looks like: 
   ['id_number', 'mz', 'rtime', 'rtime_left_base', 'rtime_right_base', 'parent_masstrack_id', 'peak_area', 'cSelectivity', 'goodness_fitting', 'snr', 'detection_counts', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM4_35', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_03', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_04', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_05', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_06', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_07', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_08', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_09', 'QXA13POL20180807_MSSM0118ML_HUMAN_SERUM1_10']
Read 35496 feature lines


Multiple charges considered: [1, 2, 3]


Khipu search grid: 
               M-H-       Na/H        K/H        ACN     NaCOOH
M0        -1.007276  20.974664  36.948606  40.019273  66.980148
13C/12C   -0.003921  21.978019  37.951961  41.022628  67.983503
37Cl/35Cl  0.989724  22.971664  38.945606  42.016273  68.977148
13C/12C*2  0.999434  22.981374  38.95

In [100]:
len(tally_neg), tally_neg[8]['ms2_explained_features'], tally_neg[8]['mzdelta_explained_features']

(23, 215, 486)

In [101]:
with open('isfExplained_result_tally_neg.json', 'w', encoding='utf-8') as f:
    json.dump(tally_neg, f,  ensure_ascii=False, indent=2) 

In [103]:
for x in tally_pos + tally_neg:
    print(x['study'], '\n', x['num_features'], x['mzdelta_explained_features'], x['ms2_explained_features'])

ST001237_HILICpos_B2_ppm5_3524314 
 42798 1061 993
ST002937_HILICpos_ST002937_pos_hilic_batch_1_zip_B2_ppm5_351419 
 124335 1884 993
ST002112_HILICpos_B2_ppm5_3545123 
 82833 1247 993
MTBLS4187_HILICpos_HILIC_ppm5_3521045 
 50806 879 993
MTBLS3852_HILICpos__ppm5_3533848 
 179960 1402 993
MTBLS1465_HILICpos__ppm5_3505731 
 60462 851 993
ST001335_HILICpos__ppm5_35132731 
 31061 680 993
ST001736_RPpos__ppm5_34232333 
 17971 702 993
MTBLS136_RPpos_B17_ppm5_3583756 
 18660 650 844
ST002200_RPpos_17min_B3_ppm5_3422144 
 35608 720 1256
ST001181_RPpos_B11_ppm5_36163417 
 28256 189 1256
ST002443_RPpos_B3_ppm5_81311338 
 23696 199 1256
ST002118_RPpos_B1_ppm5_36115624 
 38456 615 1256
ST003315_RPpos__ppm5_813163714 
 16471 349 695
ST002112_RPpos_B3_ppm5_356194 
 81564 2365 993
MTBLS4187_RPpos__ppm5_3522135 
 101807 1766 993
ST002049_RPpos_B1_ppm5_352018 
 39256 561 1256
MTBLS2245_RPpos_Method 2_B1_ppm5_81318250 
 15960 587 695
ST001430_RPpos_B1_ppm5_3423140 
 59122 787 989
MTBLS205_RPpos_B1_ppm5_

# Conclusion

This notebook tests how many features can be explained by frequent fragments or MS2 peaks.

The RT window does not guanrantee coelution. Next notebook will test the real RT distribution.

The notebook 6 will recalculate many features can be explained by khipu using comprehensive ion lists.
