# Get Stats on Example Dataset
This notebook prototypes the statistics on a single example dataset; it is the step before extending the analysis to all selected datasets.

In [39]:
import io
import sys
import json
import contextlib
from khipu.extended import peaklist_to_khipu_list, export_empCpd_khipu_list

sys.path.insert(0, '..')
from mining import * 

## Isotopologue and Adducts from Step2

In [40]:
# Positive ion mode search patterns.
# Isotope entries are (m/z difference, label, (low, high)); the last pair is
# presumably the accepted intensity-ratio range used by khipu -- TODO confirm.
isp_pos = [
    (1.003355, '13C/12C', (0, 0.8)),
    (2.00671, '13C/12C*2', (0, 0.8)),
    (3.9948, '44Ca/40Ca', (0, 0.1)),    # 2%
    (1.9970, '37Cl/35Cl', (0.1, 0.8)),  # 24.24%
]

# Adduct entries are (m/z difference, label); initial patterns are relative to M+H+.
asp_pos = [
    (21.98194, 'Na/H'),
    (41.026549, 'ACN'),  # Acetonitrile
    (67.987424, 'NaCOOH'),
    (37.955882, 'K/H'),
]

# Example dataset and the retention time tolerance used with it.
dataset_name = 'ST001237_HILICpos_B2_ppm5_3524314'
rtime_tolerance = 0.613

## Some Reused Functions

In [41]:
def get_features_in_rtwindow(list_features, rt_ref, rt_stdev):
    '''Select the features whose retention time falls inside a window.

    list_features: features to filter; each one is indexed with the key 'rtime'
    rt_ref: retention time at the center of the window
    rt_stdev: half width of the window (the bounds are inclusive)

    return: list of the features inside the window, in their original order
    '''
    selected = []
    for feature in list_features:
        if abs(feature['rtime'] - rt_ref) <= rt_stdev:
            selected.append(feature)
    return selected

def get_khipus_in_rtwindow(list_khipus, rt_ref, rt_stdev):
    '''Select the khipus whose retention time falls inside a window.

    The retention time of a khipu is read from the first entry of its
    'MS1_pseudo_Spectra' list.

    list_khipus: khipus to filter
    rt_ref: retention time at the center of the window
    rt_stdev: half width of the window (the bounds are inclusive)

    return: list of the khipus inside the window, in their original order
    '''
    selected = []
    for khipu in list_khipus:
        khipu_rtime = khipu['MS1_pseudo_Spectra'][0]['rtime']
        if abs(khipu_rtime - rt_ref) <= rt_stdev:
            selected.append(khipu)
    return selected

## Extract Khipus and Features from Example Dataset

In [42]:
def get_comprehensive_stats_per_dataset(full_table_path, rt_tolerance, isotope_search_patterns, adduct_search_patterns, ion_mode):
    '''construct khipus from given features and related information

    full_table_path: the path to the full feature table of current dataset
    rt_tolerance: the retention time tolerance of current dataset. The tolerance is generated in step2 in elution_parameters_45studies.tsv
    isotope_search_patterns: isotope patterns forwarded to peaklist_to_khipu_list
    adduct_search_patterns: adduct patterns forwarded to peaklist_to_khipu_list
    ion_mode: the ionization mode of current dataset

    returns:
    list_khipus: khipu list from given feature list, after export_empCpd_khipu_list
    all_assigned_fids: feature id list of the features being used in khipus
    list_features: all feature list
    '''
    # Read the table through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(full_table_path) as table_file:
        table_text = table_file.read()

    # Anything the parsing / khipu construction prints is captured and discarded.
    with contextlib.redirect_stdout(io.StringIO()):
        _n, list_features = read_features_from_asari_table(table_text)

        # Each feature gets 'representative_intensity' copied from its 'peak_area'
        # before khipu construction.
        for f in list_features:
            f['representative_intensity'] = f['peak_area']
        list_khipus, all_assigned_fids = peaklist_to_khipu_list(
                                list_features,
                                isotope_search_patterns=isotope_search_patterns,
                                adduct_search_patterns=adduct_search_patterns,
                                extended_adducts=[],    # not to confuse later analysis of ISF
                                mz_tolerance_ppm=5,
                                rt_tolerance=rt_tolerance,
                                mode=ion_mode,
                                charges=[1, 2, 3],
                                )
    return export_empCpd_khipu_list(list_khipus), all_assigned_fids, list_features


In [43]:
list_khipus, all_assigned_fids, list_features = get_comprehensive_stats_per_dataset(
    f'../input_data_orbi/{dataset_name}/full_feature_table.tsv', rtime_tolerance,
    isp_pos, asp_pos, 'pos')

# Here singletons mean the features that are not used in khipu assembling.
# The assigned ids go into a set so each membership test is O(1) rather than
# a scan over the whole id list for every feature.
assigned_fid_set = set(all_assigned_fids)
remaining_features = [f for f in list_features if f['id'] not in assigned_fid_set]

# Representative features = every singleton plus the M0 feature of each khipu.
all_representative_features = remaining_features + [
    get_M0(k['MS1_pseudo_Spectra']) for k in list_khipus
]

print(f'From {dataset_name}, we got {len(list_khipus)} khipus, {len(remaining_features)} singletons, '
      f'{len(list_features)} features, {len(all_representative_features)} representative features.')



From ST001237_HILICpos_B2_ppm5_3524314, we got 6254 khipus, 42823 singletons, 57875 features, 49077 representative features.


The representative features consist of all singletons plus, for each khipu, only its M0 feature.

## Use Frequent MZ Delta to Explain 

In [44]:
# Raw tab-separated table of candidate mass deltas, one row per line.
# Only the first column (the delta value) is parsed as a number by the code
# that follows; the other columns are kept as text. They presumably hold an
# occurrence count, the delta again and an annotation -- TODO confirm against
# the step that produced this table.
candidate_fragments = '''14.0155	792	14.0155	['14.01565', "± CH2, alkane chains, waxes, fatty acids, methylation; or '-[C3H6ON] <-> -[C2H4ON], acrylamide versus iodoacetamide in cysteine alkylation (gels)", "{'C': 1, 'H': 2}"]
18.0104	780	18.0104	['-18.010565', 'H2O', "{'H': -2, 'O': -1}"]
2.0156	634	2.0156	['2.01565', '± 2H, opening or forming of double bond', "{'H': 2}"]
28.0312	550	28.0312	['28.0313', '± C2H4, natural alkane chains such as fatty acids', "{'C': 2, 'H': 4}"]
15.9948	420	15.9948	['15.99492', '± O, e.g. oxidation/reduction', "{'O': 1}"]
17.0264	404	17.0264	['-17.026549', 'NH3', "{'N': -1, 'H': -3}"]
26.0155	392	26.0155	[' C2H2']
27.9948	385	27.9948	['27.99492', '± CO', "{'C': 1, 'O': 1}"]
32.026	311	32.0261	['32.026215', 'MeOH', "{'C': 1, 'H': 4, 'O': 1}"]
42.0104	301	42.0104	['42.01057', '± COCH2', "{'C': 2, 'O': 1, 'H': 2}"]
67.9872	295	67.9873	['67.987424', 'NaCOOH', "{'C': 1, 'O': 2, 'Na': 1, 'H': 1}"]
13.9791	287	13.9791	['13.97927', 'O <-> 2H, e.g. Oxidation follwed by H2O elimination', "{'H': -2, 'O': 1}"]
42.0468	278	42.0468	['42.04695', '± C3H6, propylation', "{'C': 3, 'H': 6}"]
46.0053	277	46.0053	['-46.005479', 'H2O+CO', "{'C': -1, 'H': -2, 'O': -2}"]
'''
# Turn the raw text into a list of (delta as float, original row text) pairs;
# the delta is the first whitespace-separated token of each row.
candidate_fragments = [
    (float(row.split(maxsplit=1)[0]), row)
    for row in candidate_fragments.splitlines()
]
# Show the number of parsed rows and one example entry.
len(candidate_fragments), candidate_fragments[3]

(14,
 (28.0312,
  '28.0312\t550\t28.0312\t[\'28.0313\', \'± C2H4, natural alkane chains such as fatty acids\', "{\'C\': 2, \'H\': 4}"]'))

In [45]:
# Order the khipus from the heaviest to the lightest neutral mass.
# sorted() builds a new list, which is then rebound to the same name.
list_khipus = sorted(
    list_khipus,
    key=lambda khipu: khipu['neutral_formula_mass'],
    reverse=True,
)

In [50]:

def explain_a_dataset_by_mz_deltas(list_khipus, remaining_features, isf_candidate_fragments, rt_stdev=0.613):
    '''map the pairwise mass distance of the khipus and 'free' features to isf_candidate_fragments

    Every khipu is taken in turn as the reference. It is compared with the
    khipus that come after it in list_khipus (difference of
    'neutral_formula_mass') and with every feature of remaining_features
    (difference between the m/z of the reference's M0 feature and the feature
    m/z), restricted to those inside the retention time window of the reference.
    Only positive differences (reference minus other) can match, so the list is
    expected to be ordered from the heaviest to the lightest khipu, as the
    notebook does before calling this function.

    list_khipus: list of khipus
    remaining_features: list of orphan features
    isf_candidate_fragments: list of the most frequent delta mass values
    rt_stdev: half window of rt tolerance

    return:
    explained_khipu_ids: list of explained khipu ids (may contain duplicates)
    explained_feature_ids: list of explained feature ids (may contain duplicates)
    delta_values_used: list of (delta value, explained id, rtime difference) tuples
    '''
    def mz_delta_in_list(mz, mlist, max_diff=0.0005, ppm=5):
        '''check if the given mz value is in given mz list(meet the requirement of certain number or ppm)

        mz: a float number of m/z delta value
        mlist: a list of m/z value
        max_diff: fixed number tolerance
        ppm: ppm tolerance

        return: a boolean value indicating if the given mz sits in the window of any mz value in the given mlist.
        '''
        # Deltas that are not above max_diff (zero, negative, tiny) never match,
        # and an empty reference list has nothing to match against.
        if not (mz > max_diff) or len(mlist) == 0:
            return False
        # Only the closest reference value matters; no need to sort the whole list.
        closest = min(abs(x - mz) for x in mlist)
        return closest <= max_diff or closest / mz < ppm * 1e-6

    explained_khipu_ids, explained_feature_ids, delta_values_used = [], [], []

    # Iterate through every khipu, the last one included: it has no later khipu
    # to be compared with, but it still has to be compared with the features.
    for ii, khipu in enumerate(list_khipus):
        spectra = khipu['MS1_pseudo_Spectra']
        # rtime of current khipu, taken from the first entry of its spectra
        rt_ref = spectra[0]['rtime']
        # mz of M0 feature in current khipu
        base_mz = get_M0(spectra)['mz']

        # Later khipus in the list (lighter ones when the list is sorted
        # heaviest first) that sit in the rtime window of the current khipu.
        khipus_in_rtwindow = get_khipus_in_rtwindow(
            list_khipus[ii+1:],
            rt_ref,
            rt_stdev)
        for k in khipus_in_rtwindow:
            _d = khipu['neutral_formula_mass'] - k['neutral_formula_mass']
            if mz_delta_in_list(_d, isf_candidate_fragments):
                explained_khipu_ids.append(k['interim_id'])
                delta_values_used.append((_d, k['interim_id'], rt_ref - get_M0(k['MS1_pseudo_Spectra'])['rtime']))

        # Orphan features in the rtime window whose delta to the current M0
        # matches one of the candidate fragments.
        features_in_rtwindow = get_features_in_rtwindow(
            remaining_features,
            rt_ref,
            rt_stdev)
        for f in features_in_rtwindow:
            _d = base_mz - f['mz']
            if mz_delta_in_list(_d, isf_candidate_fragments):
                explained_feature_ids.append(f['id'])
                delta_values_used.append((_d, f['id'], rt_ref - f['rtime']))

    return explained_khipu_ids, explained_feature_ids, delta_values_used

In [51]:
# Keep only the numeric delta of every candidate fragment entry.
isf_candidate_fragments = [fragment[0] for fragment in candidate_fragments]

# Use the retention time tolerance configured for this dataset rather than
# relying on the function's built-in default.
explained_khipu_ids, explained_feature_ids, delta_values_used = explain_a_dataset_by_mz_deltas(
    list_khipus,
    remaining_features,
    isf_candidate_fragments,
    rt_stdev=rtime_tolerance,
)
# The id lists may contain duplicates, so unique ids are counted.
print(f'{len(set(explained_khipu_ids))} khipus are explained, {len(set(explained_feature_ids))} features are explained.')

784 khipus are explained, 1045 features are explained.


## Use MoNA to Explain

In [52]:
from matchms.importing import load_from_msp
from mass2chem.search import build_centurion_tree, find_all_matches_centurion_indexed_list

from asari.tools import match_features as mf
from asari.mass_functions import mass_paired_mapping, all_mass_paired_mapping
import tqdm
import logging
logging.getLogger("matchms").setLevel(logging.ERROR)


### Positive Ion Mode

In [53]:
# from JM
# from JM
path = '../MoNA_MS2/filtered_MoNA-export-LC-MS-MS_Positive_Mode.msp'

# Group the MoNA spectra by InChIKey. Spectra with a missing or empty InChIKey
# are skipped; dict.get is used for the lookup so that no blanket `except` is
# needed and unrelated errors (or a keyboard interrupt) are not silently hidden.
spectral_registry = {}
total = 0
for spectrum in tqdm.tqdm(load_from_msp(path)):
    inchikey = spectrum.metadata_dict().get('inchikey')
    if inchikey:
        spectral_registry.setdefault(inchikey, []).append(spectrum)
        total += 1

print("filtered MS2 Size: ", str(len(spectral_registry)), " Compounds with ", str(total), " MS2 Spectra")

13973it [00:05, 2739.54it/s]

filtered MS2 Size:  13973  Compounds with  13973  MS2 Spectra





In [54]:
def extract_ms2_spectrum(sp, N=5):
    '''Summarize an MS2 spectrum as precursor, name and its N strongest fragment peaks.

    sp: spectrum to summarize; its metadata must contain 'precursor_mz' and
        'compound_name' (a KeyError is raised otherwise)
    N: number of (intensity, mz) pairs to keep

    Peaks at or above (precursor mz - 0.01) and peaks whose intensity is not
    above 0.1 are dropped before the strongest N are picked.

    return: (precursor mz, compound name, (intensity, mz) list). Ex.
            (248.0585,
            'Forchlorfenuron',
            [(100.0, 111.0553),
            (95.390586, 129.0214),
            (17.257158, 93.0448),
            (11.163815, 155.0007),
            (10.442742, 137.0346)])
    '''
    metadata = sp.metadata_dict()
    precursor_mz = metadata['precursor_mz']
    compound_name = metadata['compound_name']

    kept_peaks = []
    for intensity, mz in zip(sp.peaks.intensities, sp.peaks.mz):
        # excluding the precursor itself and small peaks
        if mz < precursor_mz - 0.01 and intensity > 0.1:
            kept_peaks.append((intensity, mz))

    # strongest first; ties on intensity are ordered by descending mz
    kept_peaks.sort(reverse=True)
    return precursor_mz, compound_name, kept_peaks[:N]


# ms2List holds the usable MoNA MS/MS compounds: one summary per registry entry,
# built from the first spectrum stored for that entry. Spectra whose metadata
# lacks a required key are collected in no_precursor instead.
ms2List, no_precursor = [], []
for registered_spectra in spectral_registry.values():
    first_spectrum = registered_spectra[0]
    try:
        ms2List.append(extract_ms2_spectrum(first_spectrum))
    except KeyError:
        no_precursor.append(first_spectrum)

print(f'{len(ms2List)} spectra are found with precursors.')

precursor_mzs = [entry[0] for entry in ms2List]
M0_khipu_feature_mzs = [get_M0(khipu['MS1_pseudo_Spectra'])['mz'] for khipu in list_khipus]
# remaining feature also refers to singletons
remaining_feature_mzs = [feature['mz'] for feature in remaining_features]

# How many precursors match M0 khipu features / singletons (5 ppm);
# only the first element returned by all_mass_paired_mapping is kept.
precursor_M0_map_list = all_mass_paired_mapping(precursor_mzs, M0_khipu_feature_mzs, std_ppm=5)[0]
precursor_remaining_feature_list = all_mass_paired_mapping(precursor_mzs, remaining_feature_mzs, std_ppm=5)[0]

# Reported fraction: distinct second elements of the mapped pairs divided by
# the number of distinct m/z values on the feature side.
m0_mapped_fraction = (
    len({pair[1] for pair in precursor_M0_map_list}) / len(set(M0_khipu_feature_mzs))
)
singleton_mapped_fraction = (
    len({pair[1] for pair in precursor_remaining_feature_list}) / len(set(remaining_feature_mzs))
)

print(f'{len(precursor_M0_map_list)} ({m0_mapped_fraction}) M0 khipu features map with precursors; \n'
      f'{len(precursor_remaining_feature_list)} ({singleton_mapped_fraction}) features outside khipus(singletons) map with precursors.')

13670 spectra are found with precursors.
3290 (0.3053053053053053) M0 khipu features map with precursors; 
15378 (0.44517194957777506) features outside khipus(singletons) map with precursors.


In [72]:
def find_match_ms2_from_mzs_in_rtbin(mzs_in_rtbin, ms2_fragments, limit_ppm=5):
    '''Match MS2 fragment m/z values against the features of one retention time bin.

    mzs_in_rtbin: list of features in a certain rt bin; each one is read through
                  its 'mz' and 'id' keys
    ms2_fragments: list of fragment m/z values. Ex. [104.05261, 56.04967, 133.03153]
    limit_ppm: tolerance in ppm, relative to the fragment m/z

    note: ms2_precursor and khipu M0 should already match before this.

    return: list of (fragment mz, feature id) tuples, one per matching pair,
            ordered by feature first and fragment second; empty when either
            input is empty
    '''
    if not (mzs_in_rtbin and ms2_fragments):
        return []
    return [
        (fragment_mz, feature['id'])
        for feature in mzs_in_rtbin
        for fragment_mz in ms2_fragments
        if abs(fragment_mz - feature['mz']) < 0.000001*limit_ppm*fragment_mz
    ]

def find_match_ms2_from_feature(all_representative_features, feature, rt_half_width, ms2_tree, limit_ppm=5):
    ''' Given a feature, find the ms2 values matched with features in the same rtime window.

    all_representative_features: list of all features, from which we are looking for features in rtwindow
    feature: the given feature we want to find matched ms2 mzs; read through its
             'mz', 'rtime' and 'id_number' keys
    rt_half_width: the radius we used for selecting features in rtwindow
    ms2_tree: the centurion tree built from MoNA positive MS2 spectra
    limit_ppm: ppm tolerance, applied consistently to the upper m/z bound of the
               window, to the precursor lookup and to the fragment matching

    return list of tuple (feature id_number, matched precursor entry, matches),
    one per precursor entry with at least one match, where matches is the list
    returned by find_match_ms2_from_mzs_in_rtbin. Ex,
    [('F49362',
        {'mz': 655.457,
        'rtime': 0,
        'name': 'phorbol-12,13-didecanoate',
        'peaks': [(100.0, 311.164459),
            (55.231621, 199.111877),
            (49.650773, 293.153961),
            (39.600037, 107.086075),
            (31.224962, 265.158875)]},
        [(265.158875, 'F101')])]
    '''
    in_rtwindow = get_features_in_rtwindow(all_representative_features, feature['rtime'], rt_half_width)
    # keep the co-eluting features that are not heavier than the given feature (within tolerance)
    mz_upper_bound = feature['mz']*(1+limit_ppm*1e-6)
    in_rtwindow = [f for f in in_rtwindow if f['mz'] < mz_upper_bound]

    matched = []
    # MS2 entries whose precursor matches the given feature; the caller's
    # tolerance is honored here instead of a hard-coded 5 ppm
    matched_precursors = find_all_matches_centurion_indexed_list(feature['mz'], ms2_tree, limit_ppm=limit_ppm)
    for p in matched_precursors:
        fragment_mzs = [peak[1] for peak in p['peaks']]  # the mz list under one precursor
        matched_ms2_mzs = find_match_ms2_from_mzs_in_rtbin(in_rtwindow, fragment_mzs, limit_ppm)
        if matched_ms2_mzs:
            matched.append((feature['id_number'], p, matched_ms2_mzs))

    return matched

In [73]:
# build centurion tree accelerating the mapping
# build centurion tree accelerating the mapping; every MoNA entry is indexed by
# its precursor mz, with rtime fixed to 0
ms2_tree = build_centurion_tree([{'mz': x[0], 'rtime': 0, 'name': x[1], 'peaks': x[2]} for x in ms2List])

# Look up MS2 matches for the M0 feature of every khipu, using the dataset's
# configured retention time tolerance instead of a repeated literal.
m0_features = [get_M0(k['MS1_pseudo_Spectra']) for k in list_khipus]
matched_m0_features = []
for m0_feature in m0_features:
    matched_m0_features.extend(
        find_match_ms2_from_feature(all_representative_features, m0_feature, rtime_tolerance, ms2_tree)
    )

In [74]:
# Look up MS2 matches for every singleton feature, using the dataset's
# configured retention time tolerance instead of a repeated literal.
matched_remaining_features = []
for remain_f in remaining_features:
    matched_remaining_features.extend(
        find_match_ms2_from_feature(all_representative_features, remain_f, rtime_tolerance, ms2_tree)
    )

In [75]:
# Pool the matches found for khipu M0 features and for singletons; the third
# element of every match tuple is its list of matched MS2 peaks.
all_matched_precursors = matched_m0_features + matched_remaining_features
_matched_ms2_peaks = []
for m in all_matched_precursors:
    _matched_ms2_peaks += m[2]

print(f'{len(set(_matched_ms2_peaks))} ms2 peaks are matched on any of the ms1 peaks, out of {len(all_matched_precursors)} precursors')
print(f'This dataset has {len(all_representative_features)} features and {len(list_khipus)} khipus.')
# Report the actual number of usable MoNA compounds rather than a typed-in count.
print(f'Using {len(ms2List)} cpds from MoNA.')

1243 ms2 peaks are matched on any of the ms1 peaks, out of 1057 precursors
This dataset has 49077 features and 6254 khipus.
Using 13670 cpds from MoNA.
