# Extract the MS2 scan and do signature neutral loss search
- run multiple files
- need to define a precursor target list & scan range (let's just try to do more inclusive one)

In [12]:
# !pip install pymzml
%matplotlib inline

In [13]:
import numpy as np
from matplotlib import pyplot as plt
import pymzml
import os
import pandas as pd
import sys

In [14]:
sys.path.append('../pyutils/')  # add the second parent directory
from searchMS2 import *

In [15]:
# spectrum.selected_precursors
# [{'mz': 417.320373535156, 'charge': 1, 'precursor id': '2'}]

In [16]:
dir_path = "../../../../MS_MS_raw_mzML/AX_RPneg_TDcells/mzML/"
files_and_dirs = os.listdir(dir_path)

# Get the full path for each file/directory in the list
infiles = [os.path.join(dir_path, file) for file in files_and_dirs]

In [17]:
exps = []
for infile in infiles:
    exps.append(pymzml.run.Reader(infile))



## Basic information - using the first instance

In [18]:
exp = pymzml.run.Reader(infiles[0])



In [19]:
# How many spectra
exp.get_spectrum_count()

4008

In [20]:
exp.ms_precisions # ms-level 1: ppm 5; ms-level 2: ppm 20?

{None: 0.0001, 0: 0.0001, 1: 5e-06, 2: 2e-05}

##### summary
- a probably 12 min run (720 secs) for 4008 scans
- 0.17964 second/scan

## Load target list

In [21]:
df_sel = pd.read_csv("./data/input/targeted_extraction__Feature_table.tsv", sep = '\t')

In [22]:
df_sel.head(10)

Unnamed: 0,query_target,id_number,mz,rtime,rtime_left_base,rtime_right_base,parent_masstrack_id,peak_area,cSelectivity,goodness_fitting,...,MT_20230308_012,MT_20230308_014,MT_20230308_016,MT_20230308_018,MT_20230308_020,MT_20230308_022,MT_20230308_024,MT_20230308_026,MT_20230308_028,MT_20230308_030
0,482.288282,F1614,482.289,207.04,204.92,209.41,1335,4365548,0.88,0.85,...,151268,66061,548649,448802,198916,441632,844487,167207,43418,395143
1,498.283197,F3712,498.2843,33.72,32.23,36.32,1567,18072147,1.0,0.94,...,702748,621143,2266499,942310,1587450,1069759,1205834,0,0,1426358
2,524.298847,F1779,524.2982,96.47,93.85,99.13,1928,36918657,0.76,0.97,...,2827464,2817111,2754368,2156389,2912621,2697065,2971851,3236482,2724970,3001170
3,538.350882,F3031,538.3514,107.59,104.85,110.34,2126,30389790,0.92,0.98,...,0,0,0,0,0,0,0,0,0,0
4,540.330147,F3138,540.3305,70.18,67.44,72.92,2147,29371349,0.95,0.98,...,2352009,2308541,0,0,0,0,0,2322796,2729822,0
5,552.330147,F4105,552.3305,61.26,58.52,64.01,2291,54801491,1.0,0.99,...,0,0,0,0,0,0,0,0,0,0
6,554.345797,F4190,554.3468,82.07,79.32,84.58,2315,218470611,0.58,0.95,...,0,0,0,0,0,0,0,0,0,0
7,566.309412,F5441,566.3086,162.45,159.81,165.13,2486,11584313,1.0,0.99,...,0,0,2694084,0,1992950,0,1742897,0,0,2018356
8,568.361447,F5580,568.3622,113.77,111.02,116.47,2517,32860249,0.93,0.98,...,0,0,0,0,0,0,0,0,0,0
9,572.298847,F5983,572.2993,217.45,214.85,219.84,2580,25336616,1.0,0.94,...,1290411,1450712,3675461,1596170,2193715,1389912,2101354,0,1530704,2716537


In [23]:
FTIDs = df_sel['id_number'].tolist()

In [24]:
df_feat_annot = pd.read_csv("../../../../asari-output-RPneg/output_asari_project_31312361/Feature_annotation.tsv",
                           sep = '\t', index_col=0)

In [25]:
# Select the desired rows and columns from the DataFrame
subset = df_feat_annot.loc[FTIDs, ['mz', 'rtime', 'apex(scan number)']]
# drop duplicates
subset_dd = subset.drop_duplicates() 
# Convert the subset to a dictionary with row-wise orientation
target_dict = subset_dd.to_dict(orient='index')

In [26]:
target_dict

{'F1614': {'mz': 482.289032, 'rtime': 207.0361458, 'apex(scan number)': 930},
 'F3712': {'mz': 498.2842712, 'rtime': 33.7166375, 'apex(scan number)': 151},
 'F1779': {'mz': 524.2982025, 'rtime': 96.46787131, 'apex(scan number)': 429},
 'F3031': {'mz': 538.3514099, 'rtime': 107.5941878, 'apex(scan number)': 478},
 'F3138': {'mz': 540.3304749, 'rtime': 70.18314622, 'apex(scan number)': 312},
 'F4105': {'mz': 552.3305359, 'rtime': 61.26445373, 'apex(scan number)': 273},
 'F4190': {'mz': 554.3468018, 'rtime': 82.06768434, 'apex(scan number)': 364},
 'F5441': {'mz': 566.3086395, 'rtime': 162.4463086, 'apex(scan number)': 723},
 'F5580': {'mz': 568.3622437, 'rtime': 113.7691743, 'apex(scan number)': 505},
 'F5983': {'mz': 572.2992706, 'rtime': 217.4495763, 'apex(scan number)': 978},
 'F6287': {'mz': 576.3304749, 'rtime': 53.03482256, 'apex(scan number)': 237},
 'F6579': {'mz': 578.3467102, 'rtime': 69.72456917, 'apex(scan number)': 310},
 'F6913': {'mz': 580.3623352, 'rtime': 89.28646261, 'a

In [27]:
list(target_dict.items())[0]

('F1614', {'mz': 482.289032, 'rtime': 207.0361458, 'apex(scan number)': 930})

## get potential precursor from multiple files

In [28]:
res_dict = {}
for exp in exps:
    for k,v in target_dict.items():
        spectra = get_potental_precursor_from_exp(exp, 
                              min_scan_number = v['apex(scan number)']-1000, 
                              max_scan_number = v['apex(scan number)']+1000, 
                              min_mz = np.floor(v['mz']), 
                              max_mz = np.ceil(v['mz']), 
                              ms_level=2)
        res_dict[k] = spectra



In [29]:
res_dict

{'F1614': [],
 'F3712': [],
 'F1779': [],
 'F3031': [],
 'F3138': [<__main__.Spectrum object with native ID 406 at 0x7f8975f210a0>,
  <__main__.Spectrum object with native ID 408 at 0x7f8975f211f0>],
 'F4105': [],
 'F4190': [],
 'F5441': [],
 'F5580': [],
 'F5983': [],
 'F6287': [],
 'F6579': [],
 'F6913': [],
 'F7086': [],
 'F6328': [<__main__.Spectrum object with native ID 1411 at 0x7f8975f34bb0>,
  <__main__.Spectrum object with native ID 1413 at 0x7f8976137b20>],
 'F7618': [],
 'F5337': [],
 'F5343': [<__main__.Spectrum object with native ID 1475 at 0x7f8975f487f0>,
  <__main__.Spectrum object with native ID 1477 at 0x7f8976116c40>],
 'F7394': [],
 'F9383': [],
 'F10088': [<__main__.Spectrum object with native ID 1041 at 0x7f8976116a90>,
  <__main__.Spectrum object with native ID 1043 at 0x7f8976116640>,
  <__main__.Spectrum object with native ID 1250 at 0x7f8975f34b80>,
  <__main__.Spectrum object with native ID 1252 at 0x7f8976129d90>,
  <__main__.Spectrum object with native ID 1

In [30]:
# items that don't have spectra associated
[k for k,v in res_dict.items() if len(v) == 0 ]

['F1614',
 'F3712',
 'F1779',
 'F3031',
 'F4105',
 'F4190',
 'F5441',
 'F5580',
 'F5983',
 'F6287',
 'F6579',
 'F6913',
 'F7086',
 'F7618',
 'F5337',
 'F7394',
 'F9383',
 'F9945',
 'F11972',
 'F12271',
 'F12819',
 'F9458',
 'F9861',
 'F11304',
 'F11681',
 'F11849',
 'F12086',
 'F12928',
 'F13352',
 'F13368',
 'F9678',
 'F9994',
 'F10628',
 'F10649',
 'F10790',
 'F10809',
 'F11006',
 'F11498',
 'F11746',
 'F12027',
 'F12261',
 'F12389',
 'F12441',
 'F12902',
 'F13048',
 'F13079',
 'F13488',
 'F12900',
 'F12910',
 'F13113',
 'F13804',
 'F14097',
 'F14607',
 'F14831',
 'F14998',
 'F15014',
 'F15086',
 'F15244',
 'F15257',
 'F15440',
 'F15464',
 'F15666',
 'F15734',
 'F16070',
 'F16127',
 'F16286',
 'F16333',
 'F16346',
 'F16535',
 'F16561',
 'F17105',
 'F17235',
 'F17396',
 'F17530',
 'F13805',
 'F13961',
 'F14243',
 'F14588',
 'F15491',
 'F15637',
 'F15865',
 'F16120']

In [31]:
res_dict_clean = {k:v for k,v in res_dict.items() if len(v) != 0}

In [32]:
res_dict_clean

{'F3138': [<__main__.Spectrum object with native ID 406 at 0x7f8975f210a0>,
  <__main__.Spectrum object with native ID 408 at 0x7f8975f211f0>],
 'F6328': [<__main__.Spectrum object with native ID 1411 at 0x7f8975f34bb0>,
  <__main__.Spectrum object with native ID 1413 at 0x7f8976137b20>],
 'F5343': [<__main__.Spectrum object with native ID 1475 at 0x7f8975f487f0>,
  <__main__.Spectrum object with native ID 1477 at 0x7f8976116c40>],
 'F10088': [<__main__.Spectrum object with native ID 1041 at 0x7f8976116a90>,
  <__main__.Spectrum object with native ID 1043 at 0x7f8976116640>,
  <__main__.Spectrum object with native ID 1250 at 0x7f8975f34b80>,
  <__main__.Spectrum object with native ID 1252 at 0x7f8976129d90>,
  <__main__.Spectrum object with native ID 1290 at 0x7f8975f34760>,
  <__main__.Spectrum object with native ID 1292 at 0x7f8976129ee0>,
  <__main__.Spectrum object with native ID 1371 at 0x7f8975f21520>,
  <__main__.Spectrum object with native ID 1373 at 0x7f8975f34f10>],
 'F12237'

In [33]:
# this function only works for situation where you look at charge state = 1
def search_NL(spectra,
              NL_mz = 87.03124,
              ppm = 50):
    res_data = []

    for spec in spectra:
        selected_precursor_mz = spec.selected_precursors[0]['mz'] # this m/z will not be exactly the precursor m/z
        sel_mz = [mz for mz in spec.mz if abs(mz - selected_precursor_mz) < np.ceil(NL_mz)]

        for mz in sel_mz:
            calc_ppm = abs(((selected_precursor_mz - mz) - NL_mz)*1000000/NL_mz)
            if calc_ppm < ppm:
                print(calc_ppm)
                res_data.append(spec)
                break
    return res_data

In [34]:
res_dict = {}
for FTID, spectra in res_dict_clean.items():
    res_dict.update({FTID: search_NL(spectra)})

26.063064038620475
26.764365734999846
22.55655555672363
23.257857253102998


In [35]:
res_dict

{'F3138': [],
 'F6328': [],
 'F5343': [],
 'F10088': [],
 'F12237': [],
 'F10486': [],
 'F10386': [],
 'F11014': [],
 'F12299': [],
 'F13180': [],
 'F13184': [],
 'F13523': [],
 'F13545': [],
 'F9693': [],
 'F10004': [],
 'F10378': [],
 'F10998': [],
 'F12641': [],
 'F13395': [],
 'F13638': [],
 'F13658': [],
 'F13132': [],
 'F13136': [],
 'F13468': [<__main__.Spectrum object with native ID 1685 at 0x7f8976129d00>,
  <__main__.Spectrum object with native ID 1687 at 0x7f8975f34af0>,
  <__main__.Spectrum object with native ID 1705 at 0x7f8975f482b0>,
  <__main__.Spectrum object with native ID 1707 at 0x7f8976137e20>],
 'F14354': [],
 'F16926': []}

# plot accordingly

In [36]:
res_dict_clean.keys()

dict_keys(['F3138', 'F6328', 'F5343', 'F10088', 'F12237', 'F10486', 'F10386', 'F11014', 'F12299', 'F13180', 'F13184', 'F13523', 'F13545', 'F9693', 'F10004', 'F10378', 'F10998', 'F12641', 'F13395', 'F13638', 'F13658', 'F13132', 'F13136', 'F13468', 'F14354', 'F16926'])

In [37]:
df_sel.index = df_sel['id_number']

In [38]:
df_sel.columns[0:6]

Index(['query_target', 'id_number', 'mz', 'rtime', 'rtime_left_base',
       'rtime_right_base'],
      dtype='object')

In [39]:
for FTID in res_dict_clean.keys():
    plot_spectra(spectra = res_dict_clean[FTID], 
                 save_figure=True,
                 output_path=f"./data/output/{FTID}_{df_sel.loc[FTID,'Species.Shorthand']}/",
                 label = f"{FTID}_{df_sel.loc[FTID,'Species.Shorthand']}"
                 )

KeyError: 'Species.Shorthand'