# Experiment: Varying N in top-N DDA fragmentation

We demonstrate that the simulator can be used for scan-level closed-loop DDA experiments. 
- Take an existing dataset. Find out which MS1 peaks are linked to which MS2 peaks.
- Run all MS1 peaks through the simulator’s Top-N protocol. 
- For the top-100 most intense MS1 peaks, how many got fragmented in the simulator as we change N?
- If N is greater than the value used for the real data, do we see the same MS1 peaks from the first step being fragmented again, plus additional fragment peaks?
- Verification on actual machine.
- Talk to Stefan about machine time.

In [None]:
%matplotlib inline

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import sys
import os
import scipy.stats
import pylab as plt
from IPython import display
import pylab as plt
from random import random, shuffle
from joblib import Parallel, delayed
import multiprocessing

In [None]:
# Put ../codes (relative to the notebook's working directory) on the import
# path; the VMSfunctions imports in the next cell presumably resolve from there.
sys.path.append('../codes')

In [None]:
from VMSfunctions.Chemicals import *
from VMSfunctions.Chromatograms import *
from VMSfunctions.MassSpec import *
from VMSfunctions.Controller import *
from VMSfunctions.Common import *
from VMSfunctions.DataGenerator import *
from VMSfunctions.Noise import *

In [None]:
# Select the project's debug log level while the inputs are loaded. The helper
# comes from one of the VMSfunctions star imports; its exact effect is not
# visible in this notebook.
set_log_level_debug()

### Load densities trained on beer1pos data (see [loader_kde](loader_kde.ipynb)).

In [None]:
# Machine-specific root folder of the shared project data; edit this for your
# own machine. The trailing backslash matters because the next cell builds its
# path by plain string concatenation onto base_dir.
base_dir = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\'

In [None]:
# Load the stored peak sampler; its density_estimator attribute is later handed
# to the simulated mass spectrometer.
# NOTE(review): the .p suffix suggests a pickle file - only load it from a
# trusted location.
ps = load_obj(base_dir + 'Trained Models\\peak_sampler_beer1pos_fragmentation.p')

### Load prepared data (BEER1POS)

In [None]:
# Two stored versions of the prepared input data. Only noisy_dataset is used by
# the experiment settings further down; dataset is loaded but not referenced
# again in the visible cells.
dataset = load_obj('../models/dda_results/dataset.p')
noisy_dataset = load_obj('../models/dda_results/noisy_dataset.p')

### Experiment by varying N and rt_tol

Don't print so much from the controller when running

In [None]:
# Use the warning log level for the batch runs; swap in one of the
# commented-out calls below to get more verbose output again.
set_log_level_warning()
# set_log_level_info()
# set_log_level_debug()

#### Run serially

In [None]:
def run_serial_experiment(param, i, total):
    """Run one Top-N simulation in this process unless its outputs already exist.

    The run is named after ``param['N']`` and ``param['rt_tol']``. If both the
    mzML file and the pickled controller for that name are already present in
    ``../models/dda_results``, the run is skipped.

    Args:
        param: dict of settings for one run (see ``get_params`` for the keys).
        i: position of this run in the batch; used only in the progress message.
        total: number of runs in the batch; used only in the progress message.
    """
    run_name = 'experiment_N_%d_rttol_%d' % (param['N'], param['rt_tol'])
    mzml_path = '../models/dda_results/%s.mzML' % run_name
    pickle_path = '../models/dda_results/%s.p' % run_name

    # Guard clause: nothing to do when both output files are on disk.
    already_done = os.path.isfile(mzml_path) and os.path.isfile(pickle_path)
    if already_done:
        print('Skipping\t%d/%d\t%s' % (i, total, run_name))
        return

    print('Processing\t%d/%d\t%s' % (i, total, run_name))
    spectrometer = IndependentMassSpectrometer(
        param['ionisation_mode'], param['data'], density=param['density'])
    top_n = TopNController(
        spectrometer,
        param['N'],
        param['isolation_window'],
        param['rt_tol'],
        param['min_ms1_intensity'],
    )
    top_n.run(param['min_rt'], param['max_rt'], progress_bar=True)

    # Persist both the mzML export and the controller object itself.
    top_n.write_mzML(run_name, mzml_path)
    save_obj(top_n, pickle_path)

#### Run in parallel

In [None]:
def run_par(param):
    """Run one Top-N simulation on a parallel engine unless its outputs exist.

    This is the per-task function mapped over the parameter list by
    ``run_parallel_experiment``. The run is named after ``param['N']`` and
    ``param['rt_tol']``. If both the mzML file and the pickled controller for
    that name are already present in ``../models/dda_results``, the run is
    skipped.

    Args:
        param: dict of settings for one run (see ``get_params`` for the keys).

    Returns:
        The analysis name when the simulation was run, or ``None`` when it was
        skipped because both output files already exist.
    """
    analysis_name = 'experiment_N_%d_rttol_%d' % (param['N'], param['rt_tol'])
    mzml_out = '../models/dda_results/%s.mzML' % analysis_name
    pickle_out = '../models/dda_results/%s.p' % analysis_name

    if os.path.isfile(mzml_out) and os.path.isfile(pickle_out):
        print('Skipping %s' % (analysis_name))
        return None

    print('Processing %s' % (analysis_name))
    mass_spec = IndependentMassSpectrometer(param['ionisation_mode'], param['data'], density=param['density'])

    # Forward min_ms1_intensity the same way run_serial_experiment does, so a
    # parallel run and a serial run that share an output file name also share
    # their settings. Parameter dicts without the key keep the old call shape.
    extra_args = (param['min_ms1_intensity'],) if 'min_ms1_intensity' in param else ()
    controller = TopNController(mass_spec, param['N'], param['isolation_window'], param['rt_tol'],
                                *extra_args)
    controller.run(param['min_rt'], param['max_rt'], progress_bar=False)
    controller.write_mzML(analysis_name, mzml_out)
    save_obj(controller, pickle_out)
    return analysis_name

In [None]:
def run_parallel_experiment(params):
    """Map ``run_par`` over ``params`` on all ipyparallel engines and print the results.

    Args:
        params: list of per-run parameter dicts (see ``get_params``); each one
            is passed to ``run_par``.

    Prints one line per entry of ``params``: whatever ``run_par`` returned for
    it (``None`` for runs that were skipped).
    """
    import ipyparallel as ipp
    rc = ipp.Client()
    try:
        dview = rc[:] # use all engines

        # Imports made inside sync_imports() are performed on the engines too,
        # so run_par can resolve these names there.
        # NOTE(review): run_par also calls save_obj, which is not imported
        # here - confirm that the engines can resolve it.
        with dview.sync_imports():
            from VMSfunctions.MassSpec import IndependentMassSpectrometer
            from VMSfunctions.Controller import TopNController
            import os

        analysis_names = dview.map_sync(run_par, params)
        for analysis_name in analysis_names:
            print(analysis_name)
    finally:
        # Release the client's connections even if a task raised; otherwise
        # every call from the notebook leaves another open client behind.
        rc.close()

#### Set parameters

Varying parameters

In [None]:
def get_params(Ns, rt_tols, isolation_window, ionisation_mode, data, density, min_ms1_intensity, min_rt, max_rt):
    """Build one parameter dict per (N, rt_tol) combination.

    The combinations are produced with ``Ns`` as the outer loop and
    ``rt_tols`` as the inner loop. All remaining arguments are copied
    unchanged into every dict under a key of the same name.

    Args:
        Ns: values of N to try.
        rt_tols: values of rt_tol to try.
        isolation_window, ionisation_mode, data, density, min_ms1_intensity,
        min_rt, max_rt: settings shared by every combination.

    Returns:
        A list of dicts, one per combination. The values of ``Ns`` and
        ``rt_tols`` and the length of the list are also printed.
    """
    print('N =', Ns)
    print('rt_tol =', rt_tols)

    grid = [
        {
            'N': top_n,
            'rt_tol': tolerance,
            'min_ms1_intensity': min_ms1_intensity,
            'isolation_window': isolation_window,
            'ionisation_mode': ionisation_mode,
            'data': data,
            'density': density,
            'min_rt': min_rt,
            'max_rt': max_rt,
        }
        for top_n in Ns
        for tolerance in rt_tols
    ]

    print('len(params) =', len(grid))
    return grid

In [None]:
# Settings shared by every run; they are passed to get_params below.
isolation_window = 0.5   # the isolation window in Dalton around a selected precursor ion
ionisation_mode = POSITIVE  # constant provided by the VMSfunctions star imports
data = noisy_dataset  # simulate on the noisy version of the prepared data
density = ps.density_estimator  # density estimator taken from the loaded peak sampler
min_ms1_intensity = 2.5E5 # minimum ms1 intensity to fragment
min_rt = 3*60  # start of the simulated RT range (presumably seconds, i.e. 3 min - confirm)
max_rt = 21*60  # end of the simulated RT range (presumably seconds, i.e. 21 min - confirm)

Varying Ns

In [None]:
Ns = list(range(2, 101, 2)) # top-N DDA fragmentation: N = 2, 4, ..., 100
rt_tols = [15] # the rt window around a selected precursor ion to prevent it from being fragmented multiple times
# One parameter dict per (N, rt_tol) pair; with a single rt_tol that is one run per N.
params = get_params(Ns, rt_tols, isolation_window, ionisation_mode, data, density, min_ms1_intensity, min_rt, max_rt)

In [None]:
# Run every parameter set one after another in this kernel. The position is
# shifted by one so the progress message counts from 1 to len(params).
for i, param in enumerate(params):
    run_serial_experiment(param, i + 1, len(params))

In [None]:
# Same parameter sets as the serial cell, but distributed over ipyparallel
# engines. Runs whose two output files already exist are skipped by run_par.
run_parallel_experiment(params)

Grid search

In [None]:
# Ns = list(range(2, 51, 2)) # top-N DDA fragmentation
# rt_tols = list(range(5, 31, 5)) # the rt window around a selected precursor ion to prevent it from fragmented multiple times
# params = get_params(Ns, rt_tols, isolation_window, ionisation_mode, data, density, min_ms1_intensity, min_rt, max_rt)

In [None]:
# for i in range(len(params)):
#     param = params[i]
#     run_serial_experiment(param, i+1, len(params))