# Experiment: Varying N in top-N DDA fragmentation

We demonstrate that the simulator can be used for scan-level closed-loop DDA experiments. 
- Take an existing dataset. Find out which MS1 peaks are linked to which MS2 peaks.
- Run all MS1 peaks through the simulator’s Top-N protocol. 
- For the top-100 most intense MS1 peaks, how many got fragmented in the simulator as we change N?
- If N is greater than the N used for the real data, do we see the same MS1 peaks from step (1) being fragmented again, plus additional fragment peaks?
- Verification on actual machine.
- Talk to Stefan about machine time.

In [None]:
%matplotlib inline

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import sys
import os
import scipy.stats
import pylab as plt
from IPython import display
import pylab as plt
from random import random, shuffle
from joblib import Parallel, delayed
import multiprocessing

In [None]:
sys.path.append('../codes')

In [None]:
from VMSfunctions.Chemicals import *
from VMSfunctions.Chromatograms import *
from VMSfunctions.MassSpec import *
from VMSfunctions.Controller import *
from VMSfunctions.Common import *
from VMSfunctions.DataGenerator import *
from VMSfunctions.Noise import *

In [None]:
set_log_level_debug()

### Load densities trained on beer1pos data (see [loader_kde](loader_kde.ipynb)).

In [None]:
base_dir = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\'

In [None]:
ps = load_obj(base_dir + 'Trained Models\\peak_sampler_beer1pos_fragmentation.p')

### Load chromatogram data exported from the real data (BEER1POS)

Correct the positively charged ions by subtracting the mass of a proton

In [None]:
def f(peak_mz, chrom_mzs):
    """Subtract the proton mass from a peak m/z and its chromatogram m/z values.

    Passed to ``ChromatogramCreator`` as ``correction_func``.

    Args:
        peak_mz: m/z value of the peak.
        chrom_mzs: m/z values of the chromatogram; must support subtraction
            of a scalar (presumably a numpy array -- TODO confirm).

    Returns:
        A ``(peak_mz, chrom_mzs)`` tuple with ``PROTON_MASS`` subtracted
        from both.
    """
    return peak_mz - PROTON_MASS, chrom_mzs - PROTON_MASS

In [None]:
xcms_output = base_dir + 'Trained Models\\chromatogram_beer1pos.csv.gz'
cc = ChromatogramCreator(xcms_output, correction_func=f)

#### Turn the loaded data into `UnknownChemical` objects.

In [None]:
# A threshold of 0 presumably disables MS1 intensity filtering at this stage -- TODO confirm.
min_ms1_intensity = 0
# Retention-time window to sample from; presumably seconds (3 to 21 minutes) -- TODO confirm units.
min_rt = 3*60
max_rt = 21*60
# Create chemicals from the loaded chromatograms `cc`, using the peak sampler `ps`.
chemicals = ChemicalCreator(ps)
dataset = chemicals.sample_from_chromatograms(cc, min_rt, max_rt, min_ms1_intensity, ms_levels=2)

In [None]:
len(dataset)

### Add noise

#### Create region of interest objects

In [None]:
mzml_path = base_dir + 'Data\\multibeers_urine_data\\beers\\fullscan\\'
xcms_roi_file = mzml_path + 'rois.csv'
extracted_roi_file = base_dir + 'Trained Models\\beer1pos_rois.p'

In [None]:
filename = 'Beer_multibeers_1_fullscan1.mzML'
ds = DataSource()
ds.load_data(mzml_path)
# Reuse previously extracted ROIs when the saved file exists; otherwise
# extract them from the XCMS ROI file, populate them, and save the result
# so that the next run can load it directly.
try:
    ds.load_roi(extracted_roi_file)
except FileNotFoundError:
    print('Extracting ROIs')
    ds.extract_roi(xcms_roi_file, min_rt=min_rt, max_rt=max_rt, filename=filename)
    ds.populate_roi(filename=filename)
    ds.save_roi(extracted_roi_file)

#### Add a whole bunch of `UnknownChemicals` with chromatograms coming from non-peak ROIs from Beer1Pos

In [None]:
min_ms1_intensity = 2.5E5

In [None]:
rtcc = RoiToChemicalCreator(ps, ds, filename,  min_ms1_intensity=min_ms1_intensity)

In [None]:
len(rtcc.chemicals)

In [None]:
def plot_chems(chem_list, N=10):
    """Plot the raw chromatograms of the first ``N`` chemicals.

    Each chemical is drawn and shown separately, i.e. one figure per
    chemical.

    Args:
        chem_list: sequence of chemicals; each must expose a
            ``chromatogram`` with ``raw_rts`` and ``raw_intensities``.
        N: maximum number of chemicals to plot, taken from the start of
            ``chem_list``.
    """
    for chemical in chem_list[:N]:
        trace = chemical.chromatogram
        plt.plot(trace.raw_rts, trace.raw_intensities)
        plt.show()

In [None]:
sorted_chems = sorted(rtcc.chemicals, key = lambda chem: chem.chromatogram.roi.num_scans())

In [None]:
plot_chems(sorted_chems[0:10])

In [None]:
plot_chems(sorted_chems[-10:])

#### Add non-peak regions of interest to the data

In [None]:
# Tag every chemical with its origin ('data' or 'noise') before the two
# lists are combined into a single dataset.
for chem in dataset:
    chem.type = 'data'
for noise in sorted_chems:
    noise.type = 'noise'

In [None]:
noisy_dataset = dataset + sorted_chems

In [None]:
# shuffle(noisy_dataset)

In [None]:
len(dataset)

In [None]:
len(noisy_dataset)

In [None]:
save_obj(dataset, '../models/dda_results/dataset.p')

In [None]:
save_obj(noisy_dataset, '../models/dda_results/noisy_dataset.p')

### Experiment by varying N and rt_tol

Don't print so much from the controller when running

In [None]:
set_log_level_warning()
# set_log_level_info()
# set_log_level_debug()

#### Set parameters

Varying parameters

In [None]:
Ns = list(range(2, 51, 2)) # top-N DDA fragmentation: N = 2, 4, ..., 50
rt_tols = list(range(5, 31, 5)) # the rt window around a selected precursor ion that prevents it from being fragmented multiple times: 5, 10, ..., 30

In [None]:
print(Ns)
print(rt_tols)

Fixed parameters

In [None]:
isolation_window = 0.5   # the isolation window in Dalton around a selected precursor ion
ionisation_mode = POSITIVE
data = noisy_dataset # the combined data + noise chemicals given to the simulated mass spectrometer
density = ps.density_estimator # density estimator taken from the loaded peak sampler
min_ms1_intensity = 2.5E5 # minimum ms1 intensity to fragment

In [None]:
# Settings shared by every run; only N and rt_tol change between experiments.
fixed_params = {
    'min_ms1_intensity': min_ms1_intensity,
    'isolation_window': isolation_window,
    'ionisation_mode': ionisation_mode,
    'data': data,
    'density': density,
    'min_rt': min_rt,
    'max_rt': max_rt,
}

# One parameter dict per (N, rt_tol) combination, with N varying slowest.
params = []
for N in Ns:
    for rt_tol in rt_tols:
        params.append({'N': N, 'rt_tol': rt_tol, **fixed_params})

In [None]:
len(params)

#### Run serially

In [None]:
def run_serial_experiment(param, i, total):
    """Run one Top-N experiment and write its results, unless already done.

    The run is skipped when both its mzML file and its pickled controller
    already exist under ``../models/dda_results/``.

    Args:
        param: dict with the keys 'N', 'rt_tol', 'min_ms1_intensity',
            'isolation_window', 'ionisation_mode', 'data', 'density',
            'min_rt' and 'max_rt'.
        i: position of this experiment, used only in the progress message.
        total: total number of experiments, used only in the progress message.
    """
    analysis_name = 'experiment_N_%d_rttol_%d' % (param['N'], param['rt_tol'])
    mzml_out = '../models/dda_results/%s.mzML' % analysis_name
    pickle_out = '../models/dda_results/%s.p' % analysis_name

    # Both outputs must be present for the experiment to count as finished.
    already_done = all(os.path.isfile(path) for path in (mzml_out, pickle_out))
    if already_done:
        print('Skipping\t%d/%d\t%s' % (i, total, analysis_name))
        return

    print('Processing\t%d/%d\t%s' % (i, total, analysis_name))
    mass_spec = IndependentMassSpectrometer(
        param['ionisation_mode'],
        param['data'],
        density=param['density'],
    )
    controller = TopNController(
        mass_spec,
        param['N'],
        param['isolation_window'],
        param['rt_tol'],
        param['min_ms1_intensity'],
    )
    controller.run(param['min_rt'], param['max_rt'], progress_bar=True)

    # Write the mzML first, then pickle the whole controller.
    controller.write_mzML(analysis_name, mzml_out)
    save_obj(controller, pickle_out)

In [None]:
# Run every parameter combination in turn. The 1-based position and the
# total count are passed along for the progress messages.
for i, param in enumerate(params):
    run_serial_experiment(param, i + 1, len(params))

#### Run in parallel

Doesn't quite work yet ...

In [None]:
# import ipyparallel as ipp

In [None]:
# def run_parallel_experiment(param):
#     import sys
#     sys.path.append('C:\\Users\\joewa\\Work\\git\\clms\\Simulator\\codes')
#     from VMSfunctions.MassSpec import IndependentMassSpectrometer
#     from VMSfunctions.Controller import TopNController
    
#     mass_spec = IndependentMassSpectrometer(param['ionisation_mode'], param['data'], density=param['density'])
#     controller = TopNController(mass_spec, param['N'], param['isolation_window'], param['rt_tol'])
#     controller.run(param['min_rt'], param['max_rt'], progress_bar=False)
#     return controller

In [None]:
# rc = ipp.Client()
# dview = rc[:] # use all engines

In [None]:
# controllers = dview.map(run_parallel_experiment, params)
# for controller in controllers:
#     print(controller.N, controller.rt_tol)