### Fixing mzML writing

In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import math
import os
import sys

import numpy as np
import pandas as pd
import pylab as plt
import scipy.stats
from IPython import display

In [4]:
# Add the sibling `codes` directory to the module search path so the
# VMSfunctions imports that follow can be resolved (presumably the package
# lives there — confirm). The path is relative to the current working directory.
sys.path.append('../codes')

In [5]:
from VMSfunctions.Chemicals import *
from VMSfunctions.Chromatograms import *
from VMSfunctions.MassSpec import *
from VMSfunctions.Controller import *
from VMSfunctions.Common import *
from VMSfunctions.DataGenerator import *

### PsiMS test

In [6]:
from psims.mzml import MzMLWriter

In [7]:
def get_next_arrays(n=100):
    """Return a pair of random arrays standing in for one synthetic spectrum.

    Args:
        n: Number of points in each array. Defaults to 100, the size that
            was previously hard-coded.

    Returns:
        A tuple of two independent 1-D arrays of length ``n``, each drawn
        uniformly from [0, 1) with ``np.random.rand``. Callers in this
        notebook use the first as m/z values and the second as intensities.
    """
    return np.random.rand(n), np.random.rand(n)

In [8]:
# Write a synthetic mzML file with psims: file-level metadata first, then a
# single run whose spectrum list holds MS1 scans, each followed by MSn scans.
n_spectra = 1000      # total spectra written; must equal the spectrum list count
n_msn_per_ms1 = 3     # MSn scans emitted after every MS1 scan

writer = MzMLWriter("../models/test/mzML/write.mzML")
with writer:
    writer.controlled_vocabularies()
    writer.file_description([  # the list of file contents terms
        "MS1 spectrum",
        "MSn spectrum",
        "centroid spectrum"
    ])
    writer.software_list([
        {"id": "psims-writer",
         "version": "0.1.2",
         "params": [
             "python-psims",
         ]}
    ])
    source = writer.Source(1, ["electrospray ionization", "electrospray inlet"])
    analyzer = writer.Analyzer(2, [
        "fourier transform ion cyclotron resonance mass spectrometer"
    ])
    detector = writer.Detector(3, ["inductive detector"])
    config = writer.InstrumentConfiguration(
        id="IC1", component_list=[source, analyzer, detector], params=["LTQ-FT"])
    writer.instrument_configuration_list([config])

    # Fuller data-processing description, kept for reference only:
#     methods = []
#     methods.append(
#         writer.ProcessingMethod(
#             order=1, software_reference="psims-writer", params=[
#                 "Gaussian smoothing",
#                 "median baseline reduction",
#                 "MS:1000035", # peak picking
#                 "Conversion to mzML"
#             ]))
#     processing = writer.DataProcessing(methods, id='DP1')
#     writer.data_processing_list([processing])
    # NOTE(review): a bare dict is passed here rather than a list of
    # DataProcessing objects as in the reference code above — confirm psims accepts it.
    writer.data_processing_list({'id': 'VMS'})

    with writer.run(id=1, instrument_configuration='IC1'):
        # Spectra have to be written while the spectrum list is still open,
        # so the whole loop lives inside this context manager. The count is
        # an int so it is not serialised as a float.
        with writer.spectrum_list(count=n_spectra):
            i = 0
            # Each pass writes 1 MS1 + n_msn_per_ms1 MSn spectra, so `i`
            # advances by 4 and reaches n_spectra after 250 passes.
            while i < n_spectra:
                i += 1
                ms1_mzs, ms1_intensities = get_next_arrays()
                ms1_spectrum_id = "index=%d" % i
                scan_time = 0.23 * i

                writer.write_spectrum(
                    ms1_mzs, ms1_intensities, id=ms1_spectrum_id, centroided=True,
                    scan_start_time=scan_time, scan_window_list=[(0, 2000.0)],
                    params=[{"ms level": 1}, {"total ion current": ms1_intensities.sum()}])

                for j in range(n_msn_per_ms1):
                    i += 1
                    msn_mzs, msn_intensities = get_next_arrays()
                    msn_spectrum_id = "index=%d" % i
                    scan_time = 0.23 * i

                    # Pick a random MS1 peak to act as the precursor of this MSn scan.
                    k = np.random.randint(len(ms1_mzs))
                    precursor_info = {
                        "mz": ms1_mzs[k], 'intensity': ms1_intensities[k], 'charge': 1,
                        "spectrum_reference": ms1_spectrum_id,
                        "activation": ["HCD", {"collision energy": 25.0}],
#                         "isolation_window": (1.0, ms1_mzs[k], 1.0)
                    }

                    writer.write_spectrum(
                        msn_mzs, msn_intensities, id=msn_spectrum_id, centroided=True,
                        scan_start_time=scan_time, scan_window_list=[(0, 2000.0)],
                        precursor_information=precursor_info,
                        params=[{"ms level": 2}, {"total ion current": msn_intensities.sum()}])

In [9]:
# Explicitly close the writer created in the previous cell.
# NOTE(review): that cell already used `with writer:`, so this call may be
# redundant depending on the psims version — confirm before removing.
writer.close()

### Top-N Controller Test

In [None]:
# Switch logging to debug verbosity for the controller test below
# (helper presumably comes from the VMSfunctions star imports — confirm).
set_log_level_debug()

Load the densities trained on the 19-beer dataset (see [loader_kde](loader_kde.ipynb)).

In [None]:
# Root folder that holds the trained models loaded below. Set the
# VMS_BASE_DIR environment variable to point at your own copy; the original
# author's local path is kept as the default so existing setups still work.
base_dir = os.environ.get(
    'VMS_BASE_DIR',
    'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\')

In [None]:
# Load the stored full-scan peak sampler from the trained-models folder.
# NOTE(review): `ps` is rebound further down to the fragmentation peak sampler
# before anything reads it, so this load looks unused — confirm.
ps = load_obj(os.path.join(base_dir, 'Trained Models\\peak_sampler_19_beers_fullscan.p'))

### Generate some chemicals

In [None]:
# Build the chromatogram creator from the gzipped CSV in the trained-models
# folder; `cc` is later passed to the chemical sampler.
cc = ChromatogramCreator(os.path.join(base_dir, 'Trained Models\\chromatogram_19_beers.csv.gz'))

In [None]:
# Load the stored HMDB compound object. In this notebook it is only referenced
# by the commented-out 'Known' chemical configuration further down.
hmdb = load_obj(os.path.join(base_dir, 'Trained Models\\hmdb_compounds.p'))

### Set up a Top-N controller

We can now run the generated dataset through a Top-N DDA (data-dependent acquisition) controller.

In [None]:
# Choose the logging verbosity for this section: debug is active, and the
# commented-out calls are the quieter alternatives.
# set_log_level_warning()
# set_log_level_info()
set_log_level_debug()

In [None]:
# Rebind `ps` to the stored fragmentation peak sampler; this is the object
# handed to ChemicalCreator and whose density estimator the mass spectrometer uses.
ps = load_obj(os.path.join(base_dir, 'Trained Models\\peak_sampler_19_beers_fragmentation.p'))

In [None]:
# --- Sampling ranges and limits -------------------------------------------
mz_range = [(0, 1050)]
rt_range = [(3*60, 21*60)]      # presumably minutes converted to seconds — confirm
min_ms1_intensity = 2.5E5
n_ms1_peaks = 10000
ms_levels = 2

# --- Chemical type ----------------------------------------------------------
# Active setup: 'Unknown' chemicals, no compound list, infinite alpha.
# Alternative setup (disabled): alpha = 0.1, compound_list = hmdb,
# chemical_type = 'Known'.
chemical_type = 'Unknown'
compound_list = None
alpha = float('inf')

# --- Sample the dataset -----------------------------------------------------
chemicals = ChemicalCreator(ps)
dataset = chemicals.sample(
    cc,
    mz_range,
    rt_range,
    min_ms1_intensity,
    n_ms1_peaks,
    ms_levels=ms_levels,
    chemical_type=chemical_type,
    formula_list=None,
    compound_list=compound_list,
    alpha=alpha,
    fixed_mz=False,
)

In [None]:
# Parameters handed to the Top-N controller below.
N = 5                           # top-5 DDA fragmentation
mz_tol = 5                      # the m/z isolation window (Dalton) around a selected precursor ion
rt_tol = 15                     # the RT window (seconds) around a selected precursor ion, to prevent it from being fragmented multiple times
min_ms1_intensity = 2.5E5       # the minimum MS1 peak intensity to be fragmented (same value as used when sampling the dataset)

In [None]:
# Lower the logging verbosity to warnings before running the controller.
set_log_level_warning()

In [None]:
# Build the simulated mass spectrometer over the sampled dataset, reusing the
# peak sampler's density estimator.
mass_spec = IndependentMassSpectrometer(
    POSITIVE,
    dataset,
    density=ps.density_estimator,
)

# Top-N controller driven by the parameters defined above; plotting is off.
controller = TopNController(
    mass_spec,
    N,
    mz_tol,
    rt_tol,
    min_ms1_intensity,
)
controller.make_plot = False

# Run over the first (start, end) retention-time window.
controller.run(*rt_range[0])

In [None]:
# Export the controller's results to an mzML file at the given relative path.
# NOTE(review): 'my_analysis' is presumably the analysis/run name — confirm
# against TopNController.write_mzML.
controller.write_mzML('my_analysis', '../models/test/mzML/out.mzML')