In [1]:
import numpy as np
import sys
import scipy.stats
import re

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Add the sibling 'codes' directory to the import path (relative to the current
# working directory); presumably this is where the VMSfunctions package lives — TODO confirm.
sys.path.append('../codes')

In [4]:
# Toggle IPython pretty printing (the recorded output shows it being switched OFF).
%pprint

Pretty printing has been turned OFF


In [5]:
from VMSfunctions.Common import *
from VMSfunctions.Chemicals import *
from VMSfunctions.Chromatograms import *
from VMSfunctions.MassSpec import *
from VMSfunctions.Controller import *

# Mass Spec test

In [6]:
import pickle

# Load the pickled object stored in hmdb_compounds.p; `hmdb` is later passed to
# chemicals.sample(...) when sampling "Known" chemicals.
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
# The context manager guarantees the file handle is closed once loading finishes
# (the previous bare open() call left it open until garbage collection).
with open('../../../Compounds/hmdb_compounds.p', 'rb') as hmdb_file:
    hmdb = pickle.load(hmdb_file)

In [7]:
# Path to the gzipped CSV of MS1 peaks (per the names, XCMS output for beer samples — TODO confirm).
xcms_output = '../models/beer_ms1_peaks.csv.gz'
# Build the chromatogram creator from that file; `cc` is passed to chemicals.sample(...) further down.
cc = ChromatogramCreator(xcms_output)

In [8]:
# Load a previously saved peak sampler object (the file name suggests it was built from 4 beer samples — TODO confirm).
ps = load_obj('../models/peak_sampler_4_beers.p')
# Wrap the sampler in a ChemicalCreator; datasets are drawn from it via chemicals.sample(...).
# ps.density_estimator is also handed to the mass spectrometer constructors later on.
chemicals = ChemicalCreator(ps)



# Improve Chemical Creator

The chemical creator needs to be tested in more situations to make sure the code works as intended.

In [9]:
min_ms1_intensity = 2E5          # minimum MS1 intensity handed to the sampler
rt_range = [(3*60, 21*60)]       # retention-time range, written as minutes*60 (so presumably seconds)
mz_range = [(0, 1000)]           # m/z range
set_log_level_debug()            # show the ChemicalCreator progress messages
# Positional arguments after min_ms1_intensity: 200 is the number of MS1 peaks to
# create (it matches the "200 ms1 peaks to be created" debug message). The meaning
# of the remaining ones (2, "Known", None, hmdb, 0.1) is not visible here —
# TODO confirm against ChemicalCreator.sample.
dataset = chemicals.sample(cc, mz_range, rt_range, min_ms1_intensity, 200, 2, "Known", None, hmdb, 0.1)

DEBUG:ChemicalCreator:200 ms1 peaks to be created.
DEBUG:ChemicalCreator:i = 0
DEBUG:ChemicalCreator:i = 25
DEBUG:ChemicalCreator:i = 50
DEBUG:ChemicalCreator:i = 75
DEBUG:ChemicalCreator:i = 100
DEBUG:ChemicalCreator:i = 125
DEBUG:ChemicalCreator:i = 150
DEBUG:ChemicalCreator:i = 175


In [10]:
# Disabled alternative: build the dataset with sample_from_chromatograms instead of sample.
#min_ms1_intensity = 2E5
#min_rt = 3*60
#max_rt = 21*60
#chemicals = ChemicalCreator(ps)
#dataset = chemicals.sample_from_chromatograms(cc, min_rt, max_rt, min_ms1_intensity, ms_levels=2)

# Test Top N Controller

In [None]:
max_rt = 10*60                  # the maximum retention time of scans to generate
N = 1                           # N for top-N DDA fragmentation (top-1 here)
mz_tol = 5                      # the mz isolation window around a selected precursor ion
rt_tol = 15                     # the rt window around a selected precursor ion to prevent it from being fragmented multiple times
min_ms2_intensity = 5000        # the minimum ms2 peak intensity

# Build a mass spectrometer over the sampled `dataset`, then a Top-N controller driving it.
mass_spec = IndependentMassSpectrometer(POSITIVE, dataset, density=ps.density_estimator)
controller = TopNController(mass_spec, N, mz_tol, rt_tol, min_ms2_intensity=min_ms2_intensity)

set_log_level_warning() # We don't want to see too many messages as the controller is running
# set_log_level_info()
# set_log_level_debug()

# Run the controller from 0 up to max_rt.
controller.run(0, max_rt)

In [11]:
# Write the controller's results to an mzML file; the first argument is presumably
# an analysis name/label — TODO confirm against write_mzML.
controller.write_mzML('my_analysis2', '../models/out2.mzML')
# save_obj(controller, '../models/noisy_top_N_controllers.p')

# DIA Mass Spec

In [None]:
# Settings handed positionally to TreeController below.
dia_design = "basic"
window_type = "even"
kaufmann_design = None      # no Kaufmann design supplied
extra_bins = 0
num_windows=10

# A fresh mass spectrometer is built over `dataset` for the DIA run.
mass_spec = IndependentMassSpectrometer(POSITIVE, dataset, density=ps.density_estimator)
controller = TreeController(mass_spec, dia_design, window_type, kaufmann_design, extra_bins, num_windows)

set_log_level_warning() # We don't want to see too many messages as the controller is running
# set_log_level_info()
# set_log_level_debug()

# NOTE(review): max_rt is not defined in this cell; it relies on a value assigned in another cell.
# NOTE(review): the Top-N controller is run as run(0, max_rt) but this one as run(max_rt) —
# confirm TreeController.run's signature.
controller.run(max_rt)

# Multiple Datasets for DsDA

In [None]:
# Re-sample the base dataset for the DsDA experiments; this overwrites any earlier `dataset`.
min_ms1_intensity = 2E5          # minimum MS1 intensity handed to the sampler
rt_range = [(3*60, 21*60)]       # retention-time range, written as minutes*60 (so presumably seconds)
mz_range = [(0, 1000)]           # m/z range
set_log_level_debug()            # show the ChemicalCreator progress messages
# 200 is the number of MS1 peaks to create; the meaning of the other trailing
# positional arguments is not visible here — TODO confirm against ChemicalCreator.sample.
dataset = chemicals.sample(cc, mz_range, rt_range, min_ms1_intensity, 200, 2, "Known", None, hmdb, 0.1)

In [19]:
# Single-class setup: 10 samples of "class1", with every optional setting left as None.
n_samples = [10]                        # number of samples, one entry per class
classes = ["class1"]
intensity_noise_sd = [1000]             # presumably the sd of per-sample intensity noise — TODO confirm
change_probabilities = None
change_differences_means = None
change_differences_sds = None
dropout_probabilities = None 
experimental_classes = None 
experimental_probabilitities = None     # (sic) variable name carries an extra "ti"
experimental_sds = None

# All settings are passed positionally, so their order must match MultiSampleCreator's signature.
multiple_samples = MultiSampleCreator(dataset, n_samples, classes, intensity_noise_sd, 
                                      change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities,
                                     experimental_classes, experimental_probabilitities, experimental_sds)

In [23]:
# Show the first chemical of the first three generated samples.
for sample_idx in (0, 1, 2):
    print(multiple_samples.samples[sample_idx][0])

['unchanged' 'unchanged' 'unchanged' 'unchanged' 'unchanged' 'unchanged'
 'unchanged' 'unchanged' 'unchanged' 'unchanged']

KnownChemical - 'C21H41NO3' rt=440.74 max_intensity=831102.18
KnownChemical - 'C21H41NO3' rt=440.74 max_intensity=833599.33
KnownChemical - 'C21H41NO3' rt=440.74 max_intensity=832134.09


# Create Multiple Datasets

In [10]:
# Re-sample the base dataset used to create the multi-class samples; this overwrites any earlier `dataset`.
min_ms1_intensity = 2E5          # minimum MS1 intensity handed to the sampler
rt_range = [(3*60, 21*60)]       # retention-time range, written as minutes*60 (so presumably seconds)
mz_range = [(0, 1000)]           # m/z range
set_log_level_debug()            # show the ChemicalCreator progress messages
# 200 is the number of MS1 peaks to create (it matches the debug message); the meaning of the
# other trailing positional arguments is not visible here — TODO confirm against ChemicalCreator.sample.
dataset = chemicals.sample(cc, mz_range, rt_range, min_ms1_intensity, 200, 2, "Known", None, hmdb, 0.1)

DEBUG:ChemicalCreator:200 ms1 peaks to be created.
DEBUG:ChemicalCreator:i = 0
DEBUG:ChemicalCreator:i = 25
DEBUG:ChemicalCreator:i = 50
DEBUG:ChemicalCreator:i = 75
DEBUG:ChemicalCreator:i = 100
DEBUG:ChemicalCreator:i = 125
DEBUG:ChemicalCreator:i = 150
DEBUG:ChemicalCreator:i = 175


In [12]:
# Three-class setup: 5 samples per class (15 datasets in total, per the debug log).
n_samples = [5,5,5]
classes = ["class1","class2","class3"]
intensity_noise_sd = [1000]
# Two entries for three classes — presumably one per class other than class1; TODO confirm.
change_probabilities = [0.2,0.2]
change_differences_means = [10000,20000]
change_differences_sds = [0,0] # 0 means fixed difference between class and class1
dropout_probabilities = [0.2,0.2,0.2]   # three entries, one per class
experimental_classes = None # [["male","female"],["Positive","Negative","Unknown"]]
experimental_probabilitities = None # [[0.5,0.5],[0.33,0.33,0.34]]
experimental_sds = None # [[250],[250]]

# All settings are passed positionally, so their order must match MultiSampleCreator's signature.
multiple_samples = MultiSampleCreator(dataset, n_samples, classes, intensity_noise_sd, 
                                      change_probabilities, change_differences_means, change_differences_sds, dropout_probabilities,
                                     experimental_classes, experimental_probabilitities, experimental_sds)

DEBUG:MultiSampleCreator:Classes, Statuses and Differences defined.
DEBUG:MultiSampleCreator:Dataset 1 of 15 created.
DEBUG:MultiSampleCreator:Dataset 2 of 15 created.
DEBUG:MultiSampleCreator:Dataset 3 of 15 created.
DEBUG:MultiSampleCreator:Dataset 4 of 15 created.
DEBUG:MultiSampleCreator:Dataset 5 of 15 created.
DEBUG:MultiSampleCreator:Dataset 6 of 15 created.
DEBUG:MultiSampleCreator:Dataset 7 of 15 created.
DEBUG:MultiSampleCreator:Dataset 8 of 15 created.
DEBUG:MultiSampleCreator:Dataset 9 of 15 created.
DEBUG:MultiSampleCreator:Dataset 10 of 15 created.
DEBUG:MultiSampleCreator:Dataset 11 of 15 created.
DEBUG:MultiSampleCreator:Dataset 12 of 15 created.
DEBUG:MultiSampleCreator:Dataset 13 of 15 created.
DEBUG:MultiSampleCreator:Dataset 14 of 15 created.
DEBUG:MultiSampleCreator:Dataset 15 of 15 created.


In [13]:
# Status labels of the first three chemicals, for status rows 0, 1 and 2.
for status_row in (0, 1, 2):
    print(multiple_samples.chemical_statuses[status_row][0:3])

print()

# First chemical of samples 0, 5 and 10, i.e. the first sample of each block of 5.
for sample_idx in (0, 5, 10):
    print(multiple_samples.samples[sample_idx][0])


['missing' 'unchanged' 'missing']
['changed' 'unchanged' 'missing']
['unchanged' 'changed' 'unchanged']

KnownChemical - 'C16H10O6' rt=677.99 max_intensity=599590.29
KnownChemical - 'C21H41NO3' rt=440.74 max_intensity=844200.14
KnownChemical - 'C21H41NO3' rt=440.74 max_intensity=832027.17


In [14]:
# Position of the chemical whose max_intensity is printed for every sample.
# NOTE(review): the recorded output of the previous cell shows samples[0][0] as a different
# chemical from samples[5][0], so position i may not refer to the same chemical in every
# sample when chemicals are dropped — verify before comparing intensities.
i = 2
sample_classes = multiple_samples.sample_classes
class_names = np.array(multiple_samples.classes)
for j, sample in enumerate(multiple_samples.samples):
    current_class = sample_classes[j]
    # Separate consecutive samples that belong to different classes with a blank line.
    if j > 0 and current_class != sample_classes[j-1]:
        print()
    if current_class == multiple_samples.classes[0]:
        # Samples of the first class: print intensity and class only.
        print(sample[i].max_intensity, current_class)
        continue
    # Position of this sample's class within the class list (looked up once, used twice).
    class_idx = np.where(class_names == current_class)[0][0]
    # NOTE(review): statuses are read at class_idx-1 but differences at class_idx —
    # confirm that both offsets are intended.
    print(sample[i].max_intensity, current_class,
          multiple_samples.chemical_statuses[class_idx-1][i],
          multiple_samples.chemical_differences_from_class1[class_idx][i])

199810.66612140546 class1
199099.58348289476 class1
199690.4869862474 class1
200458.89942748673 class1
199403.47481122456 class1

201991.84376336203 class2 missing 0
199396.63283881344 class2 missing 0
200045.12749024702 class2 missing 0
199157.52528710864 class2 missing 0
201517.48651029007 class2 missing 0

416598.1807030687 class3 missing 0
414090.69737828185 class3 missing 0
414020.3759501198 class3 missing 0
414691.90588332113 class3 missing 0
414079.23433044663 class3 missing 0


In [15]:
max_rt = 10*60                  # the maximum retention time of scans to generate
N = 1                           # N for top-N DDA fragmentation (top-1 here)
isolation_window = 0.5          # the isolation window in Dalton around a selected precursor ion
rt_tol = 15                     # the rt window around a selected precursor ion to prevent it from being fragmented multiple times
min_ms1_intensity = 2E5         # the minimum ms1 peak intensity
multi_dataset = multiple_samples.samples[0]   # only the first generated sample is run through the controller

mass_spec = IndependentMassSpectrometer(POSITIVE, multi_dataset, density=ps.density_estimator)
# NOTE(review): the other Top-N cell passes mz_tol as the third argument and min_ms2_intensity by
# keyword; here the third positional is an isolation window and the fifth positional is
# min_ms1_intensity — confirm against TopNController's signature.
controller = TopNController(mass_spec, N, isolation_window, rt_tol, min_ms1_intensity)

set_log_level_warning() # We don't want to see too many messages as the controller is running
# set_log_level_info()
# set_log_level_debug()

# Run the controller from 0 up to max_rt.
controller.run(0, max_rt)

600.0685579180773it [00:07, 82.61it/s]                                                                                 


# To Do

- Convolutional neural networks to identify ROI
    - pad the ROIs so they are all the same length
    - link each ROI to the final peaks to see how many peaks it contains
    - predict how many peaks there are in a ROI

- Talk to Stefan about how basic strategies in the MS would work
    - how does it interact with the exclusion window?
    - what does it do if we see no peaks?