In [1]:
import numpy as np
import sys
import scipy.stats
import re

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
sys.path.append('../codes')

In [4]:
%pprint

Pretty printing has been turned OFF


In [5]:
from VMSfunctions.Common import *
from VMSfunctions.model import *

# Functional Example

In [6]:
chrom = FunctionalChromatogram("normal", [0,1])
frag1 = MSN(50,2,0.5)
frag2 = MSN(55,2,0.7)
chem = UnknownChemical(100, 100, 10000, chrom, [frag1, frag2])
frag1.parent = chem
frag2.parent = chem

frag1_1 = MSN(30,3,0.3,None,frag1)
frag1_2 = MSN(35,3,0.5,None,frag1)
frag1.children = [frag1_1, frag1_2]

In [7]:
print(chem.isotopes)
print(chem.adducts)

[(100, 1, 'Mono')]
[('M+H', 1)]


In [8]:
chem.get_all_mz_peaks(101,1,[[(0,300)]])

[(98.992724, 1164.2509265114713)]

In [9]:
chem.get_all_mz_peaks(101,2,[[(0,300)]])

[(48.992724, 582.1254632557357), (53.992724, 814.9756485580299)]

In [10]:
chem.get_all_mz_peaks(101,3,[[(0,300)],[(0,300)]])

[(28.992724, 174.6376389767207), (33.992724, 291.06273162786783)]

# Emprical Example

In [11]:
chrom = FunctionalChromatogram("normal", [0,1])
frag1 = MSN(50,2,0.5)
frag2 = MSN(55,2,0.7)
chrom = FunctionalChromatogram("normal", [0,1])
formula = Formula("C12HgFS",200)
isotopes = Isotopes(formula)
adducts = Adducts(formula)
chem2 = KnownChemical(formula,isotopes,adducts,100, 10000, chrom, [frag1, frag2])
frag1.parent = chem2
frag2.parent = chem2

frag1_1 = MSN(30,3,0.3,None,frag1)
frag1_2 = MSN(35,3,0.5,None,frag1)
frag1.children = [frag1_1, frag1_2]

In [12]:
print(chem2.adducts)
print(chem2.isotopes)

[('M+H', 0.8832362138891587), ('[M+NH3]+H', 0.11676378611084126)]
[(200, 0.8822479928635149, 'Mono'), (198.9966451622, 0.11775200713648519, '1C13')]


In [13]:
chem2.get_all_mz_peaks(101,1,[[(0,300)]])

[(198.992724, 907.2231810571801), (181.96617, 119.93486204705293), (197.9893691622, 121.085399291757)]

In [14]:
chem2.get_all_mz_peaks(101,2,[[(0,300)]])

[(48.992724, 453.61159052859006), (53.992724, 635.0562267400261), (31.96617, 59.967431023526466), (36.966170000000005, 83.95440343293704)]

In [15]:
chem2.get_all_mz_peaks(101,3,[[(0,300)],[(0,300)]])

[(28.992724, 136.083477158577), (33.992724, 226.80579526429503), (11.966170000000002, 17.990229307057938), (16.96617, 29.983715511763233)]

# Data Generator

In [62]:
class ChemicalCreator(object):
    def __init__(self, peak_sampler, xcms_output = None):
        self.peak_sampler = peak_sampler
        self.xcms_output = xcms_output
        if self.xcms_output != None:
            chromatograms = self._load_chromatograms(self.xcms_output)
        else:
            chromograms = None
            
    def sample(self,n_ms1_peaks=3000, ms_levels = 2, chemical_type = None, chromatogram_type = "Empirical", formula_list = None, use_chrom_tuple = False):
        self.n_ms1_peaks = n_ms1_peaks
        self.ms_levels = ms_levels
        self.chemical_type = chemical_type
        self.chromatogram_type = chromatogram_type
        self.formula_list = formula_list
        self.use_chrom_tuple = use_chrom_tuple
        if self.ms_levels > 2:
            print("Warning ms_level > 3 not implemented properly yet. Uses ms_level = 2 information for now")
        n_ms1_peaks = self._get_n(1)
        chemicals = []
        formula = None
        for i in range(self._get_n(1)):
            sampled_peak = self.peak_sampler.sample(ms_level=1, n_peaks=n_ms1_peaks)
            chrom = self._get_chromatogram(chromatograms)
            if self.chemical_type =="Known":
                formula = self.formula_list[i]
            chemicals.append(self._get_chemical(1, formula, chromatograms[i], sampled_peak[i]))
            chemicals[i].children = self._get_children(1, chemicals[i])
        return chemicals
    
    # needs to standardise children intensities, such that they add up to parent intensity times scalign factor
    
    # need to add CRP
    
    def _get_children(parent_ms_level, parent):
        children_ms_level = parent_ms_level + 1
        n_peaks = self._get_n(children_ms_level)
        if n_peaks == None:
            return None 
        elif children_ms_level == self.ms_levels:
            kids = []
            for index_children in range(n_peaks):
                kid = self._get_unknown_msn(children_ms_level, None, None, parent)
                kids.append(kid)
            return kids      
        elif children_ms_level < self.ms_levels:
            kids = []
            for index_children in range(n_peaks):
                kid = self._get_unknown_msn(children_ms_level, None, None, parent)
                kid._get_children(children_ms_level, kid)
                kids.append()
            return kids      
        else:
            return None
    
    def _get_n(self, ms_level):
        if ms_level == 1:
            return self.n_ms1_peaks
        elif ms_level ==2:
            return self.peak_sampler.density_estimator.n_peaks(2, 1) # not sure this will work
        else:
            return self.peak_sampler.density_estimator.n_peaks(2, 1)
 
    def _get_chemical(self, ms_level, formula, chromatogram, sampled_peak):
        if formula != None:
            return self._get_known_ms1(formula, chromatogram, sampled_peak)
        else:
            return self._get_unknown_msn(ms_level, chromatogram, sampled_peak)

    def _get_known_ms1(self, formula, chromatogram, sampled_peak):
        # eventually get rid of mz here
        mz = self._get_mz(1, chromatogram, sampled_peak)
        rt = self_get_rt(chromatogram, sampled_peak)
        intensity = self.get_intensity(chromatogram, sampled_peak)
        formula = Formula(formula,mz)
        isotopes = Isotopes(formula)
        adducts = Adducts(formula)
        return KnownChemical(formula, isotopes, adducts, rt, intensity, chrom, None)
    
    def _get_unknown_msn(self, ms_level, chromatogram, sampled_peak, parent = None):
        if ms_level == 1:
            mz = self._get_mz(1, chromatogram, sampled_peak)
            rt = self_get_rt(chromatogram, sampled_peak)
            intensity = self.get_intensity(chromatogram, sampled_peak)
            return UnknownChemical(mz, rt, intensity, chromatogram, None)
        else:
            mz = self._get_mz(ms_level, chromatogram, sampled_peak)
            parent_mass_prop = self._get_parent_prop(ms_level)
            return MSN(mz, ms_level, parent_mass_prop, None, parent)
            
    def _get_parent_prop(ms_level):
        return np.random.unform(0.2,0.8,1)
        # this needs to come from a density
    
    def _get_chromatogram(self, chromatograms):
        if chromatograms != None:
            selected = np.random.choice(len(chromatograms), 1)[0]
            return chromatograms[selected]
        else:
            NotImplementedError("Functional Chromatograms not implemented here yet")
    
    def _load_chromatograms(self, xcms_output):
        return self._load_xcms_df(xcms_output)
        
    def _get_mz(ms_level, chromatogram, sampled_peak):
        if chromatogram == None and sampled_peak == None:
            if ms_level == 2:
                return self.peak_sampler.sample(ms_level, 1).mz
            else:
                return self.peak_sampler.sample(2, 1).mz
        elif use_chrom_tuple == False:
            return sampled_peak.mz
        else:
            NotImplementedError()
            # extract same stuff from chromatogram
        
    def _get_rt(chromatogram, sampled_peak):
        if use_chrom_tuple == False:
            return sampled_peak.rt
        else:
            NotImplementedError()
            # extract same stuff from chromatogram

    def _get_intensity(chromatogram, sampled_peak):
        if use_chrom_tuple == False:
            return sampled_peak.intensity
        else:
            NotImplementedError()
            # extract same stuff from chromatogram
            
    def _load_xcms_df(self, df_file):
        """
        Load CSV file of chromatogram information exported by the XCMS script 'process_data.R'
        :param df_file: the input csv file exported by the script (in gzip format)
        :return: a list of Chromatogram objects
        """
        df = pd.read_csv(df_file, compression='gzip')
        peak_ids = df.id.unique()
        groups = df.groupby('id')
        chroms = []
        for i in range(len(peak_ids)):
            if i % 5000 == 0:
                print(i)
            pid = peak_ids[i]
            chrom = self._get_xcms_chromatograms(groups, pid)
            if chrom is not None:
                chroms.append(chrom)
        return chroms
    
    def _get_xcms_chromatograms(self, groups, pid):
        selected = groups.get_group(pid)
        rts = self._get_values(selected, 'rt_values')
        mzs = self._get_values(selected, 'mz_values')
        intensities = self._get_values(selected, 'intensity_values')
        assert len(rts) == len(mzs)
        assert len(rts) == len(intensities)
        if len(rts) > 1:
            chrom = EmpiricalChromatogram(rts, mzs, intensities)
        else:
            chrom = None
        return chrom
    
    def _get_values(self, df, column_name):
        return df[column_name].values

In [63]:
peak_sampler = load_obj('../models/peak_sampler_4_beers.p')
xcms_output = '../models/beer_ms1_peaks.csv.gz'

In [64]:
dataset = ChemicalCreator(peak_sampler, xcms_output)

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000


In [66]:
dataset.sample(1, 2, "Unknown","Empirical",None, False)

NameError: name 'chromatograms' is not defined