In [3]:
import numpy as np
import sys
import scipy.stats
import re

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
sys.path.append('../codes')

In [6]:
%pprint

Pretty printing has been turned OFF


In [27]:
from VMSfunctions.Common import *
from VMSfunctions.model import *

# Chemicals

In [28]:
# read this in? https://www.sisweb.com/referenc/source/exactmas.htm

def ElementCharacterisitics(object):
    self.mz_diff = 1.0033548378
    self.carbon_proportions = [0.989, 0.011]   # 12, 13
    self.oxygen_proportions = [0.9976, 0.0004, 0.002] # 16, 17, 18

In [29]:
class Formula(object):
    def __init__(self, formula_string, mz):
        self.formula_string = formula_string
        self.mz = mz # calculate this later
        
    def _get_mz(self):
        return self.mz
    
    def _get_n_element(self, element):
        if self.formula_string == element:
            return 1
        split_formula = self.formula_string.split(element)
        if(len(split_formula)==1):
            return 0
        for i in range(len(split_formula)):
            if split_formula[i+1][0].isdigit():
                return float(re.split('[A-Z]+',split_formula[i+1])[0])
            else:
                if split_formula[i+1][0].islower():
                    pass
                else:
                    return 1
        return 0

In [30]:
class Isotopes(object):
    def __init__(self, formula):
        self.formula = formula
        self.C12_proportion = 0.989
        self.mz_diff = 1.0033548378
        
    def get_isotopes(self, total_proportion):
        # update this to work properly
        peaks = [() for i in range(len(self._get_isotope_proportions(total_proportion)))]
        for i in range(len(peaks)):
            peaks[i] += (self._get_isotope_mz(self._get_isotope_names(i)),)
            peaks[i] += (self._get_isotope_proportions(total_proportion)[i],)
            peaks[i] += (self._get_isotope_names(i),)
        return peaks
    # outputs [(mz_1, intensity_proportion_1, isotope_name_1),...,(mz_n, intensity_proportion_n, isotope_name_n)]
    
    def _get_isotope_proportions(self, total_proportion):
        proportions = [] 
        while sum(proportions) < total_proportion:
            proportions.extend([scipy.stats.binom.pmf(len(proportions),self.formula._get_n_element("C"),1-self.C12_proportion)])        
        normalised_proportions = [proportions[i]/sum(proportions) for i in range(len(proportions))]
        return normalised_proportions
    
    def _get_isotope_names(self, isotope_number):
        if isotope_number == 0:
            return "Mono"
        else:
            return str(isotope_number) + "C13"
    
    def _get_isotope_mz(self, isotope):
        if isotope == "Mono":
            return self.formula._get_mz()
        elif isotope[-3:] == "C13":
            return self.formula._get_mz() - float(isotope.split("C13")[0]) * self.mz_diff
        else:
            return None
            # turn this into a proper function

In [81]:
class Aducts(object):
    def __init__(self, formula):
        self.aduct_names = ["M+H", "[M+ACN]+H", "[M+CH3OH]+H", "[M+NH3]+H"] # remove eventually
        self.formula = formula
        
    def get_aducts(self):
        aducts = []
        proportions = self._get_aduct_proportions()
        for j in range(len(self.aduct_names)):
            if proportions[j] != 0:
                aducts.extend([(self._get_aduct_names()[j], proportions[j])])
        return aducts
    
    def _get_aduct_proportions(self):
        # replace this with something proper
        proportions = np.random.binomial(1,0.1,3) * np.random.uniform(0.1,0.2,3)
        proportions = [1-sum(proportions)] + proportions.tolist()
        return proportions
    
    def _get_aduct_names(self):
        return self.aduct_names

In [82]:
# requires you to run stuff below to test
chrom = FunctionalChromatogram("normal", [0,1])
formula = Formula("C22HgFS",100)
isotopes = Isotopes(formula)
aducts = Aducts(formula)
chem = KnownChemical(formula,isotopes,aducts,100, 10000, chrom, None)
print(chem.formula.formula_string)
print(chem.formula._get_n_element("C"))
print(chem.isotopes)
print(chem.aducts)

C22HgFS
22.0
[(100, 0.7853807179964822, 'Mono'), (98.9966451622, 0.19217607053099056, '1C13'), (97.9932903244, 0.022443211472527185, '2C13')]
[('M+H', 1.0)]


In [83]:
def aductTransformation(mz, aduct):
    if aduct == "M+H":
        return (mz - 1.007276)
    elif aduct == "[M+ACN]+H":
        return (mz - 42.03383)
    elif aduct == "[M+CH3OH]+H":
        return (mz - 33.03349)
    elif aduct == "[M+NH3]+H":
        return (mz - 18.03383)    
    else:
        return None
    # turn this into a proper function

In [84]:
class Chemical(object):
    
    def __repr__(self):
        raise NotImplementedError()
        
    def get_all_mz_peaks(self, query_rt, ms_level, isolation_windows):
        if ms_level == 1:
            if not self._rt_match(query_rt):
                return None
        mz_peaks = []
        for which_isotope in range(len(self.isotopes)):
            for which_aduct in range(len(self._get_aducts()[which_isotope])):
                mz_peaks.extend(self._get_mz_peaks(query_rt, ms_level, isolation_windows, which_isotope, which_aduct))
        if mz_peaks == []:
            return None
        else:
            return mz_peaks

    def _get_mz_peaks(self, query_rt, ms_level, isolation_windows, which_isotope, which_aduct):
        mz_peaks = []
        if ms_level ==1 and self.ms_level == 1:
            if self._isolation_match(query_rt, isolation_windows[0], which_isotope, which_aduct): # check just first set of windows
                intensity = self._get_intensity(query_rt, which_isotope, which_aduct)
                mz = self._get_mz(query_rt, which_isotope, which_aduct)
                mz_peaks.extend([(mz, intensity)])
        elif ms_level > 1 and which_isotope > 0:
            pass
        elif ms_level == self.ms_level:
            intensity = self._get_intensity(query_rt, which_isotope, which_aduct)
            mz = self._get_mz(query_rt, which_isotope, which_aduct)
            return [(mz, intensity)]
        else:
            if self._isolation_match(query_rt, isolation_windows[self.ms_level-1], which_isotope, which_aduct) and self.children != None:
                for i in range(len(self.children)):
                    mz_peaks.extend(self.children[i]._get_mz_peaks(query_rt, ms_level, isolation_windows, which_isotope, which_aduct))
            else:
                return []
        return mz_peaks
        
    def _get_aducts(self):
        if self.ms_level == 1:
            return self.aducts
        else:
            return self.parent._get_aducts()
        
    def _rt_match(self, query_rt):
        if self.ms_level == 1:
            if self.chromatogram._rt_match(query_rt - self.rt) == True:
                return True
            else:
                return False
        else:
            True
        
    def _get_intensity(self, query_rt, which_isotope, which_aduct):
        if self.ms_level == 1:
            intensity = self.isotopes[which_isotope][1] * self._get_aducts()[which_isotope][which_aduct][1] * self.max_intensity
            return (intensity * self.chromatogram.get_relative_intensity(query_rt - self.rt))
        else:
            return (self.parent._get_intensity(query_rt, which_isotope, which_aduct) * self.parent_mass_prop)

    def _get_mz(self, query_rt, which_isotope, which_aduct):
        if self.ms_level == 1:
            return (aductTransformation(self.isotopes[which_isotope][0], self._get_aducts()[which_isotope][which_aduct][0]) + self.chromatogram.get_relative_mz(query_rt - self.rt))
        else:
            return (aductTransformation(self.isotopes[which_isotope][0], self._get_aducts()[which_isotope][which_aduct][0]))
            
    def _isolation_match(self, query_rt, isolation_windows, which_isotope, which_aduct):                        
        # assumes list is formated like:
        # [(min_1,max_1),(min_2,max_2),...]
        for window in isolation_windows:
            if (self._get_mz(query_rt, which_isotope, which_aduct) > window[0] and self._get_mz(query_rt, which_isotope, which_aduct) <= window[1]):
                return True
        return False

In [85]:
class UnknownChemical(Chemical):
    """
    Chemical from an unknown chemical formula
    """
    def __init__(self, mz, rt, max_intensity, chromatogram, children = None):
        self.max_intensity = max_intensity
        self.isotopes = [(mz, 1, "Mono")] # [(mz, intensity_proportion, isotope,name)]
        self.aducts = [[("M+H",1)]]
        self.rt = rt
        self.chromatogram = chromatogram
        self.children = children
        self.ms_level = 1
        
    def __repr__(self):
         return 'UnknownChemical mz=%.4f rt=%.2f max_intensity=%.2f' % (self.isotopes[0][0], self.rt, self.isotopes[0][1])

In [86]:
class KnownChemical(Chemical):
    """
    Chemical from an known chemical formula
    """
    def __init__(self, formula, isotopes, aducts, rt, max_intensity, chromatogram, children = None, total_proportion = 0.99):
        self.formula = formula
        self.isotopes = isotopes.get_isotopes(total_proportion)
        self.aducts = aducts.get_aducts()
        self.rt = rt
        self.max_intensity = max_intensity
        self.chromatogram = chromatogram
        self.children = children
        self.ms_level = 1
    
    def __repr__(self):
         return 'KnownChemical - %r' % (self.formula.formula_string)

In [87]:
class MSN(Chemical):
    """
    ms2+ fragments
    """
    def __init__(self, mz, ms_level, parent_mass_prop, children=None, parent= None):
        self.isotopes = [(mz,None,"MSN")]
        self.ms_level = ms_level
        self.parent_mass_prop = parent_mass_prop
        self.children = children
        self.parent = parent
        
    def __repr__(self):
         return 'MSN Fragment mz=%.4f ms_level=%d' % (self.isotopes[0][0], self.ms_level)

In [88]:
chrom = FunctionalChromatogram("normal", [0,1])
frag1 = MSN(50,2,0.5)
frag2 = MSN(55,2,0.7)
chem = UnknownChemical(100, 100, 10000, chrom, [frag1, frag2])
frag1.parent = chem
frag2.parent = chem

frag1_1 = MSN(30,3,0.3,None,frag1)
frag1_2 = MSN(35,3,0.5,None,frag1)
frag1.children = [frag1_1, frag1_2]

In [89]:
chem.get_all_mz_peaks(101,1,[[(0,300)]])

[(98.992724, 1164.2509265114713)]

In [90]:
chem.get_all_mz_peaks(101,2,[[(0,300)]])

[(48.992724, 582.1254632557357), (53.992724, 814.9756485580299)]

In [91]:
chem.get_all_mz_peaks(101,3,[[(0,300)],[(0,300)]])

[(28.992724, 174.6376389767207), (33.992724, 291.06273162786783)]

In [92]:
# this doesnt work - will fix shortly
chrom = FunctionalChromatogram("normal", [0,1])
frag1 = MSN(50,2,0.5)
frag2 = MSN(55,2,0.7)
chrom = FunctionalChromatogram("normal", [0,1])
formula = Formula("C22HgFS",100)
isotopes = Isotopes(formula)
aducts = Aducts(formula)
chem2 = KnownChemical(formula,isotopes,aducts,100, 10000, chrom, None)
frag1.parent = chem2
frag2.parent = chem2

frag1_1 = MSN(30,3,0.3,None,frag1)
frag1_2 = MSN(35,3,0.5,None,frag1)
frag1.children = [frag1_1, frag1_2]

In [93]:
chem2.get_all_mz_peaks(101,1,[[(0,300)]])

TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'

In [323]:
chem2.get_all_mz_peaks(101,2,[[(0,300)]])

[(51, 419.1303335441297), (56, 586.7824669617816), (100, 46.57003706045886), (105, 65.1980518846424)]

In [324]:
chem2.get_all_mz_peaks(101,3,[[(0,300)],[(0,300)]])

[(31, 125.73910006323891), (36, 209.56516677206486), (80, 13.971011118137657), (85, 23.28501853022943)]

# Data Generator

In [None]:
class ChemicalCreater(object):
    def __init__(self, density_estimator, chemical_type = None, n_peaks = None, formula_list = None):
        self.density_estimator = density_estimator
        self.chemical_type = chemical_type
        self.n_peaks = n_peaks
        self.formula_list = formula_list
        
    def sample(self, ms_level, chemical_type = None):
        chemicals = []
        
        return chemicals

In [None]:
# this samples the dataset using ChemicalCreater
class Sample_Dataset(object):























