In [1]:
import numpy as np
import sys
import scipy.stats

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
sys.path.append('../codes')

In [4]:
%pprint

Pretty printing has been turned OFF


# Chromatogram

In [5]:
class Chromatogram(object):
    
    def get_relative_intensity(self, query_rt):
        raise NotImplementedError()
        
    def get_relative_mz(self, query_rt):
        raise NotImplementedError()
        
    def _rt_match(self, rt):
        raise NotImplementedError()

# Empirical Chromatogram

In [6]:
def chromatogramDensityNormalisation(rts, intensities):
    """
    Definition to standardise the area under a chromatogram to 1. Returns updated intensities
    """
    area = 0.0
    for rt_index in range(len(rts)-1):
        area += ((intensities[rt_index] + intensities[rt_index + 1])/2) / (rts[rt_index+1] - rts[rt_index])
    new_intensities = [x * (1 / area) for x in intensities]
    return new_intensities

In [7]:
chromatogramDensityNormalisation([2,3,4],[0.3,0.3,0.3])

[0.5, 0.5, 0.5]

In [8]:
class EmpiricalChromatogram(Chromatogram):
    """
    Empirical Chromatograms to be used within Chemicals
    """
    def __init__(self, rts, mzs, intensities):
        self.rts = [x - min(rts) for x in rts]
        self.mzs = [x - sum(mzs)/len(mzs) for x in rts] # may want to just set this to 0 and remove from input
        self.intensities = chromatogramDensityNormalisation(rts, intensities)

    def get_relative_intensity(self, query_rt):
        if self._rt_match(query_rt) == False:
            return None
        else:
            return((self.intensities[self._get_rt_neighbours_which(query_rt)[0]] + 
                    (self.intensities[self._get_rt_neighbours_which(query_rt)[1]]
                     -self.intensities[self._get_rt_neighbours_which(query_rt)[0]]) * self._get_distance(query_rt)))
        
    def get_relative_mz(self, query_rt):
        if self._rt_match(query_rt) == False:
            return None
        else:
            return((self.mzs[self._get_rt_neighbours_which(query_rt)[0]] + 
                    (self.mzs[self._get_rt_neighbours_which(query_rt)[1]]
                     -self.mzs[self._get_rt_neighbours_which(query_rt)[0]]) * self._get_distance(query_rt)))
        
    def _get_rt_neighbours(self, query_rt):
        rt_below = max(x for x in self.rts if x <= query_rt)
        rt_above = min(x for x in self.rts if x >= query_rt)
        return([rt_below, rt_above])
    
    def _get_rt_neighbours_which(self, query_rt):
        which_rt_below = self.rts.index(self._get_rt_neighbours(query_rt)[0])
        which_rt_above = self.rts.index(self._get_rt_neighbours(query_rt)[1])
        return([which_rt_below, which_rt_above])
        
    def _get_distance(self, query_rt):
        return((query_rt - self._get_rt_neighbours(query_rt)[0]) / 
               (self._get_rt_neighbours(query_rt)[0] - self._get_rt_neighbours(query_rt)[1]))
    
    def _rt_match(self, query_rt):
        if query_rt < min(self.rts) or query_rt > max(self.rts):
            return False
        else:
            return True

In [9]:
m = EmpiricalChromatogram([0,2,4,6,8],[1,2,3,4,5],[100,200,300,400,500])

In [10]:
print(m.get_relative_intensity(3))
print(m.get_relative_intensity(10))

0.25000000000000006
None


In [11]:
print(m.get_relative_mz(3))
print(m.get_relative_mz(10))

-2.0
None


# Functional Chromatogram

In [12]:
class FunctionalChromatogram(Chromatogram):
    """
    Functional Chromatograms to be used within Chemicals
    """
    def __init__(self, distribution, parameters, cutoff = 0.01):
        self.cutoff = cutoff
        self.mz = 0
        if distribution == "normal":
            self.distrib = scipy.stats.norm(parameters[0],parameters[1])
        elif distribution == "gamma":
            self.distrib = scipy.stats.gamma(parameters[0],parameters[1],parameters[2])
        elif distribution == "uniform":
            self.distrib = scipy.stats.uniform(parameters[0],parameters[1])
        else:
            raise NotImplementedError("distribution not implemented")
            
    def get_relative_intensity(self, query_rt):
        if self._rt_match(query_rt) == False:
            return None
        else:
            return(self.distrib.pdf(query_rt + self.distrib.ppf(self.cutoff/2)) * ( 1 / (1 - self.cutoff)))
        
    def get_relative_mz(self, query_rt):
        if self._rt_match(query_rt) == False:
            return None
        else:
            return self.mz

    def _rt_match(self, query_rt):
        if query_rt < 0 or query_rt > self.distrib.ppf(1-(self.cutoff/2)) - self.distrib.ppf(self.cutoff/2):
            return False
        else:
            return True

In [13]:
m = FunctionalChromatogram("normal",[0,1])

In [14]:
print(m.get_relative_intensity(0))
print(m.get_relative_intensity(6))

0.014605801037290308
None


In [15]:
print(m.get_relative_mz(0))
print(m.get_relative_mz(6))

0
None


# Chemicals

In [16]:
# read this in? https://www.sisweb.com/referenc/source/exactmas.htm

def ElementCharacterisitics(object):
    self.mz_diff = 1.0033548378
    self.carbon_proportions = [0.989, 0.011]   # 12, 13
    self.oxygen_proportions = [0.9976, 0.0004, 0.002] # 16, 17, 18

In [69]:
class Formula(object):
    def __init__(self, formula_string, mz):
        self.formula_string = formula_string
        self.mz = mz # calculate this
        
    def _get_mz(self):
        return self.mz
    
    def _get_n_element(self, element):
        return 1

In [70]:
class Isotopes(object):
    def __init__(self, formula, isotope_proportions, isotope_names):
        self.formula = formula
        self.isotope_proportions = isotope_proportions # remove eventually
        self.isotope_names = isotope_names # remove eventually
        self.mz_diff = 1.0033548378
        
    def get_isotopes(self):
        # update this to work properly
        peaks = []
        for i in range(len(self.isotope_names)):
            peaks.extend([(self._get_isotope_mz(self._get_isotope_names()[i]), self._get_isotope_proportions()[i], self._get_isotope_names()[i])])
        return peaks
    # outputs [(mz_1, intensity_proportion_1, isotope_name_1),...,(mz_n, intensity_proportion_n, isotope_name_n)]
    
    def _get_isotope_proportions(self):
        return self.isotope_proportions
    
    def _get_isotope_names(self):
        return self.isotope_names
    
    def _get_isotope_mz(self, isotope):
        if isotope == "Mono":
            return self.formula._get_mz()
        elif isotope == "1C13":
            return self.formula._get_mz() + self.mz_diff
        elif isotope == "2C13":
            self.formula.get_mz() + 2 * self.mz_diff
        else:
            return None
            # turn this into a proper function

In [71]:
class Aducts(object):
    def __init__(self, formula, aduct_names, aduct_proportions):
        self.aduct_names = aduct_names # remove eventually
        self.aduct_proportions = aduct_proportions # remove eventually
        self.formula = formula
        
    def get_aducts(self):
        aducts = []
        for j in range(len(self.aduct_names)):
            aducts.extend([(self._get_aduct_names()[j], self._get_aduct_proportions()[j])])
        return aducts
    
    def _get_aduct_proportions(self):
        return self.aduct_proportions
    
    def _get_aduct_names(self):
        return self.aduct_names

In [72]:
chrom = FunctionalChromatogram("normal", [0,1])
formula = Formula("hg",100)
isotopes = Isotopes(formula,[0.8,0.2],["Mono","1C13"])
aducts = Aducts(formula,["M+H","M+ACN"],[0.9,0.1])
chem = KnownChemical(formula,isotopes,aducts,100, 10000, chrom, None)
print(chem.formula)
print(chem.isotopes)
print(chem.aducts)

hg
[(100, 0.8, 'Mono'), (101.0033548378, 0.2, '1C13')]
[('M+H', 0.9), ('M+ACN', 0.1)]


In [18]:
def aductTransformation(mz, aduct):
    if aduct == "M+H":
        return (mz + 1)
    elif aduct == "M+ACN":
        return (mz + 50)
    else:
        return None
    # turn this into a proper function

In [42]:
class Chemical(object):
    
    def __repr__(self):
        raise NotImplementedError()
        
    def get_all_mz_peaks(self, query_rt, ms_level, isolation_windows):
        if ms_level == 1:
            if not self._rt_match(query_rt):
                return None
        mz_peaks = []
        for which_isotope in range(len(self.isotopes)):
            for which_aduct in range(len(self._get_aducts()[which_isotope])):
                mz_peaks.extend(self._get_mz_peaks(query_rt, ms_level, isolation_windows, which_isotope, which_aduct))
        if mz_peaks == []:
            return None
        else:
            return mz_peaks

    def _get_mz_peaks(self, query_rt, ms_level, isolation_windows, which_isotope, which_aduct):
        mz_peaks = []
        if ms_level ==1 and self.ms_level == 1:
            if self._isolation_match(query_rt, isolation_windows[0], which_isotope, which_aduct): # check just first set of windows
                intensity = self._get_intensity(query_rt, which_isotope, which_aduct)
                mz = self._get_mz(query_rt, which_isotope, which_aduct)
                mz_peaks.extend([(mz, intensity)])
        elif ms_level > 1 and which_isotope > 0:
            pass
        elif ms_level == self.ms_level:
            intensity = self._get_intensity(query_rt, which_isotope, which_aduct)
            mz = self._get_mz(query_rt, which_isotope, which_aduct)
            return [(mz, intensity)]
        else:
            if self._isolation_match(query_rt, isolation_windows[self.ms_level-1], which_isotope, which_aduct) and self.children != None:
                for i in range(len(self.children)):
                    mz_peaks.extend(self.children[i]._get_mz_peaks(query_rt, ms_level, isolation_windows, which_isotope, which_aduct))
            else:
                return []
        return mz_peaks
        
    def _get_aducts(self):
        if self.ms_level == 1:
            return self.aducts
        else:
            return self.parent._get_aducts()
        
    def _rt_match(self, query_rt):
        if self.ms_level == 1:
            if self.chromatogram._rt_match(query_rt - self.rt) == True:
                return True
            else:
                return False
        else:
            True
        
    def _get_intensity(self, query_rt, which_isotope, which_aduct):
        if self.ms_level == 1:
            intensity = self.isotopes[which_isotope][1] * self._get_aducts()[which_isotope][which_aduct][1] * self.max_intensity
            return (intensity * self.chromatogram.get_relative_intensity(query_rt - self.rt))
        else:
            return (self.parent._get_intensity(query_rt, which_isotope, which_aduct) * self.parent_mass_prop)

    def _get_mz(self, query_rt, which_isotope, which_aduct):
        if self.ms_level == 1:
            return (aductTransformation(self.isotopes[which_isotope][0], self._get_aducts()[which_isotope][which_aduct][0]) + self.chromatogram.get_relative_mz(query_rt - self.rt))
        else:
            return (aductTransformation(self.isotopes[which_isotope][0], self._get_aducts()[which_isotope][which_aduct][0]))
            
    def _isolation_match(self, query_rt, isolation_windows, which_isotope, which_aduct):                        
        # assumes list is formated like:
        # [(min_1,max_1),(min_2,max_2),...]
        for window in isolation_windows:
            if (self._get_mz(query_rt, which_isotope, which_aduct) > window[0] and self._get_mz(query_rt, which_isotope, which_aduct) <= window[1]):
                return True
        return False

In [43]:
class UnknownChemical(Chemical):
    """
    Chemical from an unknown chemical formula
    """
    def __init__(self, mz, rt, max_intensity, chromatogram, children):
        self.max_intensity = max_intensity
        self.isotopes = [(mz, 1, "Mono")] # [(mz, intensity_proportion, isotope,name)]
        self.aducts = [[("M+H",1)]]
        self.rt = rt
        self.chromatogram = chromatogram
        self.children = children
        self.ms_level = 1
        
    def __repr__(self):
         return 'UnknownChemical mz=%.4f rt=%.2f max_intensity=%.2f' % (self.isotopes[0][0], self.rt, self.isotopes[0][1])

In [44]:
class KnownChemical(Chemical):
    """
    Chemical from an known chemical formula
    """
    def __init__(self, formula, isotopes, aducts, rt, max_intensity, chromatogram, children):
        self.formula = formula.formula_string
        self.isotopes = isotopes.get_isotopes()
        self.aducts = aducts.get_aducts()
        self.rt = rt
        self.max_intensity = max_intensity
        self.chromatogram = chromatogram
        self.children = children
        self.ms_level = 1
    
    def __repr__(self):
         return 'KnownChemical - %r' % (self.formula)

In [45]:
class MSN(Chemical):
    """
    ms2+ fragments
    """
    def __init__(self, mz, ms_level, parent_mass_prop, children=None, parent= None):
        self.isotopes = [(mz,None,"MSN")]
        self.ms_level = ms_level
        self.parent_mass_prop = parent_mass_prop
        self.children = children
        self.parent = parent
        
    def __repr__(self):
         return 'MSN Fragment mz=%.4f ms_level=%d' % (self.isotopes[0][0], self.ms_level)

In [315]:
chrom = FunctionalChromatogram("normal", [0,1])
frag1 = MSN(50,2,0.5)
frag2 = MSN(55,2,0.7)
chem = UnknownChemical(100, 100, 10000, chrom, [frag1, frag2])
frag1.parent = chem
frag2.parent = chem

frag1_1 = MSN(30,3,0.3,None,frag1)
frag1_2 = MSN(35,3,0.5,None,frag1)
frag1.children = [frag1_1, frag1_2]

In [316]:
chem.get_all_mz_peaks(101,1,[[(0,300)]])

[(101, 1164.2509265114713)]

In [317]:
chem.get_all_mz_peaks(101,2,[[(0,300)]])

[(51, 582.1254632557357), (56, 814.9756485580299)]

In [319]:
chem.get_all_mz_peaks(101,3,[[(0,300)],[(0,300)]])

[(31, 174.6376389767207), (36, 291.06273162786783)]

In [320]:
chrom = FunctionalChromatogram("normal", [0,1])
frag1 = MSN(50,2,0.5)
frag2 = MSN(55,2,0.7)
f = Formula("hg",[100,101],[0.8,0.2],["Mono","1C13"],[["M+H","M+ACN"],["M+H"]],[[0.9,0.1],[1]])
chem2 = KnownChemical(f, 100, 10000, chrom, [frag1, frag2])
frag1.parent = chem2
frag2.parent = chem2

frag1_1 = MSN(30,3,0.3,None,frag1)
frag1_2 = MSN(35,3,0.5,None,frag1)
frag1.children = [frag1_1, frag1_2]

In [322]:
chem2.get_all_mz_peaks(101,1,[[(0,300)]])

[(101, 838.2606670882594), (150, 93.14007412091772), (102, 232.85018530229425)]

In [323]:
chem2.get_all_mz_peaks(101,2,[[(0,300)]])

[(51, 419.1303335441297), (56, 586.7824669617816), (100, 46.57003706045886), (105, 65.1980518846424)]

In [324]:
chem2.get_all_mz_peaks(101,3,[[(0,300)],[(0,300)]])

[(31, 125.73910006323891), (36, 209.56516677206486), (80, 13.971011118137657), (85, 23.28501853022943)]

# Data Generator

In [None]:
class ChemicalCreater(object):
    def __init__(self, density_estimator, chemical_type = None, n_peaks = None, formula_list = None):
        self.density_estimator = density_estimator
        self.chemical_type = chemical_type
        self.n_peaks = n_peaks
        self.formula_list = formula_list
        
    def sample(self, ms_level, chemical_type = None):
        chemicals = []
        
        return chemicals

In [None]:
# this samples the dataset using ChemicalCreater
class Sample_Dataset(object):























