In [1]:
import sys
sys.path.append('../codes')

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline

In [4]:
import numpy as np
import pandas as pd
import pylab as plt

from VMSfunctions.Common import *
from VMSfunctions.DataGenerator import *
from VMSfunctions.Chromatograms import *

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, confusion_matrix, accuracy_score, precision_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.svm import SVC 

In [6]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras.optimizers import SGD

Using TensorFlow backend.


In [7]:
import GPy

In [8]:
set_log_level_debug()

In [9]:
base_dir = 'C:\\Users\\joewa\\University of Glasgow\\Vinny Davies - CLDS Metabolomics Project\\'
base_dir = 'C:\\Users\\Vinny\\OneDrive - University of Glasgow\\CLDS Metabolomics Project\\'

# Functions

In [10]:
def one_hot_encode_object_array(arr):
    '''One hot encode a numpy array of objects (e.g. strings)'''
    uniques, ids = np.unique(arr, return_inverse=True)
    return np_utils.to_categorical(ids, len(uniques))

In [11]:
class CreatePeaksDataset(object):
    def __init__(self, DataSource, min_size, proportion_train):
        self.min_size = min_size
        mzs = []
        rts = []
        intensities = []
        statuses = []
        dict_keys = list(DataSource.all_rois.keys())
        for i in range(len(dict_keys)):
            for j in range(len(DataSource.all_rois[dict_keys[i]]['rois'])):
                new_roi = DataSource.all_rois[dict_keys[i]]['rois'][j]
                num_scans = len(new_roi.mzs())
                if num_scans >= min_size:
                    mzs.append(new_roi.mzs().tolist()[0:min_size])
                    intensities.append(new_roi.intensities().tolist()[0:min_size])
                    rts.append(new_roi.rts().tolist()[0:min_size])
                    statuses.append(new_roi.pickedPeak)
        mzs = np.array(mzs)
        intensities = np.array(intensities)
        rts = np.array(rts)
        max_intensities = intensities.max(axis=1)
        norm_intensities = intensities / max_intensities[:,None]
        self.train_mzs, self.test_mzs, self.train_rts, self.test_rts, self.train_intensities, self.test_intensities, self.train_norm_intensities, self.test_norm_intensities, self.train_max_intensities, self.test_max_intensities, self.train_statuses, self.test_statuses = train_test_split(mzs, rts, intensities, norm_intensities, max_intensities, statuses, train_size=proportion_train, test_size=1-proportion_train, random_state=0)

# Load Data

In [13]:
mzml_path = base_dir + 'Data\\multibeers_urine_data\\beers\\fullscan'
xcms_roi_file = base_dir + 'Data\\multibeers_urine_data\\beers\\fullscan\\rois.csv'
extracted_roi_file = base_dir + 'Trained Models\\all_rois.p'

In [14]:
ds = DataSource()

In [15]:
ds.load_data(mzml_path)

INFO:DataSource:Loading Beer_multibeers_10_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_10_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_11_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_11_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_12_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_13_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_12_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_13_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_14_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_14_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_15_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_16_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_15_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_16_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_17_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_18_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_17_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_18_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_19_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_1_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_19_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_1_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_2_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_2_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_3_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_4_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_3_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_4_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_5_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_6_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_5_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_6_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_7_fullscan1.mzML
INFO:DataSource:Loading Beer_multibeers_8_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_7_fullscan1.mzML
C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_8_fullscan1.mzML


INFO:DataSource:Loading Beer_multibeers_9_fullscan1.mzML


C:\Users\Vinny\OneDrive - University of Glasgow\CLDS Metabolomics Project\Data\multibeers_urine_data\beers\fullscan\Beer_multibeers_9_fullscan1.mzML


In [16]:
ds.extract_roi(xcms_roi_file)

INFO:DataSource:Creating ROI objects for Beer_multibeers_1_fullscan1.mzML
DEBUG:DataSource:     0/ 31525
DEBUG:DataSource: 10000/ 31525
DEBUG:DataSource: 20000/ 31525
DEBUG:DataSource: 30000/ 31525
INFO:DataSource:Extracted 31525 ROIs for Beer_multibeers_1_fullscan1.mzML
INFO:DataSource:Creating ROI objects for Beer_multibeers_10_fullscan1.mzML
DEBUG:DataSource:  8475/ 33161
DEBUG:DataSource: 18475/ 33161
DEBUG:DataSource: 28475/ 33161
INFO:DataSource:Extracted 33161 ROIs for Beer_multibeers_10_fullscan1.mzML
INFO:DataSource:Creating ROI objects for Beer_multibeers_11_fullscan1.mzML
DEBUG:DataSource:  5314/ 31424
DEBUG:DataSource: 15314/ 31424
DEBUG:DataSource: 25314/ 31424
INFO:DataSource:Extracted 31424 ROIs for Beer_multibeers_11_fullscan1.mzML
INFO:DataSource:Creating ROI objects for Beer_multibeers_12_fullscan1.mzML
DEBUG:DataSource:  3890/ 28152
DEBUG:DataSource: 13890/ 28152
DEBUG:DataSource: 23890/ 28152
INFO:DataSource:Extracted 28152 ROIs for Beer_multibeers_12_fullscan1.mzML

In [17]:
ds.populate_roi()

INFO:DataSource:Populating ROI objects for Beer_multibeers_1_fullscan1.mzML
DEBUG:DataSource:     0/  1109 processing spectrum <__main__.Spectrum object with native ID 1 at 0x113fb799e48>
DEBUG:DataSource:   100/  1109 processing spectrum <__main__.Spectrum object with native ID 201 at 0x113fb9ff4a8>
DEBUG:DataSource:   200/  1109 processing spectrum <__main__.Spectrum object with native ID 401 at 0x113fbc45ba8>
DEBUG:DataSource:   300/  1109 processing spectrum <__main__.Spectrum object with native ID 601 at 0x113fbe9cc50>
DEBUG:DataSource:   400/  1109 processing spectrum <__main__.Spectrum object with native ID 801 at 0x113fc0f0d68>
DEBUG:DataSource:   500/  1109 processing spectrum <__main__.Spectrum object with native ID 1001 at 0x113fc343ef0>
DEBUG:DataSource:   600/  1109 processing spectrum <__main__.Spectrum object with native ID 1201 at 0x113fc59ae80>
DEBUG:DataSource:   700/  1109 processing spectrum <__main__.Spectrum object with native ID 1401 at 0x113fc7eff98>
DEBUG:DataS

DEBUG:DataSource:  1000/  1113 processing spectrum <__main__.Spectrum object with native ID 2001 at 0x113eeeebb38>
DEBUG:DataSource:  1100/  1113 processing spectrum <__main__.Spectrum object with native ID 2201 at 0x113ef140da0>
INFO:DataSource:Populating ROI objects for Beer_multibeers_15_fullscan1.mzML
DEBUG:DataSource:     0/  1121 processing spectrum <__main__.Spectrum object with native ID 1 at 0x113ef24f978>
DEBUG:DataSource:   100/  1121 processing spectrum <__main__.Spectrum object with native ID 201 at 0x113ef4a3860>
DEBUG:DataSource:   200/  1121 processing spectrum <__main__.Spectrum object with native ID 401 at 0x113f06c7908>
DEBUG:DataSource:   300/  1121 processing spectrum <__main__.Spectrum object with native ID 601 at 0x113f091c860>
DEBUG:DataSource:   400/  1121 processing spectrum <__main__.Spectrum object with native ID 801 at 0x113f0b739e8>
DEBUG:DataSource:   500/  1121 processing spectrum <__main__.Spectrum object with native ID 1001 at 0x113f0dc9b70>
DEBUG:Data

DEBUG:DataSource:   600/  1101 processing spectrum <__main__.Spectrum object with native ID 1201 at 0x113ff01f5c0>
DEBUG:DataSource:   700/  1101 processing spectrum <__main__.Spectrum object with native ID 1401 at 0x113ff2687b8>
DEBUG:DataSource:   800/  1101 processing spectrum <__main__.Spectrum object with native ID 1601 at 0x113ff4bd978>
DEBUG:DataSource:   900/  1101 processing spectrum <__main__.Spectrum object with native ID 1801 at 0x113ff712a58>
DEBUG:DataSource:  1000/  1101 processing spectrum <__main__.Spectrum object with native ID 2001 at 0x113ff967b00>
DEBUG:DataSource:  1100/  1101 processing spectrum <__main__.Spectrum object with native ID 2201 at 0x113ffbbc358>
INFO:DataSource:Populating ROI objects for Beer_multibeers_3_fullscan1.mzML
DEBUG:DataSource:     0/  1119 processing spectrum <__main__.Spectrum object with native ID 1 at 0x113ffc826a0>
DEBUG:DataSource:   100/  1119 processing spectrum <__main__.Spectrum object with native ID 201 at 0x113ffed7be0>
DEBUG:Da

DEBUG:DataSource:   400/  1087 processing spectrum <__main__.Spectrum object with native ID 801 at 0x1138ba7a128>
DEBUG:DataSource:   500/  1087 processing spectrum <__main__.Spectrum object with native ID 1001 at 0x1138bcd3278>
DEBUG:DataSource:   600/  1087 processing spectrum <__main__.Spectrum object with native ID 1201 at 0x1138bf1c4e0>
DEBUG:DataSource:   700/  1087 processing spectrum <__main__.Spectrum object with native ID 1401 at 0x1138d13f748>
DEBUG:DataSource:   800/  1087 processing spectrum <__main__.Spectrum object with native ID 1601 at 0x1138d396be0>
DEBUG:DataSource:   900/  1087 processing spectrum <__main__.Spectrum object with native ID 1801 at 0x1138d5ead30>
DEBUG:DataSource:  1000/  1087 processing spectrum <__main__.Spectrum object with native ID 2001 at 0x1138d841940>
INFO:DataSource:Populating ROI objects for Beer_multibeers_9_fullscan1.mzML
DEBUG:DataSource:     0/  1108 processing spectrum <__main__.Spectrum object with native ID 1 at 0x1138db01eb8>
DEBUG:Da

# Logistic Regression

In [36]:
data8 = CreatePeaksDataset(ds, 8, 0.9)

In [45]:
train8 = np.array([np.log(a + b) for a, b in zip([[i] for i in data8.train_max_intensities.tolist()], data8.train_norm_intensities.tolist())])
test8 = [np.log(a + b) for a, b in zip([[i] for i in data8.test_max_intensities.tolist()], data8.test_norm_intensities.tolist())]

In [49]:
lr8 = LogisticRegression()
lr8.fit(train8, data8.train_statuses)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Make predictions and print results

In [51]:
lr_predict8 = lr8.predict(test8)

In [53]:
print(np.array(confusion_matrix(data8.test_statuses, lr_predict8)).transpose())
print( )

[[17041  3506]
 [ 1835  6013]]



use CNN on data padded to max length. test how it does on truncated peaks