Machine-Learning Pipeline for BME261L

In [1]:
#Imports
import warnings 
warnings.simplefilter('ignore')
import pandas as pd 
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

#models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier as ABC
#other 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
%matplotlib inline

In [2]:
def downsample_data(data, labels, n_splits=100):
    """Return a class-stratified subsample of ``data`` and ``labels``.

    The rows are partitioned with ``StratifiedKFold`` and only the held-out
    part of the first split is kept, i.e. roughly ``1 / n_splits`` of the rows.
    No shuffling is requested, so the same rows are selected on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature rows; indexed positionally with ``.iloc``.
    labels : pandas.Series
        Class label for each row of ``data``; indexed positionally with ``.iloc``.
    n_splits : int, default 100
        Number of folds. The default keeps the original ~1% subsample.

    Returns
    -------
    (downData, downLabels)
        The selected rows of ``data`` and the matching entries of ``labels``.
    """
    kfold = StratifiedKFold(n_splits=n_splits)
    # Only the first split is needed; its "test" indices are the subsample and
    # the "train" indices are discarded.
    _, subsample_index = next(kfold.split(data, labels))
    return data.iloc[subsample_index], labels.iloc[subsample_index]

In [3]:
# Load the heart-rate, acceleration and sleep-label recordings, then join them
# on the shared `time_30` column.
# File names
heart_rate_path = '1360686_heartrate.csv'
motion_path = '1360686_acceleration.csv'
label_path = '1360686_labeled_sleep.csv'

# Read each file with explicit column names and discard the `time` column
# straight away; only `time_30` is used for joining.
heart_rate = pd.read_csv(
    heart_rate_path, names=['time', 'heart_rate', 'time_30']
).drop(columns='time')
motion = pd.read_csv(
    motion_path, names=['time', 'motion_x', 'motion_y', 'motion_z', 'time_30']
).drop(columns='time')
labels = pd.read_csv(
    label_path, names=['time', 'label', 'time_30']
).drop(columns='time')

# Quick size check of the three inputs
for _caption, _frame in (('HR:', heart_rate), ('Motion:', motion), ('Labels:', labels)):
    print(_caption, _frame.shape)

# Join heart rate with labels first, then attach the result to the motion rows.
# NOTE(review): `time_30` repeats inside heart_rate and motion (see the repeated
# rows in the preview below), so this inner join emits every HR x motion pairing
# for a given `time_30`. The resulting near-duplicate rows can end up on both
# sides of a CV split -- confirm this is intended before trusting the scores.
data = motion.merge(heart_rate.merge(labels, on='time_30'), on='time_30')
#data.to_csv('test_join.csv') #uncomment to write to csv
data.head(10)


HR: (3592, 2)
Motion: (931576, 4)
Labels: (633, 2)


Unnamed: 0,motion_x,motion_y,motion_z,time_30,heart_rate,label
0,0.019989,-0.338638,-0.943909,42,75,0
1,0.019989,-0.338638,-0.943909,42,74,0
2,0.019989,-0.338638,-0.943909,42,69,0
3,0.019989,-0.338638,-0.943909,42,69,0
4,0.019989,-0.338638,-0.943909,42,69,0
5,0.019989,-0.338638,-0.943909,42,69,0
6,0.019989,-0.338638,-0.943909,42,69,0
7,0.02179,-0.336166,-0.936584,42,75,0
8,0.02179,-0.336166,-0.936584,42,74,0
9,0.02179,-0.336166,-0.936584,42,69,0


In [4]:
#separate features from label
#(selected by name so the split does not silently depend on column order)
data_X = data.drop(columns='label')
data_Y = data['label']
#Need to downsample data to save computation time
data_dsample, label_dsample = downsample_data(data_X, data_Y)
#drop time_30 since this is not a measured feature
#Reassign instead of dropping in place: data_dsample is an .iloc selection of
#data_X, and mutating such a selection triggers pandas' chained-assignment
#warning (hidden here by the global warnings filter).
data_dsample = data_dsample.drop(columns='time_30')

#report the size of the downsampled feature matrix
print('New shape of data:', data_dsample.shape)

New shape of data: (53609, 4)


In [5]:
#KNN

#create a scaler
scaler = MinMaxScaler()

#create a KNN classifier
knn = KNeighborsClassifier()
#create a pipeline that does scaling, then KNN (prevent data leakage:
#the scaler is re-fit on the training part of every fold)
pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn)])
#Set up parameters to fine tune
#check nearest neighbors 1-30 (range's upper bound is exclusive, hence 31)
param_grid = {
    'knn__n_neighbors': list(range(1, 31))
}
#Pass the pipeline and the parameters into a GridSearchCV with a 5-fold CV
clf = GridSearchCV(pipe, param_grid, cv=5)
#Nested CV: the grid search is re-run inside each of the 5 outer folds and the
#out-of-fold predictions are collected
Y_pred = cross_val_predict(clf, data_dsample, label_dsample, cv=5)
#Report
report = classification_report(label_dsample, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6487
           1       1.00      1.00      1.00      1439
           2       1.00      1.00      1.00     22652
           3       1.00      1.00      1.00     14053
           5       1.00      1.00      1.00      8978

    accuracy                           1.00     53609
   macro avg       1.00      1.00      1.00     53609
weighted avg       1.00      1.00      1.00     53609



In [6]:
#Naive Bayes
#Gaussian NB is used as-is: this cell has no scaling step and no parameter grid,
#so the estimator goes straight into cross-validation.
clf = GaussianNB()
#collect out-of-fold predictions from a 10-fold CV
Y_pred = cross_val_predict(estimator=clf, X=data_dsample, y=label_dsample, cv=10)
#per-class precision / recall / f1 against the true labels
report = classification_report(y_true=label_dsample, y_pred=Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6487
           1       1.00      0.99      1.00      1439
           2       1.00      1.00      1.00     22652
           3       1.00      1.00      1.00     14053
           5       1.00      1.00      1.00      8978

    accuracy                           1.00     53609
   macro avg       1.00      1.00      1.00     53609
weighted avg       1.00      1.00      1.00     53609



In [7]:
#SVM
#create a scaler
#(previously assigned to `scalar` while the pipeline used `scaler`, so this
#cell only worked when the KNN cell had already been run)
scaler = MinMaxScaler()
#create a SVM classifier
svm = SVC()
#create a pipeline that does scaling, then SVM
pipe = Pipeline(steps=[('scaler', scaler), ('svm', svm)])
#Set up parameters to fine tune
#tune for best kernel
param_grid = {
    'svm__kernel':['linear', 'rbf', 'poly', 'sigmoid']
    }
#grid search and CV
clf = GridSearchCV(pipe, param_grid, cv=5)
Y_pred = cross_val_predict(clf, data_dsample, label_dsample, cv=5)
#Report
report = classification_report(label_dsample, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.96      0.98      6487
           1       0.83      1.00      0.91      1439
           2       1.00      1.00      1.00     22652
           3       1.00      1.00      1.00     14053
           5       1.00      1.00      1.00      8978

    accuracy                           0.99     53609
   macro avg       0.97      0.99      0.98     53609
weighted avg       1.00      0.99      0.99     53609



In [8]:
#Neural Net
#create a scaler
#(previously assigned to `scalar` while the pipeline used `scaler`, so this
#cell only worked when the KNN cell had already been run)
scaler = MinMaxScaler()
#create a Neural Net classifier
#fixed random_state so repeated runs of this cell give the same result
mlp = MLPClassifier(random_state=0)
#create a pipeline that does scaling, then the neural net
pipe = Pipeline(steps=[('scaler', scaler), ('mlp', mlp)])
#Set up parameters to fine tune
#tune for best hidden layer size and activation
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (30,),
                                (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu']
    }
#grid search and CV
clf = GridSearchCV(pipe, param_grid, cv=5)
Y_pred = cross_val_predict(clf, data_dsample, label_dsample, cv=5)
#Report
report = classification_report(label_dsample, Y_pred)
print(report)



              precision    recall  f1-score   support

           0       1.00      0.99      1.00      6487
           1       0.97      1.00      0.98      1439
           2       1.00      1.00      1.00     22652
           3       1.00      1.00      1.00     14053
           5       1.00      1.00      1.00      8978

    accuracy                           1.00     53609
   macro avg       0.99      1.00      1.00     53609
weighted avg       1.00      1.00      1.00     53609



In [9]:
#Random Forest
#create a random forest classifier
#fixed random_state so repeated runs of this cell give the same result
rfc = RFC(random_state=0)
#Set up parameters to fine tune
#tune for best max depth (35-55), min samples per leaf and max features
max_depth_lst = list(range(35,56))
param_grid = {'max_depth': max_depth_lst,
              'min_samples_leaf': [8, 10, 12],
              'max_features': ['sqrt', 'log2']}
#grid search and CV (no scaling step is used in this cell)
clf = GridSearchCV(rfc, param_grid, cv=5)
Y_pred = cross_val_predict(clf, data_dsample, label_dsample, cv=5)
#Report
report = classification_report(label_dsample, Y_pred)
print(report)


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6487
           1       0.99      1.00      1.00      1439
           2       1.00      1.00      1.00     22652
           3       1.00      1.00      1.00     14053
           5       1.00      1.00      1.00      8978

    accuracy                           1.00     53609
   macro avg       1.00      1.00      1.00     53609
weighted avg       1.00      1.00      1.00     53609



In [10]:
#AdaBoostClassifier
#create an AdaBoostClassifier
#fixed random_state so repeated runs of this cell give the same result
abc = ABC(random_state=0)
#params: number of boosting rounds, 50 to 250 in steps of 25
est_lst = list(range(50,251, 25))
param_grid = {'n_estimators': est_lst}
#grid search 
clf = GridSearchCV(abc, param_grid, cv=5)
#cross validation
Y_pred = cross_val_predict(clf, data_dsample, label_dsample, cv=5)
#Report
#NOTE(review): the recorded report shows 0.00 precision/recall for class 1;
#an all-zero row like that normally comes with an undefined-metric warning,
#which the global warnings filter would hide -- confirm.
report = classification_report(label_dsample, Y_pred)
print(report)


              precision    recall  f1-score   support

           0       1.00      0.76      0.87      6487
           1       0.00      0.00      0.00      1439
           2       1.00      1.00      1.00     22652
           3       0.83      1.00      0.90     14053
           5       1.00      1.00      1.00      8978

    accuracy                           0.94     53609
   macro avg       0.76      0.75      0.75     53609
weighted avg       0.93      0.94      0.93     53609

