Machine-Learning Pipeline for BME261L

In [66]:
#Imports
import warnings 
warnings.simplefilter('ignore')
import pandas as pd 
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt

#models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier as ABC
#other 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

%matplotlib inline

In [49]:
#Read in CSV
col_names = ['heart_rate', 'motion', 'time', 'labels']
#Base folder and subject id are defined once so the notebook can be pointed
#at another machine's copy of the data (or another subject) in one place.
DATA_DIR = 'C:/Users/sarah/OneDrive/Documents/BME261L/Data_Analysis/Dummy_Data'
SUBJECT_ID = '46343'
heart_rate_txt = DATA_DIR + '/heart_rate/' + SUBJECT_ID + '_heartrate.txt'
motion_txt = DATA_DIR + '/motion/' + SUBJECT_ID + '_acceleration.csv'
label_txt = DATA_DIR + '/labels/' + SUBJECT_ID + '_labeled_sleep.csv'
#The files are read without a header row; column names are supplied here.
heart_rate = pd.read_csv(heart_rate_txt, names=['time','heart_rate'])
motion = pd.read_csv(motion_txt, names=['time','motion_x', 'motion_y', 'motion_z'])
labels = pd.read_csv(label_txt, names=['time', 'label'])

#The three files are paired purely by row position, NOT matched on their
#time stamps: the first len(labels) heart-rate and motion rows are attached
#to the labels. This is probably not the correctly labeled data, but this
#dummy data is not used to select the actual model, so it is acceptable here.
n_rows = len(labels)
if min(len(heart_rate), len(motion)) < n_rows:
    #Fail early instead of silently creating NaN rows that break the models.
    raise ValueError('heart rate / motion files have fewer rows than the '
                     'label file (%d rows)' % n_rows)

#create one df (its row index comes from the labels file)
data = pd.DataFrame()
data['time'] = labels['time']
#append heart rate
data['heart_rate'] = heart_rate['heart_rate'].iloc[:n_rows]
#append motion
data['motion_x'] = motion['motion_x'].iloc[:n_rows]
data['motion_y'] = motion['motion_y'].iloc[:n_rows]
data['motion_z'] = motion['motion_z'].iloc[:n_rows]
#append labels (kept as the last column; the feature/label split relies on it)
data['label'] = labels['label']
print(data.head(10))
print(data.shape)

   time  heart_rate  motion_x  motion_y  motion_z  label
0   390          57  0.017487 -0.586700 -0.805771      0
1   420          56  0.018982 -0.589676 -0.809158      0
2   450          56  0.020966 -0.580887 -0.815048      0
3   480          57  0.019485 -0.580872 -0.813583      0
4   510          59  0.016998 -0.587204 -0.806259      0
5   540          61  0.019959 -0.593094 -0.806198      0
6   570          98  0.024399 -0.586258 -0.811585      0
7   600          90  0.017929 -0.565567 -0.803955      0
8   630          94  0.018967 -0.579376 -0.810684      0
9   660          88  0.033249 -0.592117 -0.807114      0
(554, 6)


In [50]:
#separate features from label
#the label is the last column of data; every column before it is a feature
#NOTE(review): this keeps `time` as a model input -- confirm that is intended
data_X = data.iloc[:, :-1]
data_Y = data.iloc[:, -1]

In [55]:
#KNN

#create a scaler
scaler = MinMaxScaler()

#create a KNN classifier
knn = KNeighborsClassifier()
#create a pipeline that does scaling, then KNN; keeping the scaler inside
#the pipeline means it is re-fit on the training part of every split
#(prevent data leakage)
pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn)])
#Set up parameters to fine tune
#check nearest neighbors 1-30 inclusive (range excludes its stop value, so 31)
param_grid = {
    'knn__n_neighbors': list(range(1, 31))
}
#Pass the pipeline and the parameters into a GridSearchCV with a 5-fold CV
clf = GridSearchCV(pipe, param_grid, cv=5)
#outer 5-fold CV: the grid search is re-run on each outer training split and
#predicts the held-out split, so every row gets an out-of-fold prediction
Y_pred = cross_val_predict(clf, data_X, data_Y, cv=5)
#Report
report = classification_report(data_Y, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.48      0.58      0.52        85
           1       0.07      0.07      0.07        29
           2       0.59      0.32      0.42       170
           3       0.62      0.67      0.64       156
           5       0.55      0.77      0.64       114

   micro avg       0.54      0.54      0.54       554
   macro avg       0.46      0.48      0.46       554
weighted avg       0.54      0.54      0.52       554



In [53]:
#Naive Bayes
#Gaussian NB classifier; no hyper-parameters are tuned in this cell,
#so there is no scaler, pipeline or grid search
clf = GaussianNB()
#out-of-fold predictions from a 10-fold CV
Y_pred = cross_val_predict(estimator=clf, X=data_X, y=data_Y, cv=10)
#Report: per-class precision / recall / f1 of those predictions
report = classification_report(y_true=data_Y, y_pred=Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.19      0.05      0.08        85
           1       0.00      0.00      0.00        29
           2       0.41      0.55      0.47       170
           3       0.31      0.42      0.36       156
           5       0.16      0.13      0.15       114

   micro avg       0.32      0.32      0.32       554
   macro avg       0.21      0.23      0.21       554
weighted avg       0.27      0.32      0.29       554



In [58]:
#SVM
#create a scaler for this pipeline (defined in this cell so the cell does
#not depend on a variable left behind by an earlier one)
scaler = MinMaxScaler()
#create a SVM classifier
svm = SVC()
#create a pipeline that does scaling, then SVM
pipe = Pipeline(steps=[('scaler', scaler), ('svm', svm)])
#Set up parameters to fine tune
#tune for best kernel (all other SVC settings stay at their defaults)
param_grid = {
    'svm__kernel':['linear', 'rbf', 'poly', 'sigmoid']
    }
#grid search (inner 5-fold CV) wrapped in an outer 5-fold CV
clf = GridSearchCV(pipe, param_grid, cv=5)
Y_pred = cross_val_predict(clf, data_X, data_Y, cv=5)
#Report
report = classification_report(data_Y, Y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        85
           1       0.00      0.00      0.00        29
           2       0.37      0.56      0.44       170
           3       0.54      0.56      0.55       156
           5       0.35      0.41      0.38       114

   micro avg       0.42      0.42      0.42       554
   macro avg       0.25      0.31      0.28       554
weighted avg       0.34      0.42      0.37       554



In [60]:
#Neural Net
#create a scaler for this pipeline (defined in this cell so the cell does
#not depend on a variable left behind by an earlier one)
scaler = MinMaxScaler()
#create a Neural Net classifier
#random_state is pinned so weight initialisation, and therefore the report,
#is the same on every run
mlp = MLPClassifier(random_state=0)
#create a pipeline that does scaling, then the neural net
pipe = Pipeline(steps=[('scaler', scaler), ('mlp', mlp)])
#Set up parameters to fine tune
#tune for best hidden layer size (one hidden layer of 10-60 units)
#and activation
param_grid = {
    'mlp__hidden_layer_sizes': [(10,), (20,), (30,),
                                (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu']
    }
#grid search (inner 5-fold CV) wrapped in an outer 5-fold CV
clf = GridSearchCV(pipe, param_grid, cv=5)
Y_pred = cross_val_predict(clf, data_X, data_Y, cv=5)
#Report
report = classification_report(data_Y, Y_pred)
print(report)



              precision    recall  f1-score   support

           0       0.31      0.06      0.10        85
           1       0.00      0.00      0.00        29
           2       0.36      0.48      0.41       170
           3       0.55      0.67      0.60       156
           5       0.33      0.36      0.35       114

   micro avg       0.42      0.42      0.42       554
   macro avg       0.31      0.31      0.29       554
weighted avg       0.38      0.42      0.38       554



In [65]:
#Random Forest
#create a random forest classifier
#random_state is pinned so the randomised tree building, and therefore the
#report, is the same on every run
rfc = RFC(random_state=0)
#Set up parameters to fine tune
#tune for best max depth (35-55 inclusive), min samples per leaf and max features
max_depth_lst = list(range(35,56))
param_grid = {'max_depth': max_depth_lst,
              'min_samples_leaf': [8, 10, 12],
              'max_features': ['sqrt', 'log2']}
#grid search (inner 5-fold CV) wrapped in an outer 5-fold CV
#no scaler/pipeline is used for this model
clf = GridSearchCV(rfc, param_grid, cv=5)
Y_pred = cross_val_predict(clf, data_X, data_Y, cv=5)
#Report
report = classification_report(data_Y, Y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.65      0.66      0.65        85
           1       0.00      0.00      0.00        29
           2       0.52      0.44      0.48       170
           3       0.64      0.83      0.73       156
           5       0.64      0.65      0.65       114

   micro avg       0.60      0.60      0.60       554
   macro avg       0.49      0.52      0.50       554
weighted avg       0.57      0.60      0.58       554



In [68]:
#AdaBoostClassifier
#create an AdaBoostClassifier
#random_state is pinned (as for the other ensemble model) so repeated runs
#give the same report
abc = ABC(random_state=0)
#params: number of boosting rounds, 50 to 250 in steps of 25
est_lst = list(range(50,251, 25))
param_grid = {'n_estimators': est_lst}
#grid search (inner 5-fold CV)
clf = GridSearchCV(abc, param_grid, cv=5)
#cross validation (outer 5-fold CV giving one out-of-fold prediction per row)
Y_pred = cross_val_predict(clf, data_X, data_Y, cv=5)
#Report
report = classification_report(data_Y, Y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.51      0.31      0.38        85
           1       0.19      0.10      0.13        29
           2       0.34      0.56      0.43       170
           3       0.49      0.52      0.50       156
           5       0.87      0.35      0.50       114

   micro avg       0.44      0.44      0.44       554
   macro avg       0.48      0.37      0.39       554
weighted avg       0.51      0.44      0.44       554

