In [None]:
import matplotlib.pyplot as plt
import numpy as numpy
import pandas as pd

In [None]:
from sklearn.utils import shuffle

# Load the training table and the held-out test table from CSV.
train_data = pd.read_csv('masterfile.csv')
test_data = pd.read_csv('mastertestfile.csv')

# Shuffle the training rows with a fixed seed so that re-running the notebook
# from a fresh kernel yields the same row order (and therefore the same
# cross-validation folds and model fits) every time.
train_data = shuffle(train_data, random_state=42)

print(train_data)
print(test_data)

                                                r_arm  ...                 file
7   [172.62, 170.52, 171.35, 171.26, 172.7, 172.02...  ...  20211024_222035.mp4
53  [173.89, 175.34, 174.72, 173.41, 172.69, 172.8...  ...         Good PY3.mp4
4   [162.02, 162.32, 163.42, 163.67, 162.9, 164.1,...  ...  20211024_221859.mp4
19  [166.55, 166.54, 162.52, 160.17, 159.46, 151.0...  ...          PY Bad2.mp4
52  [175.41, 177.45, 175.5, 173.95, 172.03, 173.82...  ...         Good PY2.mp4
..                                                ...  ...                  ...
66  [283.32, 289.33, 289.23, 296.73, 299.68, 304.1...  ...        ry_good_3.MOV
58  [174.98, 173.59, 174.23, 174.2, 174.24, 173.53...  ...         Good PY8.mp4
1   [163.26, 164.05, 164.12, 164.52, 164.86, 164.8...  ...  20211024_221843.mp4
38  [164.94, 164.74, 164.36, 164.71, 164.3, 164.31...  ...  20211024_221812.mp4
33  [109.08, 79.11, 85.66, 93.44, 84.1, 80.3, 80.5...  ...         ry_bad_6.MOV

[73 rows x 8 columns]
                 

In [None]:
# Split each table into a feature matrix (X) and a label vector (y).
# 'file' holds the source video filename and 'class' is the target, so both
# are removed from the features.
# `columns=` is used instead of a positional axis (`drop([...], 1)`): the
# positional form was deprecated in pandas 1.3 and raises a TypeError in 2.0+.

X_train = train_data.drop(columns=['file', 'class'])
y_train = train_data['class']

X_test = test_data.drop(columns=['file', 'class'])
y_test = test_data['class']

7     0
53    1
4     0
19    0
52    1
     ..
66    1
58    1
1     0
38    1
33    0
Name: class, Length: 73, dtype: int64


In [None]:
# Use normalization 
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling. The scaler is fitted on the training features only and the
# same fitted mapping is then applied to both splits, so the test rows never
# influence the fitted ranges.
# NOTE(review): the recorded run of this cell ended in a ValueError, and the
# frame printed earlier shows list-like text in feature columns (e.g. r_arm).
# Presumably the scaler cannot consume those values - confirm the features are
# numeric before this step.
x_scaler = MinMaxScaler()
x_scaler.fit(X_train, y_train)  # y_train is passed along exactly as before

# Transform the training split first, then the test split.
X_train_norm, X_test_norm = (
    x_scaler.transform(split) for split in (X_train, X_test)
)

ValueError: ignored

### Define an evaluation metric


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score

def evaluate_on_training_set(y_test, y_pred):
  """Print classification metrics for a set of predictions and plot the ROC curve.

  Despite its name, the function evaluates whatever labels are passed in; the
  notebook calls it with the held-out test labels.

  Args:
    y_test: true class labels. `roc_auc_score` and `roc_curve` are called
      without any multiclass options, so binary labels are expected.
    y_pred: predicted class labels aligned with `y_test`.

  Returns:
    None. The AUC, the classification report and the confusion matrix are
    printed, and the ROC curve is drawn on the current matplotlib axes.
  """
  # Compute the AUC once and reuse it for the printout and the legend entry.
  auc = roc_auc_score(y_test, y_pred)
  print("AUC is: ", auc, "\n")

  # print out recall and precision
  print(classification_report(y_test, y_pred))

  # print out confusion matrix
  print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

  # calculate points for ROC curve (the thresholds are not needed for the plot)
  fpr, tpr, _ = roc_curve(y_test, y_pred)

  # Plot ROC curve
  plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % auc)
  plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.0])
  plt.xlabel('False Positive Rate or (1 - Specificity)')
  plt.ylabel('True Positive Rate or (Sensitivity)')
  plt.title('Receiver Operating Characteristic')
  # Without a legend the curve label (and the AUC it carries) is never shown.
  plt.legend(loc='lower right')

### Using Decision trees

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
# Decision tree limited to depth 10, with leaves allowed to hold one sample.
# It is trained on the raw (unscaled) feature frame.
model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=1).fit(X_train, y_train)

# Predict labels for the test rows, show them, then report the metrics.
y_pred = model.predict(X_test)
print(y_pred)
evaluate_on_training_set(y_test, y_pred)

ValueError: ignored

### Using KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 20 closest training rows,
# trained on the min-max scaled features.
model = KNeighborsClassifier(n_neighbors=20).fit(X_train_norm, y_train)

# Predict labels for the scaled test rows, show them, then report the metrics.
y_pred = model.predict(X_test_norm)
print(y_pred)
evaluate_on_training_set(y_test, y_pred)

### Using logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression settings:
#   C           - regularization parameter
#   multi_class - let the estimator pick the multi-class strategy
#   solver      - optimization algorithm
logreg_options = dict(C=1.0, multi_class='auto', solver='liblinear')
model = LogisticRegression(**logreg_options)

# Train on the scaled training features.
model.fit(X_train_norm, y_train)

# Predict labels for the scaled test rows, show them, then report the metrics.
y_pred = model.predict(X_test_norm)
print(y_pred)
evaluate_on_training_set(y_test, y_pred)

### Using Gaussian NB

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with its default settings, trained on the scaled
# training features.
model = GaussianNB().fit(X_train_norm, y_train)

# Predict labels for the scaled test rows, show them, then report the metrics.
y_pred = model.predict(X_test_norm)
print(y_pred)
evaluate_on_training_set(y_test, y_pred)

### Using SVM


In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel and C=10.
# NOTE(review): gamma is documented as the coefficient for the rbf/poly/sigmoid
# kernels, so it presumably has no effect with kernel='linear' - confirm.
svm_options = dict(C=10, gamma='auto', kernel='linear')
model = SVC(**svm_options)

# Train on the scaled training features.
model.fit(X_train_norm, y_train)

# Predict labels for the scaled test rows, show them, then report the metrics.
y_pred = model.predict(X_test_norm)
print(y_pred)
evaluate_on_training_set(y_test, y_pred)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Two candidate grids: an RBF (radial basis function) kernel searched over
# gamma and C, and a linear kernel searched over C only.
rbf_grid = {'kernel': ['rbf'],
            'gamma': [1e-3, 1e-4],
            'C': [1, 10, 100]}
linear_grid = {'kernel': ['linear'],
               'C': [1, 10, 100]}
tuned_parameters = [rbf_grid, linear_grid]

# Grid search over SVC with 2-fold cross-validation, ranking candidates by
# ROC AUC and running up to 4 jobs in parallel.
clf = GridSearchCV(
    SVC(),
    tuned_parameters,
    cv=2,
    scoring='roc_auc',
    verbose=1,
    n_jobs=4,
)

# Run the search on the scaled training features.
clf.fit(X_train_norm, y_train)

# Report the winning parameter combination.
print("Best parameter set found on development set:")
print(clf.best_params_, '\n')

# Predict on the scaled test rows with the searched model and report metrics.
y_pred = clf.predict(X_test_norm)
evaluate_on_training_set(y_test, y_pred)

### Using AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble: 10000 estimators combined with a learning rate of 0.01.
# This model is trained on the raw (unscaled) feature frame.
boost_options = dict(n_estimators=10000, learning_rate=0.01)
model = AdaBoostClassifier(**boost_options)
model.fit(X_train, y_train)

# Predict labels for the test rows, show them, then report the metrics.
y_pred = model.predict(X_test)
print(y_pred)
evaluate_on_training_set(y_test, y_pred)

### Using RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees, trained on the scaled training features.
model = RandomForestClassifier(n_estimators=50).fit(X_train_norm, y_train)

# Predict labels for the scaled test rows and report the metrics
# (the raw predictions are not printed in this cell).
y_pred = model.predict(X_test_norm)
evaluate_on_training_set(y_test, y_pred)

### Using all

Tune the parameters in this section to get the best classification accuracy.

In [None]:
# 5. Train and evaluate multiple models (decision tree, svm with grid search, logistic regression, ensemble models) to find the best classifier.

# The previous `from scipy._lib.six import iteritems` import was dropped: the
# name was never used in this cell, and that private module is no longer
# shipped by current SciPy releases, so the import made the whole cell fail.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier


# Parameter grids for the SVC grid search. The second grid uses the 'linear'
# kernel; 'liblinear' is a LogisticRegression solver name, not an SVC kernel,
# and made every fit of that grid fail.
tuned_parameters = [{'kernel': ['rbf'],
                     'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100]},
                    {'kernel': ['linear'],
                     'C': [1, 10, 100]}]

# Display name -> unfitted estimator. Every estimator is trained on the
# min-max scaled features in the loop below.
model_dict = {
    'Decision Tree': DecisionTreeClassifier(max_depth=3, min_samples_leaf=1),
    'KNN': KNeighborsClassifier(n_neighbors=15),
    'Logistic Regression': LogisticRegression(C=1.0, multi_class='auto', solver='sag'),  # vary solver
    'Naive Bayes Gaussian': GaussianNB(),
    'SVM': SVC(C=10, gamma='auto', kernel='rbf'),  # vary kernel
    'SVM w Grid Search': GridSearchCV(SVC(), tuned_parameters, cv=2, verbose=1, n_jobs=4),  # tweak tuned_parameters
    'Ensemble models': AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=0),  # vary learning_rate
    'Random Forest': RandomForestClassifier(n_estimators=50, oob_score=True),  # vary n_estimators
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=[100] * 5, random_state=1, max_iter=300),  # neural network: 5 hidden layers of 100 units each
    'Gradient Booster': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0),
    'Stochastic Gradient Descent classifier': SGDClassifier(loss='modified_huber', shuffle=True, random_state=101),
}

# One result list per model, filled in the loop as
# [confusion matrix, accuracy, classification report]. Building it from
# model_dict keeps the two dictionaries' keys in sync automatically.
score_matrix_dict = {name: [] for name in model_dict}

max_score = 0  # kept from the original cell; it is not updated below

# IMPORTANT: Evaluate only in terms of classification_report and confusion matrix. (No need for AUC and ROC here, as we are doing multiclass classification, the target label would need to be binarized first, which we are not doing.)
# NOTE(review): the 'class' values printed earlier in the notebook are only 0
# and 1, so this task looks binary rather than multiclass - confirm.

# Run through each model, training and predicting on the normalized features.
for key, model_norm in model_dict.items():
  model_norm.fit(X_train_norm, y_train)

  y_pred_norm = model_norm.predict(X_test_norm)

  # Build the text report once; it is both stored and printed.
  report = classification_report(y_test, y_pred_norm)

  score_matrix_dict[key].append(confusion_matrix(y_test, y_pred_norm))  # store the confusion matrix
  score_matrix_dict[key].append(accuracy_score(y_test, y_pred_norm))
  score_matrix_dict[key].append(report)

  print(key)
  print(report)
  print("\n")

print(score_matrix_dict)

### Using cross val

Model checking only, not used for model building

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, roc_auc_score

# Change solver accordingly
# solver{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’
clf = LogisticRegressionCV(solver='liblinear')

# Scaling lives inside the pipeline so it is re-fitted on the training part of
# every fold.
pipeline = Pipeline([('transformer', MinMaxScaler()), ('estimator', clf)])

# The pipeline is given the raw features (X_train), not X_train_norm.
# X_train_norm was produced by a scaler fitted on all training rows, so using
# it here would scale the data twice and let each validation fold influence
# the ranges used to scale its own training folds.
scores = cross_val_score(pipeline, X_train, y_train, cv=10)  # 10-fold accuracy
AUCscores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='roc_auc')


print('The accuracy of each fold: ')
print(scores)

print ('Average accuracy across folds: ' + str(scores.mean()))


print('The AUC of each fold: ')
print(AUCscores)

print ('Average AUC across folds: ' + str(AUCscores.mean()))


# Out-of-fold predictions for every training row, summarised in one matrix.
print('Final confusion matrix: ')
y_pred = cross_val_predict(pipeline, X_train, y_train, cv=10)
conf_mat = confusion_matrix(y_train, y_pred)

print(conf_mat)


In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import confusion_matrix, roc_auc_score


# Candidate model to sanity-check with cross-validation: a shallow decision tree.
best_model = DecisionTreeClassifier(max_depth=3,min_samples_leaf=1)

# Scaling lives inside the pipeline so it is re-fitted on the training part of
# every fold.
pipeline = Pipeline([('transformer', MinMaxScaler()), ('estimator', best_model)])

# The pipeline is given the raw features (X_train), not X_train_norm.
# X_train_norm was produced by a scaler fitted on all training rows, so using
# it here would scale the data twice and let each validation fold influence
# the ranges used to scale its own training folds.
scores = cross_val_score(pipeline, X_train, y_train, cv=10) #10-fold cross val
AUCscores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='roc_auc')


print('The accuracy of each fold: ')
print(scores)

print ('Average accuracy across folds: ' + str(scores.mean()))


print('The AUC of each fold: ')
print(AUCscores)

print ('Average AUC across folds: ' + str(AUCscores.mean()))


# Out-of-fold predictions for every training row, summarised in one matrix.
print('Final confusion matrix: ')
y_pred = cross_val_predict(pipeline, X_train, y_train, cv=10)
conf_mat = confusion_matrix(y_train, y_pred)

print(conf_mat)