In [None]:
import os
import pickle
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from numpy import mean, sqrt, square, arange
from sklearn.feature_selection import chi2,RFE
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC

In [None]:
# Colab-only setup: mount the user's Google Drive into the runtime's filesystem.
# The import only resolves inside Google Colab; mounting may prompt for an
# authorization code (see the cell output).
from google.colab import drive
drive.mount('/content/drive')
# Folder holding the input CSV files. Other cells build file paths from this by
# string concatenation (e.g. DATA_PATH + 'FT.csv'), so keep the trailing slash.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/ML Assignment/Data/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
#read all input csv files in the data folder
# glob returns paths in a filesystem-dependent order; sorting makes the row
# order of the merged frame (and of every window derived from it) identical on
# every run. os.path.join avoids the doubled slash of DATA_PATH + "/*.csv".
all_files = sorted(glob.glob(os.path.join(DATA_PATH, "*.csv")))
li = []
# Each file is trimmed to a multiple of this many rows
# (presumably the sensor's samples per second -- TODO confirm).
sampling_rate = 90

for filename in all_files:
    # Only recordings whose *file name* contains "Sensors_" are loaded; any
    # other CSV in the folder is skipped.
    if "Sensors_" not in os.path.basename(filename):
        continue
    df = pd.read_csv(filename, index_col=None, header=0)
    # NOTE: despite its name, n_windows is the number of ROWS kept: the largest
    # multiple of sampling_rate that fits, so the trailing incomplete block of
    # each recording is dropped before the files are merged.
    n_windows = df.shape[0] - (df.shape[0] % sampling_rate)
    df1 = df.head(n_windows)
    print("Loaded File:", filename, df.shape,n_windows,df1.shape)
    li.append(df1)

# Fail with an explicit message instead of pd.concat's generic
# "No objects to concatenate" when the folder holds no matching recordings.
if not li:
    raise ValueError(f"No CSV files containing 'Sensors_' found in {DATA_PATH!r}")

#merge data from all files
dataframe = pd.concat(li, axis=0, ignore_index=True)

Loaded File: /content/drive/My Drive/Colab Notebooks/ML Assignment/Data/Sensors_Handwash_1569696423 - Sensors_Handwash_1569696423.csv (6717, 5) 6660 (6660, 5)
Loaded File: /content/drive/My Drive/Colab Notebooks/ML Assignment/Data/Sensors_Handwash_1570946941 - Sensors_Handwash_1570946941.csv (28889, 5) 28800 (28800, 5)
Loaded File: /content/drive/My Drive/Colab Notebooks/ML Assignment/Data/Sensors_Non-wash_1569696502 - Sensors_Non-wash_1569696502.csv (39860, 5) 39780 (39780, 5)
Loaded File: /content/drive/My Drive/Colab Notebooks/ML Assignment/Data/Sensors_Non-wash_1570946333 - Sensors_Non-wash_1570946333.csv (58189, 5) 58140 (58140, 5)


In [None]:
def get_window_mean_deviation(df, i, blocksize):
  """Summarise one window of a signal with four statistics.

  Args:
    df: sliceable 1-D container exposing ``to_numpy()`` (callers in this
      notebook pass a pandas Series).
    i: start position of the window.
    blocksize: number of samples in the window.

  Returns:
    A list ``[mean, std, median, rms]`` computed with numpy over the values
    of ``df[i:i+blocksize]``.
  """
  window = df[i:i + blocksize].to_numpy().flatten()
  # Root mean square: square every sample, average, then take the root.
  root_mean_square = np.sqrt(np.mean(np.square(window)))
  stats = [np.mean(window), np.std(window), np.median(window), root_mean_square]
  return stats

# Window length expressed in units of sampling_rate rows
# (presumably seconds -- TODO confirm).
window_ln = 1
current_column_names = "ACC_x_mean,ACC_x_std,x_median,x_rms,ACC_y_mean,ACC_y_std,y_median,y_rms,ACC_z_mean,ACC_z_std,z_median,z_rms,Label"
current_column_as_array = current_column_names.split(',')

#calculate number of windows
block_size = sampling_rate * window_ln
# Floor division keeps the count an int (true division produced a float and a
# float-valued index) and counts only complete windows, so a trailing partial
# window is never emitted.
total_windows = len(dataframe) // block_size

#compute one feature row per complete window
window_rows = []
for window_idx in range(total_windows):
  i = window_idx * block_size  # first row of this window in the merged frame

  #Take a window and calculate its features (mean, std, median, rms per axis)
  mean_std_acc_X = get_window_mean_deviation(dataframe['rawAccX'], i, block_size)
  mean_std_acc_Y = get_window_mean_deviation(dataframe['rawAccY'], i, block_size)
  mean_std_acc_Z = get_window_mean_deviation(dataframe['rawAccZ'], i, block_size)

  #create a row with the features plus the label of the window's first sample
  a = np.concatenate((mean_std_acc_X,mean_std_acc_Y,mean_std_acc_Z,dataframe['Label'].iloc[i]), axis=None)
  window_rows.append(a)

#create the dataframe for windows in one go: avoids the slow row-by-row .loc
#writes into a pre-allocated object-dtype frame and yields numeric columns
windows_df = pd.DataFrame(window_rows, columns=current_column_as_array)
print("All Windows:",windows_df.shape)

All Windows: (1482, 13)


In [None]:
#optional- persist the per-window feature table next to the input data
#(the row index is not written to the file)
features_csv_path = DATA_PATH + 'FT.csv'
windows_df.to_csv(features_csv_path, index=False)

In [None]:
ft_set = [1,2,3,4,5,6]
# Fixed seed so the RFE ranking, the train/test split and therefore the
# printed scores are reproducible across runs.
RANDOM_STATE = 42

#From data, take all candidate features and the Label (loop-invariant, done once)
all_features = windows_df.drop('Label',axis=1)
y = windows_df['Label'].astype('int')
Y = y  # same labels; name kept because the original cell defined both Y and y

#split data into training-testing sets ONCE, so every feature count is scored
#on the same held-out windows and the accuracies are comparable
X_train_all, X_test_all, y_train, y_test = train_test_split(
    all_features, y, test_size=0.2, random_state=RANDOM_STATE)

for no_of_top_features in ft_set:

  #use Recursive feature elimination with a decision tree as the ranking model.
  #It is fitted on the training windows only, so the held-out windows do not
  #influence which features get selected (fitting on all rows leaks test data).
  #n_features_to_select is passed by keyword: recent scikit-learn releases
  #reject it as a positional argument.
  rfe = RFE(DecisionTreeClassifier(random_state=RANDOM_STATE),
            n_features_to_select=no_of_top_features)
  rfe = rfe.fit(X_train_all, y_train)
  f = rfe.get_support(indices=True)  # integer positions of the kept columns
  columns = list(all_features.columns[f])
  X = windows_df[columns]
  print(f"Top {no_of_top_features} features are:")
  print(list(X)) #X has the top features

  #restrict both splits to the selected features
  X_train = X_train_all[columns]
  X_test = X_test_all[columns]
  print(X_train.shape, y_train.shape)
  print (X_test.shape, y_test.shape)

  #create the classifier: a linear-kernel SVM
  clf = SVC(kernel = 'linear')#RandomForestClassifier()

  #train the classifier on the training split
  clf = clf.fit(X_train,y_train)

  #Predict the response for test dataset
  y_pred = clf.predict(X_test)

  #calculate the accuracy of prediction
  print(f"Performance metrics with top {no_of_top_features} features:")
  print("Accuracy is ", accuracy_score(y_test,y_pred)*100)
  print("Confusion matrix is ",confusion_matrix(y_test,y_pred))
  print("-------end--------------")

Top 1 features are:
['ACC_x_mean']
(1185, 1) (1185,)
(297, 1) (297,)
Performance metrics with top 1 features:
Accuracy is  88.21548821548821
Confusion matrix is  [[199  16]
 [ 19  63]]
-------end-----
Top 2 features are:
['ACC_x_mean', 'y_median']
(1185, 2) (1185,)
(297, 2) (297,)
Performance metrics with top 2 features:
Accuracy is  89.56228956228956
Confusion matrix is  [[205  15]
 [ 16  61]]
-------end-----
Top 3 features are:
['ACC_x_mean', 'y_median', 'ACC_z_std']
(1185, 3) (1185,)
(297, 3) (297,)
Performance metrics with top 3 features:
Accuracy is  92.92929292929293
Confusion matrix is  [[196  18]
 [  3  80]]
-------end-----
Top 4 features are:
['ACC_x_mean', 'y_median', 'ACC_z_std', 'z_median']
(1185, 4) (1185,)
(297, 4) (297,)
Performance metrics with top 4 features:
Accuracy is  97.3063973063973
Confusion matrix is  [[209   6]
 [  2  80]]
-------end-----
Top 5 features are:
['ACC_x_mean', 'y_median', 'ACC_z_std', 'z_median', 'z_rms']
(1185, 5) (1185,)
(297, 5) (297,)
Performa