In [12]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import os
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
PATH = "./TrainingData/"
val_subject = 5
test_subject = 6

In [4]:
files = []
for filename in os.listdir(PATH):
    files.append(filename)
files = sorted(files, key = lambda x: (int(x.split('_')[1]),int(x.split('_')[2]), x.split('_')[4] ))
files_train = list(filter(lambda x: int(x.split('_')[1]) not in  [val_subject, test_subject], files))
files_val = list(filter(lambda x: int(x.split('_')[1]) == val_subject, files))
files_test = list(filter(lambda x: int(x.split('_')[1])== test_subject, files))

Concatinating features and timestamp

In [5]:
def make_train(files, X,Y):
    for i in range(0, len(files), 4):
        x_time, x, y_time, y = files[i: i + 4]
        x_time_df = pd.read_csv(PATH + x_time , header=None)
        x_df = pd.read_csv(PATH + x , header=None)
        x_combined = pd.concat([x_time_df, x_df], axis=1, ignore_index=True)
        x_combined = x_combined.loc[range(1,len(x_combined), 4)].reset_index()  # down sampled the frequency
        # print(x_combined.shape)
        y_time_df = pd.read_csv(PATH + y_time , header=None)
        y_df = pd.read_csv(PATH + y , header=None)
        y_combined = pd.concat([y_time_df, y_df], axis=1, ignore_index=True)
        # print(y_combined.shape)
        train_df = pd.concat([x_combined, y_combined], axis=1, ignore_index=True)
        train_df = train_df.drop(columns=[0, 1, 8])  # Dropping the time stamp
        Y.extend(train_df[9].values)
        X.extend(train_df.drop(columns=[9]).values)
    print(len(X), len(Y))
    return X, Y

In [8]:
X_total, y_total = [], []
make_train(files, X_total, y_total)
X_train, y_train = [], []
make_train(files_train, X_train, y_train)
X_val, y_val = [], []
make_train(files_val, X_val, y_val)
X_test, y_test = [], []
make_train(files_test, X_test, y_test)
print("Dataset Seperation Done")

335413 335413
263333 263333
33876 33876
38204 38204
Dataset Seperation Done


This is not really required now, because testing is done using a specific subject

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    stratify=Y, 
                                                    test_size=0.2)

In [9]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [10]:
weight  = compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = np.array(y_train))
weight_dict = {}
for index, weight in enumerate(weight):
    weight_dict[index] = float(weight)
print(weight_dict)

{0: 0.3403167275791302, 1: 5.962076616554972, 2: 4.445789438141545, 3: 1.494987056045054}


In [12]:
Counter(y_train)

Counter({0: 161786, 3: 30446, 1: 9770, 2: 12221})

In [None]:
clf = RandomForestClassifier(n_estimators = 500, max_depth = 4, max_features = 3, bootstrap = True, random_state = 42, class_weight=weight_dict).fit(X_train, y_train)


In [13]:
clf_2 = RandomForestClassifier(n_estimators = 1000, max_depth = 10, max_features = 'auto', bootstrap = True, random_state = 42, class_weight=weight_dict).fit(X_train, y_train)


In [15]:
pred = clf_2.predict(X_test)

In [16]:
print(f'Predictions: {Counter(pred)}, Actual: {Counter(y_test)}')

Predictions: Counter({0: 15632, 3: 9744, 1: 8100, 2: 4728}), Actual: Counter({0: 30283, 3: 4750, 2: 1708, 1: 1463})


In [17]:
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.89      0.46      0.61     30283
           1       0.13      0.71      0.22      1463
           2       0.25      0.69      0.36      1708
           3       0.20      0.40      0.26      4750

    accuracy                           0.47     38204
   macro avg       0.37      0.56      0.36     38204
weighted avg       0.75      0.47      0.54     38204



In [8]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 1500, num = 3)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 20, num = 4)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [500, 1000, 1500], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [2, 4], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model

In [12]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
print("Check")