In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import os
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from os import path

In [2]:
PATH = "./TrainingData/"
TEST_PATH = "./TestData/"
val_subject = 5
test_subject = 6

In [3]:
files = []
for filename in os.listdir(PATH):
    files.append(filename)
files = sorted(files, key = lambda x: (int(x.split('_')[1]),int(x.split('_')[2]), x.split('_')[4] ))
files_train = list(filter(lambda x: int(x.split('_')[1]) not in  [val_subject, test_subject], files))
files_val = list(filter(lambda x: int(x.split('_')[1]) == val_subject, files))
files_test = list(filter(lambda x: int(x.split('_')[1])== test_subject, files))

Concatinating features and timestamp, by default down sample the train frequency to 10hz by only considering one entry out of 4, If up sample find the nearest label and replace the nan value while merging

In [4]:
def make_train(files,X,Y,freq_down=True):
    for i in range(0, len(files), 4):
        x_time, x, y_time, y = files[i: i + 4]
        train_df = None
        if freq_down:
            #X_features
            x_time_df = pd.read_csv(PATH + x_time , header=None)
            x_df = pd.read_csv(PATH + x , header=None)
            x_combined = pd.concat([x_time_df, x_df], axis=1, ignore_index=True)
            x_combined = x_combined.loc[range(1,len(x_combined), 4)].reset_index()  # down sampled the frequency

            #Y_labels
            y_time_df = pd.read_csv(PATH + y_time , header=None)
            y_df = pd.read_csv(PATH + y , header=None)
            y_combined = pd.concat([y_time_df, y_df], axis=1, ignore_index=True)

            train_df = pd.concat([x_combined, y_combined], axis=1, ignore_index=True)
            train_df = train_df.drop(columns=[0, 1, 8])  # Dropping the time stamp
            Y.extend(train_df[9].values)
            X.extend(train_df.drop(columns=[9]).values)
        else:
            #X_features
            x_time_df = pd.read_csv(PATH + x_time, header=None)
            x_time_df.astype('float64')
            x_df = pd.read_csv(PATH + x, header=None)
            x_combined = pd.concat([x_time_df, x_df], axis=1, ignore_index=True)
            x_combined = x_combined.rename({0:'timestamp'}, axis='columns')
            x_combined.set_index('timestamp', inplace=True)

            #Y_labels
            y_time_df = pd.read_csv(PATH + y_time , header=None)
            y_time_df.astype('float64')
            y_df = pd.read_csv(PATH + y, header=None)
            y_combined = pd.concat([y_time_df, y_df], axis=1, ignore_index=True)
            y_combined = y_combined.rename({0:'timestamp'}, axis='columns')
            y_combined.set_index('timestamp', inplace=True)
            train_df = pd.merge_asof(left=x_combined, right=y_combined, left_index=True, right_index = True, direction='nearest')  # Merging using the neareset values
            X.extend(train_df.drop(columns=['1_y']).values)
            Y.extend(train_df['1_y'].values)
        
    print(len(X), len(Y))
    return X, Y

In [5]:
X_total, y_total = [], []
make_train(files, X_total, y_total, freq_down=False)
X_train, y_train = [], []
make_train(files_train, X_train, y_train,freq_down=False)
X_val, y_val = [], []
make_train(files_val, X_val, y_val,freq_down=False)
X_test, y_test = [], []
make_train(files_test, X_test, y_test,freq_down=False)
print("Dataset Seperation Done")

1341646 1341646
1053327 1053327
135503 135503
152816 152816
Dataset Seperation Done


This is not really required now, because testing is done using a specific subject

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    stratify=Y, 
                                                    test_size=0.2)

In [6]:
X_train = np.array(X_train)
y_train = np.array(y_train)

In [7]:
weight  = compute_class_weight(class_weight = 'balanced', classes = np.unique(y_train), y = np.array(y_train))
weight_dict = {}
for index, weight in enumerate(weight):
    weight_dict[index] = float(weight)
print(weight_dict)

{0: 0.3403173111841434, 1: 5.9620483155225505, 2: 4.4457683346839545, 3: 1.4949799595785267}


In [32]:
Counter(y_train)

Counter({0: 773783, 1: 44168, 2: 59232, 3: 176144})

In [49]:
clf = RandomForestClassifier(n_estimators = 200,  bootstrap = True, random_state = 42, class_weight=weight_dict).fit(X_train, y_train)


In [27]:
clf_2 = RandomForestClassifier(n_estimators = 1000, max_depth = 12, max_features = 'auto', bootstrap = True, random_state = 42, class_weight=weight_dict).fit(X_train, y_train)


In [53]:
import pickle
with open('./model/rf_200', 'wb+') as f:
    pickle.dump(clf,f)

with open('./model/rf_1000_12', 'wb+') as f:
    pickle.dump(clf_2,f)

In [50]:
pred = clf.predict(X_val)

In [51]:
print(f'Predictions: {Counter(pred)}, Actual: {Counter(y_val)}')

Predictions: Counter({0: 124996, 2: 6143, 3: 3055, 1: 1309}), Actual: Counter({0: 112011, 3: 11292, 2: 7004, 1: 5196})


In [52]:
print(classification_report(y_true=y_val, y_pred=pred))

              precision    recall  f1-score   support

           0       0.86      0.96      0.91    112011
           1       0.47      0.12      0.19      5196
           2       0.64      0.56      0.60      7004
           3       0.16      0.04      0.07     11292

    accuracy                           0.83    135503
   macro avg       0.53      0.42      0.44    135503
weighted avg       0.78      0.83      0.79    135503



In [8]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 1500, num = 3)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 20, num = 4)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [500, 1000, 1500], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, None], 'min_samples_split': [2, 5], 'min_samples_leaf': [2, 4], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model

In [12]:
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


Evaluating on test data

In [54]:
test_path = "./TestData/"
target_path = "./output/random_forest/"
test_overlap = 30
test_files = sorted(os.listdir(test_path))
for i in range(0, len(test_files), 3):
    X = []
    x_time, x, y_time = test_files[i: i + 3]
    train_df = None
    
    #X_features
    x_time_df = pd.read_csv(test_path + x_time , header=None)
    x_df = pd.read_csv(test_path + x , header=None)
    x_combined = pd.concat([x_time_df, x_df], axis=1, ignore_index=True)
    x_combined = x_combined.loc[range(1,len(x_combined), 4)].reset_index()  # down sampled the frequency
    y_time_df = pd.read_csv(test_path + y_time , header=None)

    # train_df = pd.concat([x_combined, y_combined], axis=1, ignore_index=True)
    train_df = x_combined.drop(columns=[0, 1])  # Dropping the time stamp
    X = np.array(train_df.values)
    pred = []
    pred = clf.predict(X)
    y_df = pd.DataFrame(pred)
    filename = path.join(target_path,(x_time.split('__')[0] + '__y.csv'))
    y_df.to_csv(filename, index=False, header=None)

In [56]:
svm_clf = SVC(kernel="linear", class_weight=weight_dict)

In [57]:
svm_clf.fit(X_train, y_train)

In [None]:
import pickle
with open('./model/svm_imbalanced', 'wb+') as f:
    pickle.dump(svm_clf,f)