In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

In [None]:
# importing the model
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the training table from a CSV in the working directory; the cells below
# pick feature columns from it and use its 'class' column as the target.
subjects_df = pd.read_csv('cop_training_dataset.csv')

## FEATURE SELECTION

In [None]:
# Reduced feature set: three named columns form X; the 'class' column is the target y.
selected_features = [
    'acc_x_dominant_freq',
    'gyr_y_dominant_freq',
    'gyr_z_dominant_amplitude',
]
X = subjects_df[selected_features]
y = subjects_df['class']


In [None]:
# Base estimator passed to the grid searches below; random_state is pinned so
# repeated runs build the same forests.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the top-ranked candidates from a grid-search results mapping.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', indexed by candidate (e.g. ``GridSearchCV.cv_results_``).
    n_top : int, default 3
        Highest rank to print. Ranks are visited from 1 to ``n_top``; every
        candidate tied on a rank is printed, and ranks held by no candidate
        print nothing.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        # Indices of all candidates sharing this rank (ties included).
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            score_std = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(mean_score, score_std))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")


In [None]:
# Number of trees in random forest
n_estimators = [100, 150]

# Split-quality criterion
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# measure for classifiers, so searching both likely doubles the fits without
# adding distinct candidates -- confirm before trimming.
criterion = ['gini', 'entropy', 'log_loss']

# Number of features to consider at every split (None = all features)
max_features = [None, 'sqrt', 'log2']

# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [10, 15, 20, None]

# Minimum number of samples required to split a node.
# None is not an accepted value for this parameter and makes every fit that
# uses it fail; 2 (scikit-learn's default) stands in for "no extra restriction".
# This list is not included in param_grid at present.
min_samples_split = [2, 5, 7, 10, 20, 40]


In [None]:
# Search space for the reduced-feature model: every combination of these four
# lists is tried (min_samples_split is not part of this grid).
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    criterion=criterion,
    max_depth=max_depth,
)
print(param_grid)

{'n_estimators': [100, 150], 'max_features': [None, 'sqrt', 'log2'], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': [10, 15, 20, None]}


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid, scored with 5-fold cross-validation on the
# reduced feature set; n_jobs=-1 runs the fits on all available cores.
# NOTE(review): the dataset also has 'subject' and 'window_id' columns. If rows
# are windows from the same subjects, plain 5-fold CV may place one subject in
# both train and test folds -- confirm whether a grouped split is needed.
search_settings = dict(param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)
rfc_grid_search = GridSearchCV(rfc, **search_settings)
rfc_grid_search.fit(X, y)

# Print the best-ranked parameter combinations with their mean/std CV score.
report(rfc_grid_search.cv_results_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Model with rank: 1
Mean validation score: 0.605 (std: 0.034)
Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 150}

Model with rank: 1
Mean validation score: 0.605 (std: 0.034)
Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 150}

Model with rank: 3
Mean validation score: 0.603 (std: 0.035)
Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'sqrt', 'n_estimators': 150}

Model with rank: 3
Mean validation score: 0.603 (std: 0.035)
Parameters: {'criterion': 'gini', 'max_depth': 15, 'max_features': 'log2', 'n_estimators': 150}



In [None]:
# Rebuild a forest from the winning grid-search parameters and score it again
# with 5-fold cross-validation on the reduced feature set.
best_params = rfc_grid_search.best_params_
best_rfc = RandomForestClassifier(random_state=42, **best_params)
scores = cross_validate(best_rfc, X, y, cv=5)

# Per-fold test scores, followed by their mean and (population) standard deviation.
fold_accuracy = scores['test_score']
print("Accuracy per fold: ", fold_accuracy)
print("Average Accuracy: %0.2f" % fold_accuracy.mean())
print("Standard Deviation of Accuracy: +/- %0.2f" % fold_accuracy.std())

Accuracy per fold:  [0.57608696 0.56043956 0.6043956  0.63736264 0.64835165]
Average Accuracy: 0.61
Standard Deviation of Accuracy: +/- 0.03


P-VALUE: CHECK IF THE MODEL IS DOING BETTER THAN A RANDOM GUESS

In [None]:
from scipy.stats import binomtest
from sklearn.model_selection import cross_val_predict

# One-sided binomial test: is the cross-validated accuracy of best_rfc on the
# reduced feature set higher than a random guess?
n_samples = len(subjects_df)  # total number of samples

# Count the correct out-of-fold predictions directly instead of multiplying the
# sample count by a hand-typed, rounded accuracy (which drifts from the real
# count and goes stale whenever the model or data change).
cv_predictions = cross_val_predict(best_rfc, X, y, cv=5)
successful_pred = int(np.sum(cv_predictions == y.to_numpy()))  # total number of successful predictions

# Expected accuracy of a uniform random guess over the observed classes
# (equals 0.5 when there are exactly two classes).
chance_level = 1 / y.nunique()

# NOTE(review): the binomial test treats every row as an independent trial; if
# rows are overlapping windows of the same subjects this is optimistic -- confirm.
# p_value holds the full BinomTestResult; the number itself is p_value.pvalue.
p_value = binomtest(successful_pred, n_samples, p=chance_level, alternative='greater')
p_value

BinomTestResult(k=278, n=456, alternative='greater', statistic=0.6096491228070176, pvalue=1.6296368715170686e-06)

BEFORE FEATURE SELECTION (ALL FEATURE COLUMNS)


In [None]:
# X1 holds every column except the 'subject', 'task' and 'window_id' columns and
# the 'class' target; the target y defined earlier is reused with it.
X1 = subjects_df.drop(columns=['subject', 'task', 'window_id','class'])

In [None]:
# Number of trees in random forest
n_estimators1 = [30,50,60,70]

# Criterion
criterion1 = ['entropy', 'log_loss']

# Number of features to consider at every split
max_features1 = [None, 'sqrt', 'log2']

# Maximum number of levels in tree
max_depth1 = [15,20,None]

# Minimum samples split
min_samples_split1 = [40, 50, None]

In [None]:
# Search space for the all-features model: every combination of these five
# lists is tried.
param_grid1 = dict(
    n_estimators=n_estimators1,
    max_features=max_features1,
    criterion=criterion1,
    max_depth=max_depth1,
    min_samples_split=min_samples_split1,
)
print(param_grid1)

{'n_estimators': [30, 50, 60, 70], 'max_features': [None, 'sqrt', 'log2'], 'criterion': ['entropy', 'log_loss'], 'max_depth': [15, 20, None], 'min_samples_split': [40, 50, None]}


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over param_grid1, scored with 5-fold cross-validation on the
# full feature set X1; n_jobs=-1 runs the fits on all available cores.
search_settings1 = dict(param_grid=param_grid1, cv=5, verbose=2, n_jobs=-1)
rfc_grid_search1 = GridSearchCV(rfc, **search_settings1)
rfc_grid_search1.fit(X1, y)

# Print the best-ranked parameter combinations with their mean/std CV score.
report(rfc_grid_search1.cv_results_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Model with rank: 1
Mean validation score: 0.682 (std: 0.071)
Parameters: {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 50, 'n_estimators': 60}

Model with rank: 1
Mean validation score: 0.682 (std: 0.071)
Parameters: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 50, 'n_estimators': 60}

Model with rank: 1
Mean validation score: 0.682 (std: 0.071)
Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 50, 'n_estimators': 60}

Model with rank: 1
Mean validation score: 0.682 (std: 0.071)
Parameters: {'criterion': 'log_loss', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 50, 'n_estimators': 60}

Model with rank: 1
Mean validation score: 0.682 (std: 0.071)
Parameters: {'criterion': 'log_loss', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 50, 'n_estimators': 60}

Model w

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(

In [None]:
# Rebuild a forest from the winning parameters of the all-features search and
# score it with 5-fold cross-validation on X1.
# The estimator must be built from best_params1 (this search); using
# best_params would silently evaluate the reduced-feature configuration on X1.
best_params1 = rfc_grid_search1.best_params_
best_rfc = RandomForestClassifier(**best_params1, random_state=42)
scores = cross_validate(best_rfc, X1, y, cv=5)

# Per-fold test scores, followed by their mean and (population) standard deviation.
print("Accuracy per fold: ", scores['test_score'])
print("Average Accuracy: %0.2f" % (np.mean(scores['test_score'])))
print("Standard Deviation of Accuracy: +/- %0.2f" % (np.std(scores['test_score'])))

Accuracy per fold:  [0.72826087 0.61538462 0.59340659 0.6043956  0.67032967]
Average Accuracy: 0.64
Standard Deviation of Accuracy: +/- 0.05


CHECK IF THE MODEL IS DOING BETTER THAN A RANDOM GUESS

In [None]:
# Number of rows in the dataset; the binomial test below uses it as the number of trials.
len(subjects_df)

456

In [None]:
from scipy.stats import binomtest
from sklearn.model_selection import cross_val_predict

# One-sided binomial test: is the cross-validated accuracy of best_rfc on the
# full feature set X1 higher than a random guess?
n_samples1 = len(subjects_df)  # total number of samples

# Count the correct out-of-fold predictions directly instead of multiplying the
# sample count by a hand-typed, rounded accuracy (which drifts from the real
# count and goes stale whenever the model or data change).
cv_predictions1 = cross_val_predict(best_rfc, X1, y, cv=5)
successful_pred1 = int(np.sum(cv_predictions1 == y.to_numpy()))  # total number of successful predictions

# Expected accuracy of a uniform random guess over the observed classes
# (equals 0.5 when there are exactly two classes).
chance_level1 = 1 / y.nunique()

# NOTE(review): the binomial test treats every row as an independent trial; if
# rows are overlapping windows of the same subjects this is optimistic -- confirm.
# p_value1 holds the full BinomTestResult; the number itself is p_value1.pvalue.
p_value1 = binomtest(successful_pred1, n_samples1, p=chance_level1, alternative='greater')
p_value1

BinomTestResult(k=291, n=456, alternative='greater', statistic=0.6381578947368421, pvalue=1.9280498343531476e-09)