In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# Read the training table; the path is relative to the working directory.
data = pd.read_csv('cop_training_dataset.csv')

# Target is the 'class' column. The model inputs are every remaining column
# except 'subject', 'task' and 'window_id' (presumably bookkeeping columns
# rather than measurements — confirm against the dataset description).
y = data.loc[:, 'class']
X = data.drop(['subject', 'task', 'window_id', 'class'], axis=1)

## ALL FEATURES

In [None]:
# Base estimator with a fixed seed. This one object is passed to both
# GridSearchCV calls in this notebook (all-features and selected-features).
clf = GradientBoostingClassifier(random_state=42)

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Must provide 'rank_test_score', 'mean_test_score', 'std_test_score'
        and 'params', each a sequence indexed by candidate position. This is
        the layout of ``GridSearchCV.cv_results_``, which is what the
        notebook passes in. Plain lists are accepted as well as arrays.
    n_top : int, default=3
        Highest rank to print. Ranks 1..n_top are walked in order and every
        candidate holding a given rank is printed, so ties can produce more
        than ``n_top`` entries and a skipped rank produces none.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Coerce once: comparing a plain list with an int yields a single False
    # instead of an elementwise mask, which would silently print nothing.
    ranks = np.asarray(results['rank_test_score'])
    for i in range(1, n_top + 1):
        for candidate in np.flatnonzero(ranks == i):
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


In [None]:
# Candidate values for the all-features grid search. The search tries the
# full cross product of these lists: 3 * 2 * 3 * 4 = 72 combinations.
n_estimators = [150,200,250]
learning_rate = [0.1, 0.01]
max_depth = [5,7,10]
min_samples_split = [4, 6, 10, 15]

In [None]:
# Search space for the all-features model: estimator argument name mapped
# to the list of values to try.
param_grid = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
)
print(param_grid)

{'n_estimators': [150, 200, 250], 'learning_rate': [0.1, 0.01], 'min_samples_split': [4, 6, 10, 15], 'max_depth': [5, 7, 10]}


In [None]:
# Exhaustive 5-fold cross-validated search over param_grid on all features.
# GridSearchCV is already imported in the notebook's first cell, so it is
# not imported again here.
# NOTE(review): the table has a 'subject' column that is dropped from X. If
# one subject contributes several rows, plain cv=5 can put rows of the same
# subject in both the training and the validation fold. Consider a
# group-aware splitter with groups=data['subject'] — confirm against the
# data layout before relying on these scores.
clf_grid_search = GridSearchCV(clf,
                           param_grid=param_grid,
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

clf_grid_search.fit(X, y)

report(clf_grid_search.cv_results_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Model with rank: 1
Mean validation score: 0.660 (std: 0.067)
Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_split': 15, 'n_estimators': 250}

Model with rank: 2
Mean validation score: 0.660 (std: 0.071)
Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_split': 15, 'n_estimators': 200}

Model with rank: 3
Mean validation score: 0.649 (std: 0.062)
Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_split': 15, 'n_estimators': 150}



In [None]:
# Re-score the winning all-features configuration with 5-fold CV.
# random_state=42 matches the seed of the estimator handed to the grid
# search, so this evaluates the same model configuration that was ranked
# and keeps the result reproducible across runs.
# NOTE: X, y and cv=5 are the same ones already used to choose best_params,
# so these accuracies are not an independent estimate.
best_params = clf_grid_search.best_params_
best_clf = GradientBoostingClassifier(random_state=42, **best_params)
scores = cross_validate(best_clf, X, y, cv=5)

print("Accuracy per fold: ", scores['test_score'])
print("Average Accuracy: %0.2f" % (np.mean(scores['test_score'])))
print("Standard Deviation of Accuracy: +/- %0.2f" % (np.std(scores['test_score'])))

Accuracy per fold:  [0.75       0.6043956  0.59340659 0.61538462 0.71428571]
Average Accuracy: 0.66
Standard Deviation of Accuracy: +/- 0.06


In [None]:
from scipy.stats import binomtest

# One-sided binomial test: is the all-features cross-validated accuracy
# above the chance level p?
# The success count is derived from the fold accuracies in `scores` instead
# of a hand-typed, rounded percentage, so it cannot drift from the
# cross-validation result. The product is rounded to the nearest integer;
# it is an approximation when the folds differ in size.
# The names carry no '1' suffix so that the selected-features test further
# down, which assigns n_samples1 / successful_pred1 / p_value1, does not
# overwrite this result.
# NOTE(review): p=0.5 assumes two equally likely classes — confirm.
n_samples = len(data)  # total number of samples
mean_accuracy = float(np.mean(scores['test_score']))
successful_pred = int(round(mean_accuracy * n_samples))  # estimated number of correct predictions
binom_result = binomtest(successful_pred, n_samples, p=0.5, alternative='greater')
binom_result

BinomTestResult(k=300, n=456, alternative='greater', statistic=0.6578947368421053, pvalue=7.278525757627242e-12)

## SELECTED FEATURES

In [None]:
# Model inputs for the selected-features experiment: only these three
# columns of the training table are kept.
X1 = data.loc[:, ['acc_x_dominant_freq',
                  'gyr_y_dominant_freq',
                  'gyr_z_dominant_amplitude']]

In [None]:
# Candidate values for the selected-features grid search. The search tries
# the full cross product of these lists: 3 * 2 * 3 * 3 = 54 combinations.
n_estimators1 = [120,150,200]
learning_rate1 = [0.1, 0.01]
max_depth1 = [5,7,10]
min_samples_split1 = [4, 6, 10]

In [None]:
# Search space for the selected-features model: estimator argument name
# mapped to the list of values to try.
param_grid1 = dict(
    n_estimators=n_estimators1,
    learning_rate=learning_rate1,
    min_samples_split=min_samples_split1,
    max_depth=max_depth1,
)
print(param_grid1)

{'n_estimators': [120, 150, 200], 'learning_rate': [0.1, 0.01], 'min_samples_split': [4, 6, 10], 'max_depth': [5, 7, 10]}


In [None]:
# Exhaustive 5-fold cross-validated search over param_grid1 on the three
# selected features. GridSearchCV is already imported in the notebook's
# first cell, so it is not imported again here.
# NOTE(review): the table has a 'subject' column that is not part of X1. If
# one subject contributes several rows, plain cv=5 can put rows of the same
# subject in both the training and the validation fold. Consider a
# group-aware splitter with groups=data['subject'] — confirm against the
# data layout before relying on these scores.
clf_grid_search1 = GridSearchCV(clf,
                           param_grid=param_grid1,
                           cv=5,
                           verbose=2,
                           n_jobs=-1)

clf_grid_search1.fit(X1, y)

report(clf_grid_search1.cv_results_)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Model with rank: 1
Mean validation score: 0.590 (std: 0.038)
Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 150}

Model with rank: 2
Mean validation score: 0.581 (std: 0.027)
Parameters: {'learning_rate': 0.01, 'max_depth': 10, 'min_samples_split': 4, 'n_estimators': 150}

Model with rank: 3
Mean validation score: 0.581 (std: 0.043)
Parameters: {'learning_rate': 0.01, 'max_depth': 5, 'min_samples_split': 10, 'n_estimators': 200}



In [None]:
# Re-score the winning selected-features configuration with 5-fold CV.
# random_state=42 matches the seed of the estimator handed to the grid
# search, so this evaluates the same model configuration that was ranked
# and keeps the result reproducible across runs.
# NOTE: X1, y and cv=5 are the same ones already used to choose
# best_params1, so these accuracies are not an independent estimate.
best_params1 = clf_grid_search1.best_params_
best_clf1 = GradientBoostingClassifier(random_state=42, **best_params1)
scores1 = cross_validate(best_clf1, X1, y, cv=5)

print("Accuracy per fold: ", scores1['test_score'])
print("Average Accuracy: %0.2f" % (np.mean(scores1['test_score'])))
print("Standard Deviation of Accuracy: +/- %0.2f" % (np.std(scores1['test_score'])))

Accuracy per fold:  [0.60869565 0.54945055 0.64835165 0.59340659 0.54945055]
Average Accuracy: 0.59
Standard Deviation of Accuracy: +/- 0.04


In [None]:
from scipy.stats import binomtest

# One-sided binomial test: is the selected-features cross-validated
# accuracy above the chance level p?
# The success count is derived from the fold accuracies in `scores1`
# instead of a hand-typed, rounded percentage, so it cannot drift from the
# cross-validation result. The product is rounded to the nearest integer;
# it is an approximation when the folds differ in size.
# p_value1 holds the whole BinomTestResult; the p-value itself is its
# .pvalue attribute.
# NOTE(review): p=0.5 assumes two equally likely classes — confirm.
n_samples1 = len(data)  # total number of samples
successful_pred1 = int(round(float(np.mean(scores1['test_score'])) * n_samples1))  # estimated number of correct predictions
p_value1 = binomtest(successful_pred1, n_samples1, p=0.5, alternative='greater')
p_value1

BinomTestResult(k=269, n=456, alternative='greater', statistic=0.5899122807017544, pvalue=7.160417505405845e-05)