#### Load the dataset.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Columns read as 32-bit floats; every remaining declared column ('Scale')
# is read as an 8-bit integer. Declaring dtypes up front keeps the frame small.
# NOTE(review): the names look like EEG electrode positions — confirm against
# the dataset description.
float_columns = [
    'CZ', 'FZ', 'Fp1', 'F3', 'FC1', 'FC5', 'FT9', 'T7', 'CP5', 'P3',
    'P7', 'PO9', 'PZ', 'O2', 'P4', 'CP6', 'FT10', 'FC6', 'F8', 'Fp2',
]
column_dtypes = dict.fromkeys(float_columns, 'float32')
column_dtypes['Scale'] = 'int8'

# The file is pipe-delimited, hence the explicit separator.
df = pd.read_csv('../dataset/preprocessed-sam-dataset.csv', sep='|', dtype=column_dtypes)


#### Display the dataset size.

In [2]:
# Display the (rows, columns) tuple of the loaded frame as the cell output.
df.shape

(140800, 21)

#### Train the model and display metrics.

In [3]:
# Split the frame into the target column ('Scale') and the feature columns.
y = df['Scale']
X = df.drop(columns='Scale')

# Hold out 30 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Learn the min/max of each feature from the training rows only, then apply
# that same scaling to both splits so no test information leaks into the fit.
scaler = MinMaxScaler()
scaler.fit(X_train)
scaled_trainX = scaler.transform(X_train)
scaled_testX = scaler.transform(X_test)

# NOTE(review): 'gpu_hist' requires a CUDA-enabled XGBoost build and is
# deprecated from XGBoost 2.0 (tree_method='hist' with device='cuda') —
# confirm against the installed version before changing it.
model = XGBClassifier(tree_method='gpu_hist')
model.fit(scaled_trainX, y_train)
y_pred = model.predict(scaled_testX)

# Accuracy on the rows the model was fitted on vs. the held-out rows.
train_accuracy = model.score(scaled_trainX, y_train)
test_accuracy = accuracy_score(y_test, y_pred)

print("Train Accuracy: {:.2f} %".format(train_accuracy * 100))
print("Test Accuracy: {:.2f} %".format(test_accuracy * 100))
print('\n')
print("Classifiction Report")
# zero_division = 0 reports 0 (without a warning) for any class that has no
# predicted or true samples.
print(classification_report(y_test, y_pred, zero_division=0))


Train Accuracy: 47.06 %
Test Accuracy: 14.69 %


Classifiction Report
              precision    recall  f1-score   support

           0       0.13      0.07      0.09      3820
           1       0.19      0.27      0.22      3843
           2       0.13      0.16      0.14      3924
           3       0.12      0.10      0.11      3870
           4       0.11      0.10      0.11      3808
           5       0.10      0.09      0.09      3823
           6       0.12      0.08      0.10      3820
           7       0.12      0.09      0.10      3890
           8       0.14      0.12      0.13      3809
           9       0.17      0.21      0.19      3862
          10       0.21      0.32      0.25      3771

    accuracy                           0.15     42240
   macro avg       0.14      0.15      0.14     42240
weighted avg       0.14      0.15      0.14     42240



#### Train the model using 3-fold stratified cross-validation and display metrics.

In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

# Target column ('Scale') and feature columns taken from the loaded frame.
y = df['Scale']
X = df.drop(columns='Scale')

# Scaler and classifier are bundled into one estimator so that the scaler is
# fitted together with the model on whatever data cross_validate passes in.
# NOTE(review): 'gpu_hist' requires a CUDA-enabled XGBoost build and is
# deprecated from XGBoost 2.0 (tree_method='hist' with device='cuda') —
# confirm against the installed version before changing it.
scaling_step = ("scaler", MinMaxScaler())
model_step = ("model", XGBClassifier(tree_method='gpu_hist'))
pipeline = Pipeline(steps=[scaling_step, model_step])

# Three shuffled, class-stratified folds with a fixed seed for repeatability;
# the folds are evaluated in up to three parallel jobs.
strat_k_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores = cross_validate(
    pipeline,
    X,
    y,
    cv=strat_k_fold,
    n_jobs=3,
    return_train_score=True,
)

# Average the per-fold scores before reporting them as percentages.
mean_train_score = scores['train_score'].mean()
mean_test_score = scores['test_score'].mean()

print("Train Accuracy: {:.2f} %".format(mean_train_score * 100))
print("Test Accuracy: {:.2f} %".format(mean_test_score * 100))


Train Accuracy: 48.29 %
Test Accuracy: 14.50 %
