#### Carregar o dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble  import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

# Signal columns, all parsed as float32.
# NOTE(review): the names look like EEG electrode labels -- confirm against the dataset docs.
signal_columns = [
    'CZ', 'FZ', 'Fp1', 'F3', 'FC1', 'FC5', 'FT9', 'T7', 'CP5', 'P3',
    'P7', 'PO9', 'PZ', 'O2', 'P4', 'CP6', 'FT10', 'FC6', 'F8', 'Fp2',
]

# Explicit narrow dtypes: float32 for every signal column, int8 for 'Scale'.
column_dtypes = {column: 'float32' for column in signal_columns}
column_dtypes['Scale'] = 'int8'

# The CSV is pipe-delimited.
df = pd.read_csv(
    '../dataset/preprocessed-sam-dataset.csv',
    sep='|',
    dtype=column_dtypes,
)


#### Tamanho do dataset

In [2]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(140800, 21)

#### Treinar o modelo e exibir métricas

In [3]:
# Features are every column except 'Scale'; 'Scale' is the target label.
X = df.drop('Scale', axis = 1)
y = df['Scale']

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Fit the scaler on the training split only and reuse it on the test split,
# so that no test-set statistics leak into training.
scaler = MinMaxScaler()
scaled_trainX = scaler.fit_transform(X_train)
scaled_testX = scaler.transform(X_test)

# Random forest with default hyper-parameters, a fixed seed and 4 parallel jobs.
model = RandomForestClassifier(random_state = 42, n_jobs = 4)
model.fit(scaled_trainX, y_train)
y_pred = model.predict(scaled_testX)

# NOTE(review): the recorded output shows 100% train vs ~13.6% test accuracy,
# i.e. the model fits the training split perfectly but does not generalize.
print("Train Accuracy: {:.2f} %".format(model.score(scaled_trainX, y_train) * 100))
print("Test Accuracy: {:.2f} %".format(accuracy_score(y_test, y_pred) * 100))
print('\n')
# Heading typo fixed (was "Classifiction Report").
print("Classification Report")
# zero_division = 0 reports 0 (without a warning) for any class that has no predictions.
print(classification_report(y_test, y_pred, zero_division = 0))

Train Accuracy: 100.00 %
Test Accuracy: 13.58 %


Classifiction Report
              precision    recall  f1-score   support

           0       0.12      0.10      0.11      3924
           1       0.17      0.27      0.21      3717
           2       0.12      0.13      0.13      3823
           3       0.10      0.11      0.10      3840
           4       0.10      0.11      0.10      3757
           5       0.10      0.08      0.09      3852
           6       0.12      0.09      0.10      3845
           7       0.12      0.09      0.10      3886
           8       0.12      0.09      0.10      3892
           9       0.16      0.17      0.16      3848
          10       0.20      0.26      0.23      3856

    accuracy                           0.14     42240
   macro avg       0.13      0.14      0.13     42240
weighted avg       0.13      0.14      0.13     42240



#### Treinar o modelo utilizando cross validation (3 fold)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

# Target is the 'Scale' column; the features are everything else.
y = df['Scale']
X = df.drop(columns = 'Scale')

# Scaling is a pipeline step, so it is re-fit on the training part of each fold.
rf_classifier = RandomForestClassifier(n_jobs = 3, random_state = 42)
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", rf_classifier),
])

# Shuffled, stratified 3-fold split with a fixed seed.
strat_k_fold = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)
scores = cross_validate(
    pipeline,
    X,
    y,
    cv = strat_k_fold,
    n_jobs = 3,
    return_train_score = True,
)

# Report the mean score over the folds as a percentage.
mean_train_pct = scores['train_score'].mean() * 100
mean_test_pct = scores['test_score'].mean() * 100
print("Train Accuracy: {:.2f} %".format(mean_train_pct))
print("Test Accuracy: {:.2f} %".format(mean_test_pct))


Train Accuracy: 100.00 %
Test Accuracy: 13.38 %
