In [2]:
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

database = 'MED115'

directory = os.path.join(os.getcwd(), database)
directory_nmbe = os.path.join(directory, 'nmbe.db')
directory_mec = os.path.join(directory, 'mec.db')
directory_texture = os.path.join(directory, 'texture.db')

# Carregar dicionários
with open(directory_nmbe, 'rb') as file:
    dict_nmbe = pickle.load(file)

with open(directory_mec, 'rb') as file:
    dict_mec = pickle.load(file)

with open(directory_texture, 'rb') as file:
    dict_texture = pickle.load(file)

# Printar quantidades de features
sample_nmbe = next(iter(dict_nmbe.values()))
sample_mec = next(iter(dict_mec.values()))
sample_texture = next(iter(dict_texture.values()))

print("Quantidade de features - NMBE:", len(sample_nmbe[1:]))
print("Quantidade de features - MEC:", len(sample_mec[1:]))
print("Quantidade de features - TEXTURE:", len(sample_texture[1:]))

# Concatenar dicionários
concatenated_dict = {}
for d in [dict_nmbe,
          dict_mec, 
          dict_texture
         ]:
    for filename, features in d.items():
        filename_base = os.path.splitext(filename)[0]
        if filename_base in concatenated_dict:
            concatenated_dict[filename_base] = np.concatenate((concatenated_dict[filename_base], features[1:]))
        else:
            concatenated_dict[filename_base] = features

# Separar labels e features
data_labels = [sample[0] for sample in concatenated_dict.values()]
data_features = [sample[1:] for sample in concatenated_dict.values()]

# Normalização
scaler = StandardScaler()
data_features = scaler.fit_transform(data_features)

# PCA
pca = PCA(n_components=40)
data_features = pca.fit_transform(data_features)

# Treino/teste split
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data_labels, test_size=0.3, shuffle=True, stratify=data_labels
)

# Hiperparâmetros para o GridSearch
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# GridSearch com validação cruzada
grid_search = GridSearchCV(
    RandomForestClassifier(class_weight='balanced'),
    param_grid,
    cv=4,
    scoring='f1_weighted',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

# Avaliação no conjunto de teste
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("\nAvaliação no conjunto de teste:")
print(f"Acurácia: {accuracy * 100:.2f}%")
print(f"F1-score: {f1 * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")

# Printar melhores hiperparâmetros
print("\nMelhores hiperparâmetros encontrados:")
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")

# Validação cruzada manual com o melhor modelo
kf = StratifiedKFold(n_splits=4, shuffle=True)
acuracias = []
f1_scores = []
precisions = []
recalls = []

for train_index, test_index in kf.split(data_features, data_labels):
    X_train_cv, X_test_cv = np.array(data_features)[train_index], np.array(data_features)[test_index]
    y_train_cv, y_test_cv = np.array(data_labels)[train_index], np.array(data_labels)[test_index]

    best_model.fit(X_train_cv, y_train_cv)
    y_pred_cv = best_model.predict(X_test_cv)

    acuracias.append(accuracy_score(y_test_cv, y_pred_cv))
    f1_scores.append(f1_score(y_test_cv, y_pred_cv, average='weighted'))
    precisions.append(precision_score(y_test_cv, y_pred_cv, average='weighted'))
    recalls.append(recall_score(y_test_cv, y_pred_cv, average='weighted'))

# Métricas médias e desvios-padrão
print("\nValidação cruzada (4 folds):")
for i, (a, f, p, r) in enumerate(zip(acuracias, f1_scores, precisions, recalls)):
    print(f"Fold {i+1}: Acurácia: {a*100:.2f}%, F1-score: {f*100:.2f}%, Precision: {p*100:.2f}%, Recall: {r*100:.2f}%")

print("\nMédias:")
print(f"Acurácia Média: {np.mean(acuracias) * 100:.2f}% (+/- {np.std(acuracias) * 100:.2f}%)")
print(f"F1-score Médio: {np.mean(f1_scores) * 100:.2f}% (+/- {np.std(f1_scores) * 100:.2f}%)")
print(f"Precision Média: {np.mean(precisions) * 100:.2f}% (+/- {np.std(precisions) * 100:.2f}%)")
print(f"Recall Médio: {np.mean(recalls) * 100:.2f}% (+/- {np.std(recalls) * 100:.2f}%)")


Quantidade de features - NMBE: 5
Quantidade de features - MEC: 5
Quantidade de features - TEXTURE: 30

Avaliação no conjunto de teste:
Acurácia: 99.25%
F1-score: 99.24%
Precision: 99.27%
Recall: 99.25%

Melhores hiperparâmetros encontrados:
max_depth: 20
min_samples_leaf: 1
min_samples_split: 2
n_estimators: 500

Validação cruzada (4 folds):
Fold 1: Acurácia: 99.76%, F1-score: 99.76%, Precision: 99.79%, Recall: 99.76%
Fold 2: Acurácia: 99.76%, F1-score: 99.76%, Precision: 99.79%, Recall: 99.76%
Fold 3: Acurácia: 99.41%, F1-score: 99.44%, Precision: 99.61%, Recall: 99.41%
Fold 4: Acurácia: 99.29%, F1-score: 99.30%, Precision: 99.43%, Recall: 99.29%
Fold 5: Acurácia: 99.65%, F1-score: 99.64%, Precision: 99.69%, Recall: 99.65%
Fold 6: Acurácia: 99.53%, F1-score: 99.52%, Precision: 99.58%, Recall: 99.53%
Fold 7: Acurácia: 99.64%, F1-score: 99.64%, Precision: 99.68%, Recall: 99.64%
Fold 8: Acurácia: 99.41%, F1-score: 99.40%, Precision: 99.49%, Recall: 99.41%
Fold 9: Acurácia: 99.41%, F1-sco