# In [None]:
import pickle
import os
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

# --- File loading ---
# Each feature family is stored as a pickled dictionary inside a folder named
# after the database, resolved against the current working directory.
database = 'MED115'
directory = os.path.join(os.getcwd(), database)
dicts_paths = {
    name: os.path.join(directory, db_file)
    for name, db_file in (
        ('nmbe', 'nmbe.db'),
        ('mec', 'mec.db'),
        ('lbp', 'lbp_hf.db'),
    )
}

# Load every dictionary in the order declared above; that order is the order
# in which the feature blocks are concatenated later in the script.
dicts = {}
for name, path in dicts_paths.items():
    with open(path, mode='rb') as f:
        loaded = pickle.load(f)
    dicts[name] = loaded

# --- Dictionary concatenation ---
# Merge the per-family dictionaries into one vector per sample, keyed by the
# file name without its extension. The first vector seen for a sample is kept
# whole (element 0 is read as the label further down); every later vector for
# the same sample contributes everything except its element 0.
concatenated_dict = {}
for d in dicts.values():
    for filename, features in d.items():
        filename_base = os.path.splitext(filename)[0]
        if filename_base not in concatenated_dict:
            # First occurrence: store the vector as-is.
            concatenated_dict[filename_base] = features
            continue
        merged = concatenated_dict[filename_base]
        concatenated_dict[filename_base] = np.concatenate((merged, features[1:]))

# --- Split into features and labels ---
# Element 0 of every merged vector is taken as the class label; the remaining
# elements form the feature vector.
data_labels = [sample[0] for sample in concatenated_dict.values()]
data_features = [sample[1:] for sample in concatenated_dict.values()]

# 70/30 hold-out split. The seed is fixed so the split -- and therefore the
# hyperparameter search fitted on X_train -- is reproducible, in line with the
# random_state=42 already used by the search and the K-fold splitter.
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    data_labels,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# --- Hyperparameters for RandomizedSearch ---
# Candidate values for each setting; passed as param_distributions to the
# randomized search.
param_dist = dict(
    n_estimators=[100, 300, 500, 800, 1000],
    max_depth=[None, 10, 30, 50, 70],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
    class_weight=[None, 'balanced'],
)

# --- RandomizedSearchCV ---
# Try 50 configurations drawn from param_dist, each scored with 4-fold
# cross-validation on the training split using the weighted F1 score.
print("Rodando RandomizedSearchCV para RandomForest...")

# Seed the forest itself: the search's random_state only fixes which
# configurations are sampled, not the randomness inside each forest fit, so
# an unseeded estimator makes scores differ from run to run.
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=4,
    scoring='f1_weighted',
    n_jobs=-1,  # use every available core
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Estimator carrying the best configuration found by the search.
best_estimator = random_search.best_estimator_

# --- Cross-validation with metrics ---
# Re-evaluate the selected configuration with a stratified, shuffled 4-fold
# split, collecting one score per fold for each metric.
kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
metrics = {'accuracy': [], 'f1': [], 'precision': [], 'recall': []}

# Build the arrays once; they do not change between folds, so converting the
# Python lists inside the loop only repeated the same copy on every iteration.
X_all = np.array(data_features)
y_all = np.array(data_labels)

# NOTE(review): the folds span the full dataset, including the rows that the
# hyperparameter search was fitted on, and X_test/y_test are never used --
# confirm this is the intended evaluation protocol.
for train_idx, test_idx in kf.split(data_features, data_labels):
    X_tr, X_te = X_all[train_idx], X_all[test_idx]
    y_tr, y_te = y_all[train_idx], y_all[test_idx]

    # NOTE(review): this refits the very object returned by the search, so
    # after the loop it holds the model trained on the last fold.
    best_estimator.fit(X_tr, y_tr)
    y_pred = best_estimator.predict(X_te)

    metrics['accuracy'].append(accuracy_score(y_te, y_pred))
    metrics['f1'].append(f1_score(y_te, y_pred, average='weighted'))
    metrics['precision'].append(precision_score(y_te, y_pred, average='weighted'))
    metrics['recall'].append(recall_score(y_te, y_pred, average='weighted'))

# --- Results display ---
# Mean and (population) standard deviation of every metric across the folds.
avg_metrics = {k: np.mean(v) for k, v in metrics.items()}
std_metrics = {k: np.std(v) for k, v in metrics.items()}

print("\nMelhores hiperparâmetros encontrados:")
print(random_search.best_params_)

# Walk the collected per-fold scores instead of assuming exactly 4 folds, so
# the report stays correct if the number of splits is changed.
fold_scores = zip(
    metrics['accuracy'], metrics['f1'], metrics['precision'], metrics['recall']
)
for fold_no, (fold_acc, fold_f1, fold_prec, fold_rec) in enumerate(fold_scores, start=1):
    print(f"Fold {fold_no} - Acc: {fold_acc*100:.2f}%, "
          f"F1: {fold_f1*100:.2f}%, "
          f"Precision: {fold_prec*100:.2f}%, "
          f"Recall: {fold_rec*100:.2f}%")

print(f"\nMÉDIAS -> Acc: {avg_metrics['accuracy']*100:.2f}%, "
      f"F1: {avg_metrics['f1']*100:.2f}%, "
      f"Precision: {avg_metrics['precision']*100:.2f}%, "
      f"Recall: {avg_metrics['recall']*100:.2f}%")

print(f"DESVIOS -> Acc: {std_metrics['accuracy']*100:.2f}%, "
      f"F1: {std_metrics['f1']*100:.2f}%, "
      f"Precision: {std_metrics['precision']*100:.2f}%, "
      f"Recall: {std_metrics['recall']*100:.2f}%")
