In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix , auc, precision_recall_curve, roc_auc_score, balanced_accuracy_score
from scipy.stats import uniform, randint
from sklearn.model_selection import ParameterGrid, cross_validate
import matplotlib.pyplot as plt

In [None]:
# Load the final "gold" dataset from CSV; the path is relative to the notebook's working directory.
# The experiment cell further down expects it to contain a 'target' column.
dataset_gold = pd.read_csv('../../assets/gold/experimentos_finales/dataset_final.csv')

In [None]:
# Candidate values for each RandomForestClassifier hyperparameter explored by the grid.
# Only the number of trees and the split criterion vary; every other list holds one fixed value.
max_arboles = [5, 10, 15, 20]      # n_estimators
metricas = ["gini", "entropy"]     # criterion
bootstrap = [True]                 # bootstrap
max_depth = [3]                    # max_depth
min_samples_split = [2]            # min_samples_split
min_samples_leaf = [1]             # min_samples_leaf
max_features = ['sqrt']            # max_features


def crear_grilla(max_arboles, metricas, bootstrap, min_samples_split, min_samples_leaf, max_features, max_depth=None):
    """Build the list of every hyperparameter combination to evaluate.

    Each argument is a list of candidate values for one hyperparameter; the
    result is the full cartesian product of those lists.

    Args:
        max_arboles: candidate values for ``n_estimators``.
        metricas: candidate values for ``criterion``.
        bootstrap: candidate values for ``bootstrap``.
        min_samples_split: candidate values for ``min_samples_split``.
        min_samples_leaf: candidate values for ``min_samples_leaf``.
        max_features: candidate values for ``max_features``.
        max_depth: candidate values for ``max_depth``. When omitted (``None``),
            the module-level ``max_depth`` list is used. That is the value this
            function silently read before the parameter existed, so existing
            calls keep their behavior.

    Returns:
        A list of dicts, one per combination, as produced by ``ParameterGrid``.
    """
    if max_depth is None:
        # Backward-compatible fallback: older callers relied on the global
        # `max_depth` being picked up implicitly. Prefer passing it explicitly.
        max_depth = globals()['max_depth']

    parametros = {
        'n_estimators': max_arboles,
        'criterion': metricas,
        'bootstrap': bootstrap,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
    }
    return list(ParameterGrid(parametros))

# Materialize every hyperparameter combination and show the list as the cell output.
grilla_parametros = crear_grilla(
    max_arboles=max_arboles,
    metricas=metricas,
    bootstrap=bootstrap,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    max_features=max_features,
)
grilla_parametros

## Experimento 2

5 iteraciones de Random Forest con bootstrap. En cada iteración se elige un 20% de los datos como test.

repetir 5 veces:

    test_i = tomar un 20% de test al azar  (uno distinto en cada iteracion)
    
    x_i = realizar experimento con bootstrapping (test_i)


Se toma como resultado el promedio de las 5 iteraciones. El propósito es reducir la varianza y no depender de un test poco afortunado y, si es posible, obtener mejor performance.
El experimento 2 no devuelve las métricas de los árboles individuales del Random Forest.


In [None]:
# Experiment 2: for every hyperparameter combination, train a random forest on
# several stratified 80/20 splits and aggregate train/test metrics and feature
# importances (mean and standard deviation across the splits).
resultados = []
importancias_resultados = []

# Loop-invariant inputs, computed once instead of on every repetition.
# Index.difference returns the feature names sorted, which is also the column
# order the models are trained on (and therefore the order of feature_importances_).
columnas = dataset_gold.columns.difference(['target'])
X_completo = dataset_gold[columnas]
y_completo = dataset_gold['target']

n_repeticiones = 5
nombres_metricas = ('accuracy', 'balanced_accuracy', 'auc')


def _metricas_split(modelo, X, y):
    """Return accuracy, balanced accuracy and ROC AUC of a fitted classifier on (X, y)."""
    y_pred = modelo.predict(X)
    # Column 1 of predict_proba is used as the positive-class score.
    # NOTE(review): this assumes a binary target - confirm against the dataset.
    y_proba = modelo.predict_proba(X)[:, 1]
    return {
        'accuracy': accuracy_score(y, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y, y_pred),
        'auc': roc_auc_score(y, y_proba),
    }


for params in grilla_parametros:
    # One list of per-repetition scores for each (partition, metric) pair.
    puntajes = {
        (particion, metrica): []
        for particion in ('train', 'test')
        for metrica in nombres_metricas
    }
    importancias_atributos = []
    print(f"Evaluando parámetros: {params}")

    # Repeat with a different hold-out each time: random_state=i gives a
    # different but reproducible stratified 20% test set per repetition.
    for i in range(n_repeticiones):
        X_train_base, X_test, y_train_base, y_test = train_test_split(
            X_completo, y_completo, test_size=0.2, random_state=i, stratify=y_completo
        )

        modelo = RandomForestClassifier(random_state=42, **params)
        modelo.fit(X_train_base, y_train_base)

        # Keep this repetition's feature importances for later averaging.
        importancias_atributos.append(modelo.feature_importances_)

        particiones = (('train', X_train_base, y_train_base), ('test', X_test, y_test))
        for particion, X_part, y_part in particiones:
            for metrica, valor in _metricas_split(modelo, X_part, y_part).items():
                puntajes[(particion, metrica)].append(valor)

    # Aggregate across repetitions. Key order is kept as
    # train_<m>_mean, test_<m>_mean, train_<m>_std, test_<m>_std for each metric,
    # so the resulting table has the same column layout as before.
    fila = {
        'params': params,
        'bootstrap': params['bootstrap'],
        'criterion': params['criterion'],
        'n_estimators': params['n_estimators'],
    }
    for metrica in nombres_metricas:
        for estadistico, agregador in (('mean', np.mean), ('std', np.std)):
            for particion in ('train', 'test'):
                fila[f'{particion}_{metrica}_{estadistico}'] = agregador(puntajes[(particion, metrica)])

    print(f"Resultados para {params}:")
    # The header above used to be printed with nothing after it; show the mean metrics.
    print("  " + ", ".join(
        f"{clave}={valor:.4f}" for clave, valor in fila.items() if clave.endswith('_mean')
    ))

    resultados.append(fila)

    # Mean and standard deviation of each feature's importance across repetitions.
    importancias_array = np.array(importancias_atributos)
    importancia_media = np.mean(importancias_array, axis=0)
    importancia_std = np.std(importancias_array, axis=0)

    # Flat record: mean_<feature> columns, then std_<feature> columns, then the
    # hyperparameters themselves for reference.
    importancia_flat = {f"mean_{col}": valor for col, valor in zip(columnas, importancia_media)}
    importancia_flat.update({f"std_{col}": valor for col, valor in zip(columnas, importancia_std)})
    importancia_flat.update(params)

    importancias_resultados.append(importancia_flat)

In [None]:
# Turn the per-configuration metric records into a DataFrame (one row per hyperparameter
# combination) and display it. Note that this rebinds `resultados` from a list of dicts
# to a DataFrame.
resultados = pd.DataFrame(resultados)
resultados

In [None]:
# Persist the aggregated metrics table as CSV, without writing the index column.
resultados.to_csv('../../assets/resultados_modelos/experimento_2_v2/resultados_random_forest.csv', index=False)

In [None]:
# Build the feature-importance table: one row per hyperparameter combination, with
# mean_<feature> / std_<feature> columns plus the hyperparameter values themselves.
df_importancias = pd.DataFrame(importancias_resultados)

In [None]:
# Display the full feature-importance table as the cell output.
df_importancias

In [None]:
# Keep only the columns whose name starts with "mean_" (the per-feature mean importances)
# and display that slice of the table.
mean_columns = list(filter(lambda nombre: nombre.startswith('mean_'), df_importancias.columns))
df_importancias[mean_columns]

In [None]:
# Persist the feature-importance table as CSV, without writing the index column.
df_importancias.to_csv('../../assets/resultados_modelos/experimento_2_v2/importancias_atributos.csv', index=False)

In [None]:
# One bar chart per hyperparameter configuration (one row of df_importancias),
# showing the mean importance of every feature.
mean_columns = list(filter(lambda nombre: nombre.startswith('mean_'), df_importancias.columns))

for idx, row in df_importancias.iterrows():
    importancias = row[mean_columns]
    # Strip the "mean_" prefix so the x axis shows plain feature names.
    columnas = [nombre.replace("mean_", "") for nombre in mean_columns]

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.bar(columnas, importancias)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel("Importancia media")
    ax.set_title(f"Importancia de atributos - Configuración #{idx}")
    fig.tight_layout()
    plt.show()


In [None]:
# For each hyperparameter configuration, plot and save the 20 features with the
# highest mean importance.
mean_columns = list(filter(lambda nombre: nombre.startswith('mean_'), df_importancias.columns))

for idx, row in df_importancias.iterrows():
    importancias = row[mean_columns]

    # Sort descending and keep the first 20 entries.
    top_importancias = importancias.sort_values(ascending=False).head(20)
    columnas_top = [nombre.replace("mean_", "") for nombre in top_importancias.index]

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.bar(columnas_top, top_importancias.values)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.set_ylabel("Importancia media")
    ax.set_title(f"Top 20 atributos - Configuración #{idx}")
    fig.tight_layout()
    # NOTE(review): the file name says "sin_boostrap" while the grid defined earlier in this
    # notebook uses bootstrap=[True] - confirm the intended name before renaming the artifact.
    fig.savefig(f'../../assets/resultados_modelos/experimento_2_v2/importancia_atributos_random_forest_sin_boostrap_{idx}_top_20.png')
    plt.show()
