In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix , auc, precision_recall_curve, roc_auc_score, balanced_accuracy_score
from scipy.stats import uniform, randint
from sklearn.model_selection import ParameterGrid, cross_validate

In [None]:
# Load the training features (x) and the training target (y) from CSV.
x_train = pd.read_csv('../../assets/gold/experimentos_finales/x_train.csv')
y_train = pd.read_csv('../../assets/gold/experimentos_finales/y_train.csv')

In [None]:
# The set this notebook calls VALIDATION is read from the x_test.csv / y_test.csv files.
x_validation = pd.read_csv('../../assets/gold/experimentos_finales/x_test.csv')
y_validation = pd.read_csv('../../assets/gold/experimentos_finales/y_test.csv')

In [None]:
# Print the dtype of every training column as a Markdown table.
# NOTE(review): pandas' to_markdown relies on the optional `tabulate` package — confirm it is installed.
print(x_train.dtypes.to_markdown())

Vamos a hacer el primer experimento "real"

Objetivo: tener el modelo base y de acá queremos mejorar.
* Divido en TRAIN (80% de los datos) y VALIDATION (20% de los datos) respetando la distribución de la variable target: como lo hacemos una sola vez, nos arriesgamos a que la división sea mala <-- ya lo hice arriba

* Entrenamos 8 modelos Random Forest SIN bootstrap usando los datos de TRAIN. Siempre dejamos altura máxima 3
Los 8 modelos salen de probar las combinaciones:
5, 10, 15 y 20 árboles
"gini", "entropy"

* Presento resultados con los datos de VALIDATION: veo en general los valores de AUC ROC, accuracy y balanced accuracy. Además, para cada Random Forest (de 5, 10, 15 y 20 árboles) veo la variación de esas métricas entre sus árboles individuales (media y desvío estándar)
Hago gráfico de importancia de atributos tomando la métrica gini

Se corre el riesgo de que ese 20% de VALIDATION sea una mala muestra y manche el experimento 1 --> después volvemos a ver si el experimento 2 combate estos problemas

In [None]:
# Candidate values for each Random Forest hyperparameter in this experiment.
# Only the number of trees (4 values) and the split criterion (2 values) vary,
# which gives 8 combinations; every other hyperparameter has a single value.
max_arboles = [5, 10, 15, 20]  # used as n_estimators
metricas = ["gini", "entropy"]  # used as criterion
bootstrap = [False]  # forests are trained without bootstrap
max_depth = [3]  # maximum tree depth is always 3
min_samples_split= [2]
min_samples_leaf = [1]
max_features = ['sqrt']


def crear_grilla(max_arboles, metricas, bootstrap, min_samples_split, min_samples_leaf, max_features, alturas_maximas=None):
    """
    Build every hyperparameter combination to try for the Random Forest.

    Args:
        max_arboles: Candidate values for ``n_estimators``.
        metricas: Candidate values for ``criterion``.
        bootstrap: Candidate values for ``bootstrap``.
        min_samples_split: Candidate values for ``min_samples_split``.
        min_samples_leaf: Candidate values for ``min_samples_leaf``.
        max_features: Candidate values for ``max_features``.
        alturas_maximas: Candidate values for ``max_depth``. When omitted, the
            notebook-level ``max_depth`` list is used, which is what this
            function always read before the parameter existed.

    Returns:
        list: One dict of keyword arguments per combination, as yielded by
        ``ParameterGrid``.
    """
    if alturas_maximas is None:
        # Backward compatibility: existing calls pass six arguments and rely on
        # the notebook-level `max_depth` variable.
        alturas_maximas = max_depth

    parametros = {
        'n_estimators': max_arboles,
        'criterion': metricas,
        'bootstrap': bootstrap,
        'max_depth': alturas_maximas,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features
    }
    return list(ParameterGrid(parametros))

# Build the grid once. `max_depth` is not passed here: crear_grilla picks up
# the notebook-level `max_depth` list on its own.
grilla_parametros = crear_grilla(max_arboles, metricas, bootstrap,min_samples_split, min_samples_leaf, max_features)
# Bare expression so the cell displays the list of combinations.
grilla_parametros

In [None]:
def auc_roc_entre_arboles(modelo, x, y):
    """
    Mean and standard deviation of the AUC ROC scored by each tree of a forest.

    Args:
        modelo: Fitted ensemble exposing its trees through ``estimators_``.
        x: Feature table; its ``.values`` array is what every tree receives.
        y: True labels, compared against each tree's probability for the
            second column of ``predict_proba``.

    Returns:
        tuple: ``(mean, std)`` of the per-tree AUC ROC values.
    """
    # One AUC per tree, scored on the probability in column 1.
    puntajes_por_arbol = np.array([
        roc_auc_score(y, arbol.predict_proba(x.values)[:, 1])
        for arbol in modelo.estimators_
    ])
    return puntajes_por_arbol.mean(), puntajes_por_arbol.std()

def accuracy_y_balance_accuracy_entre_arboles(modelo, x, y):
    """
    Accuracy and balanced accuracy of each tree of a forest, summarised.

    Args:
        modelo: Fitted ensemble exposing its trees through ``estimators_``.
        x: Feature table; its ``.values`` array is what every tree receives.
        y: True labels the per-tree predictions are scored against.

    Returns:
        tuple: ``(accuracy mean, accuracy std, balanced accuracy mean,
        balanced accuracy std)`` across the individual trees.
    """
    # Predict once per tree and reuse the predictions for both metrics.
    predicciones = [arbol.predict(x.values) for arbol in modelo.estimators_]

    exactitudes = np.array([accuracy_score(y, pred) for pred in predicciones])
    exactitudes_balanceadas = np.array(
        [balanced_accuracy_score(y, pred) for pred in predicciones]
    )

    return (
        exactitudes.mean(),
        exactitudes.std(),
        exactitudes_balanceadas.mean(),
        exactitudes_balanceadas.std(),
    )

def _graficar_barras(serie, titulo, rotacion, destino, figsize=None):
    """Draw `serie` as a bar chart on its own figure, save it to `destino`, show it and close it."""
    # A dedicated figure per chart: nothing is drawn on top of a figure that a
    # previous plt.show() left open.
    fig, ax = plt.subplots(figsize=figsize)
    serie.plot(kind='bar', ax=ax)
    ax.set_title(titulo)
    ax.set_ylabel("Importancia")
    ax.set_xlabel("Atributos")
    # Rotate the tick labels so long attribute names stay readable.
    plt.setp(ax.get_xticklabels(), rotation=rotacion, ha='right')
    fig.tight_layout()
    fig.savefig(destino)
    plt.show()
    # Release the figure; this function is called once per model inside a loop.
    plt.close(fig)


def grafico_importancia_atributos(index, modelo, x_train, graf1, graf2, top_n=20):
    """
    Plot, save and show the attribute importances of a fitted model.

    Two bar charts are produced, both sorted from most to least important:
    one with every attribute and one with only the ``top_n`` first ones.

    Args:
        index: Identifier of the model, shown in both chart titles.
        modelo: Fitted model exposing ``feature_importances_``.
        x_train: Training data; its column names label the importances.
        graf1: Path where the chart with every attribute is saved.
        graf2: Path where the chart with the ``top_n`` attributes is saved.
        top_n: Number of attributes shown in the second chart (default 20).
    """
    importancias = pd.Series(modelo.feature_importances_, index=x_train.columns)
    # Sort once and reuse the result for both charts.
    importancias_ordenadas = importancias.sort_values(ascending=False)

    _graficar_barras(
        importancias_ordenadas,
        f"Importancia de atributos - Random Forest {index}",
        rotacion=90,
        destino=graf1,
        figsize=(15, 6),  # width x height in inches
    )

    _graficar_barras(
        importancias_ordenadas.head(top_n),
        f"Top {top_n} Importancia de Atributos - Random Forest {index}",
        rotacion=45,
        destino=graf2,
    )

In [None]:
# Experiment 1, part A: fit one Random Forest per grid combination on TRAIN
# (all attributes) and measure it on TRAIN and on VALIDATION.
resultados = []

# 1-D label vectors. NOTE(review): assumes the y CSVs hold a single column — confirm.
y_para_entrenar = y_train.squeeze()
y_validation_para_metricas = y_validation.squeeze()

# Every artefact of this part (charts, pickled models) goes to this folder.
carpeta_resultados = '../../assets/resultados_modelos/experimento_1_v2'

for i, params in enumerate(grilla_parametros):
    # Single TRAIN/VALIDATION split: no cross-validation is performed here.
    # TODO: review these hyperparameters.
    modelo = RandomForestClassifier(random_state=42, **params)
    modelo.fit(x_train, y_para_entrenar)

    # Plot the attribute importances (full chart and top-20 chart).
    grafico_importancia_atributos(
        i,
        modelo,
        x_train,
        f'{carpeta_resultados}/importancia_atributos_random_forest_sin_boostrap_{i}.png',
        f'{carpeta_resultados}/importancia_atributos_random_forest_sin_boostrap_{i}_top_20.png',
    )

    joblib.dump(modelo, f'{carpeta_resultados}/random_forest_sin_boostrap_{i}.pkl')
    print(f"Modelo guardado {i}")

    # Class predictions of the whole forest.
    y_train_pred = modelo.predict(x_train)
    y_validation_pred = modelo.predict(x_validation)

    # Probability of the positive class, needed for AUC ROC.
    y_train_proba = modelo.predict_proba(x_train)[:, 1]
    y_validation_proba = modelo.predict_proba(x_validation)[:, 1]

    # Per-tree statistics, computed once per split: each helper call makes
    # every tree of the forest predict again.
    (media_acc_train, desvio_acc_train,
     media_bal_acc_train, desvio_bal_acc_train) = accuracy_y_balance_accuracy_entre_arboles(
        modelo, x_train, y_para_entrenar)
    (media_acc_test, desvio_acc_test,
     media_bal_acc_test, desvio_bal_acc_test) = accuracy_y_balance_accuracy_entre_arboles(
        modelo, x_validation, y_validation_para_metricas)
    media_auc_train, desvio_auc_train = auc_roc_entre_arboles(modelo, x_train, y_para_entrenar)
    media_auc_test, desvio_auc_test = auc_roc_entre_arboles(
        modelo, x_validation, y_validation_para_metricas)

    # Key order defines the column order of the results table / CSV.
    resultados.append({
        'params': params,
        'bootstrap': params['bootstrap'],
        'criterion': params['criterion'],
        'n_estimators': params['n_estimators'],
        'train_accuracy': accuracy_score(y_para_entrenar, y_train_pred),
        'test_accuracy': accuracy_score(y_validation_para_metricas, y_validation_pred),
        'train_balanced_accuracy': balanced_accuracy_score(y_para_entrenar, y_train_pred),
        'test_balanced_accuracy': balanced_accuracy_score(y_validation_para_metricas, y_validation_pred),
        'train_auc': roc_auc_score(y_para_entrenar, y_train_proba),
        'test_auc': roc_auc_score(y_validation_para_metricas, y_validation_proba),
        'mean_accuracy_train': media_acc_train,
        'std_accuracy_train': desvio_acc_train,
        'mean_accuracy_test': media_acc_test,
        'std_accuracy_test': desvio_acc_test,
        'mean_balanced_accuracy_test': media_bal_acc_test,
        'std_balanced_accuracy_test': desvio_bal_acc_test,
        'mean_balanced_accuracy_train': media_bal_acc_train,
        'std_balanced_accuracy_train': desvio_bal_acc_train,
        'mean_auc_test': media_auc_test,
        'std_auc_test': desvio_auc_test,
        'mean_auc_train': media_auc_train,
        'std_auc_train': desvio_auc_train,
    })

df_resultados = pd.DataFrame(resultados)
# Displayed sorted by validation AUC; df_resultados itself keeps the grid order.
df_resultados.sort_values(by="test_auc", ascending=False)

In [None]:
# Save the part-A metrics table (one row per grid combination) as CSV, without the index column.
df_resultados.to_csv('../../assets/resultados_modelos/experimento_1_v2/resultados_random_forest_sin_boostrap.csv', index=False)

In [None]:
# Show the part-A metrics table in grid order (the earlier sort_values result was never assigned back).
df_resultados

### Parte B del experimento

Quiero ver esto mismo pero sacando los datos que "manchan" el resto de mis resultados

In [None]:
# Attributes removed for part B of the experiment. The list is declared once
# and applied to both splits so TRAIN and VALIDATION cannot drift apart.
columnas_importantes = [
    'finales_inscriptos_0.0', 'finales_inscriptos_1.0', 'finales_inscriptos_2.0', 'finales_inscriptos_3.0',
    'inscripciones_0.0', 'inscripciones_1.0', 'inscripciones_2.0', 'inscripciones_3.0',
    'tp_aprobados_0.0', 'tp_aprobados_1.0', 'tp_aprobados_2.0', 'tp_aprobados_3.0',
    'nota_final_materia_1', 'nota_final_materia_2', 'nota_final_materia_3',
    'nota_final_materia_4', 'nota_final_materia_5', 'nota_final_materia_6',
    'nota_final_materia_7', 'nota_final_materia_8', 'nota_final_materia_9',
    'nota_final_materia_10',
    'tp_aprobado_materia_1', 'tp_aprobado_materia_2', 'tp_aprobado_materia_3',
    'tp_aprobado_materia_4', 'tp_aprobado_materia_5', 'tp_aprobado_materia_6',
    'tp_aprobado_materia_7', 'tp_aprobado_materia_8', 'tp_aprobado_materia_9',
    'tp_aprobado_materia_10',
]

# drop() returns new frames; x_train and x_validation are left untouched.
x_train_sin_importantes = x_train.drop(columns=columnas_importantes)
x_validation_sin_importantes = x_validation.drop(columns=columnas_importantes)

In [None]:
# Sanity check: columns of the reduced TRAIN set that are missing from the
# reduced VALIDATION set. An empty Index means every TRAIN column is also in VALIDATION.
x_train_sin_importantes.columns[~x_train_sin_importantes.columns.isin(x_validation_sin_importantes.columns)]

In [None]:
# Show the attributes that remain in the reduced VALIDATION set after the drop.
x_validation_sin_importantes.columns

In [None]:
# Experiment 1, part B: same procedure as part A, but trained and evaluated on
# the reduced attribute sets (x_train_sin_importantes / x_validation_sin_importantes).
resultados = []

# 1-D label vectors. NOTE(review): assumes the y CSVs hold a single column — confirm.
y_para_entrenar = y_train.squeeze()
y_validation_para_metricas = y_validation.squeeze()

# Every artefact of this part (charts, pickled models) goes to this folder.
carpeta_resultados = '../../assets/resultados_modelos/experimento_1_b_solo_fechas'

for i, params in enumerate(grilla_parametros):
    # Single TRAIN/VALIDATION split: no cross-validation is performed here.
    # TODO: review these hyperparameters.
    modelo = RandomForestClassifier(random_state=42, **params)
    modelo.fit(x_train_sin_importantes, y_para_entrenar)

    # Plot the attribute importances (full chart and top-20 chart).
    grafico_importancia_atributos(
        i,
        modelo,
        x_train_sin_importantes,
        f'{carpeta_resultados}/importancia_atributos_random_forest_sin_boostrap_{i}.png',
        f'{carpeta_resultados}/importancia_atributos_random_forest_sin_boostrap_{i}_top_20.png',
    )

    joblib.dump(modelo, f'{carpeta_resultados}/random_forest_sin_boostrap_{i}.pkl')
    print(f"Modelo guardado {i}")

    # Class predictions of the whole forest.
    y_train_pred = modelo.predict(x_train_sin_importantes)
    y_validation_pred = modelo.predict(x_validation_sin_importantes)

    # Probability of the positive class, needed for AUC ROC.
    y_train_proba = modelo.predict_proba(x_train_sin_importantes)[:, 1]
    y_validation_proba = modelo.predict_proba(x_validation_sin_importantes)[:, 1]

    # Per-tree statistics, computed once per split: each helper call makes
    # every tree of the forest predict again.
    (media_acc_train, desvio_acc_train,
     media_bal_acc_train, desvio_bal_acc_train) = accuracy_y_balance_accuracy_entre_arboles(
        modelo, x_train_sin_importantes, y_para_entrenar)
    (media_acc_test, desvio_acc_test,
     media_bal_acc_test, desvio_bal_acc_test) = accuracy_y_balance_accuracy_entre_arboles(
        modelo, x_validation_sin_importantes, y_validation_para_metricas)
    media_auc_train, desvio_auc_train = auc_roc_entre_arboles(
        modelo, x_train_sin_importantes, y_para_entrenar)
    media_auc_test, desvio_auc_test = auc_roc_entre_arboles(
        modelo, x_validation_sin_importantes, y_validation_para_metricas)

    # Key order defines the column order of the results table / CSV.
    resultados.append({
        'params': params,
        'bootstrap': params['bootstrap'],
        'criterion': params['criterion'],
        'n_estimators': params['n_estimators'],
        'train_accuracy': accuracy_score(y_para_entrenar, y_train_pred),
        'test_accuracy': accuracy_score(y_validation_para_metricas, y_validation_pred),
        'train_balanced_accuracy': balanced_accuracy_score(y_para_entrenar, y_train_pred),
        'test_balanced_accuracy': balanced_accuracy_score(y_validation_para_metricas, y_validation_pred),
        'train_auc': roc_auc_score(y_para_entrenar, y_train_proba),
        'test_auc': roc_auc_score(y_validation_para_metricas, y_validation_proba),
        'mean_accuracy_train': media_acc_train,
        'std_accuracy_train': desvio_acc_train,
        'mean_accuracy_test': media_acc_test,
        'std_accuracy_test': desvio_acc_test,
        'mean_balanced_accuracy_test': media_bal_acc_test,
        'std_balanced_accuracy_test': desvio_bal_acc_test,
        'mean_balanced_accuracy_train': media_bal_acc_train,
        'std_balanced_accuracy_train': desvio_bal_acc_train,
        'mean_auc_test': media_auc_test,
        'std_auc_test': desvio_auc_test,
        'mean_auc_train': media_auc_train,
        'std_auc_train': desvio_auc_train,
    })

df_resultados_b = pd.DataFrame(resultados)

In [None]:
# Show the part-B metrics table (one row per grid combination, in grid order).
df_resultados_b

In [None]:
# Save the part-B metrics table as CSV, without the index column.
df_resultados_b.to_csv('../../assets/resultados_modelos/experimento_1_b_solo_fechas/resultados_random_forest_sin_boostrap_sin_importantes.csv', index=False)