In [10]:
# ── 1. poner la raíz en sys.path ──────────────────────────────
import os, sys
# Project root is the parent of the notebook's working directory.
ROOT = os.path.abspath(os.path.join(".."))
# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries at the front of sys.path.
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

import numpy as np
import yaml, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from src.full_model_pipeline import make_full_model_pipeline
from src.utils import load_config

# ── 2. load the config ────────────────────────────────────────
# The base config lives under <ROOT>/configs.
config_parts = ("configs", "base_config.yaml")
CONFIG_PATH = os.path.join(ROOT, *config_parts)
cfg = load_config(CONFIG_PATH)

# ── 3. read the CSV through an absolute path ──────────────────
# Directory and file name both come from the config and are anchored
# at ROOT.
raw_dir, train_name = cfg["data_raw_path"], cfg["train_file"]
train_path = os.path.join(ROOT, raw_dir, train_name)
df = pd.read_csv(train_path)

# ── 3.1 map the string target to 0/1 ──────────────────────────
mapping = {
    "Cancelada":     1,
    "No Cancelada":  0
}
target_raw = df[cfg["target_column"]]
y = target_raw.map(mapping)

# Labels missing from `mapping` become NaN after .map(); fail loudly.
unmapped = y.isna()
if unmapped.any():
    # Report the ORIGINAL labels: the mapped series only holds NaN at
    # these positions, so printing it would hide the offending values.
    raise ValueError("Se detectaron valores de target fuera del mapping: "
                     f"{target_raw[unmapped].unique()}")

# Features = everything except the id columns and the target itself.
X = df.drop(columns=cfg["id_columns"] + [cfg["target_column"]])

# ── 4. build the pipeline and evaluate ────────────────────────
pipe = make_full_model_pipeline(CONFIG_PATH)

# 5-fold stratified CV with a fixed seed; `oof` collects the hard
# out-of-fold class predictions, one slot per training row.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(X.shape[0], dtype=int)

for fit_rows, holdout_rows in skf.split(X, y):
    X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
    pipe.fit(X_fit, y_fit)
    # Each row is predicted by the fit that did not train on it.
    oof[holdout_rows] = pipe.predict(X.iloc[holdout_rows])

weighted_f1 = f1_score(y, oof, average="weighted")
print("Weighted F1 OOF:", weighted_f1)


TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'

In [None]:
# ── 1. poner la raíz en sys.path ──────────────────────────────
import os, sys
# Project root is the parent of the working directory (adjust if needed).
ROOT = os.path.abspath("..")
# Only prepend ROOT when it is not already on the import path.
if sys.path.count(ROOT) == 0:
    sys.path.insert(0, ROOT)

import numpy as np
import yaml, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from src.full_model_pipeline import make_full_model_pipeline # Asumo que toma dict cfg
from src.utils import load_config
import copy

# ── 2. load the BEST config found by Optuna ────────────────────
BEST_CONFIG_PATH = os.path.join(
    ROOT, "configs", "best_config_optuna_100.yaml"  # <- uses the new file
)
cfg_best = load_config(BEST_CONFIG_PATH)
print("Configuración Optimizada cargada.")

# ── 3. read the CSV and prepare the data (same as before) ──────
train_path = os.path.join(ROOT, cfg_best["data_raw_path"], cfg_best["train_file"])
df_train_full = pd.read_csv(train_path)

# Map the string target to 0/1; labels absent from `mapping` become NaN.
mapping = {"Cancelada": 1, "No Cancelada": 0}
target_name = cfg_best["target_column"]
raw_target = df_train_full[target_name]
y_true_full = raw_target.map(mapping)

missing_mask = y_true_full.isna()
if missing_mask.any():
    # Show the original labels that had no entry in `mapping`.
    raise ValueError(f"Target con NaNs tras mapping: {raw_target[missing_mask].unique()}")

# Features = everything except the id columns and the target.
non_feature_cols = cfg_best["id_columns"] + [target_name]
X_full = df_train_full.drop(columns=non_feature_cols)
print(f"Datos preparados: X_full.shape={X_full.shape}, y_true_full.shape={y_true_full.shape}")

# ── 4. Run CV ONCE to collect OOF probabilities for threshold tuning ──
print("\n--- Ejecutando CV con la configuración optimizada para obtener OOF Probs ---")

# make_full_model_pipeline is called with a config PATH here, so the
# in-memory config is serialised to a temporary YAML file first.
# If the factory accepts a dict instead, the temp file can be dropped:
#     pipe = make_full_model_pipeline(cfg_best)
TEMP_CONFIG_PATH_FINAL = os.path.join(ROOT, "configs", "temp_final_config.yaml")

# One slot per training row for the positive-class probability.
oof_probs = np.zeros(len(X_full))

# Take fold count and seed from the config so the folds are reproducible.
n_splits = cfg_best.get("cv_n_splits", 5)
skf = StratifiedKFold(n_splits=n_splits,
                      shuffle=True,
                      random_state=cfg_best.get("random_seed", 42))

try:
    with open(TEMP_CONFIG_PATH_FINAL, "w", encoding="utf-8") as f:
        yaml.dump(cfg_best, f)
    pipe = make_full_model_pipeline(TEMP_CONFIG_PATH_FINAL)

    print("Iniciando validación cruzada...")
    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_full, y_true_full)):
        X_train, X_val = X_full.iloc[tr_idx], X_full.iloc[va_idx]
        y_train, y_val = y_true_full.iloc[tr_idx], y_true_full.iloc[va_idx]

        # Report progress against the configured fold count.
        print(f"  Entrenando Fold {fold + 1}/{n_splits}...")

        # Optional early stopping (recommended when n_estimators is high);
        # not enabled because it is not configured yet:
        #     fit_params = {
        #         'model__eval_set': [(X_val, y_val)],
        #         'model__callbacks': [lgb.early_stopping(50, verbose=False)],
        #     }
        #     pipe.fit(X_train, y_train, **fit_params)
        pipe.fit(X_train, y_train)

        # Keep the probability of the positive class (1) for threshold tuning.
        oof_probs[va_idx] = pipe.predict_proba(X_val)[:, 1]
finally:
    # Always remove the temporary config, even when building the pipeline
    # or fitting a fold raises.
    if os.path.exists(TEMP_CONFIG_PATH_FINAL):
        os.remove(TEMP_CONFIG_PATH_FINAL)
    
print("Validación cruzada completada. Calculando umbral óptimo...")

# Threshold sweep over all OOF predictions. Two optima are tracked
# independently: weighted F1 and F1 of the positive class only.
positive_class_value = 1  # assumes "Cancelada" is encoded as 1
best_f1_weighted, best_threshold_weighted = 0, 0.5
best_f1_class1, best_threshold_class1 = 0, 0.5

for thr in np.arange(0.1, 0.9, 0.01):
    # Binarise the probabilities at the candidate cut-off.
    hard_preds = (oof_probs >= thr).astype(int)

    score_weighted = f1_score(y_true_full, hard_preds, average="weighted")
    score_positive = f1_score(
        y_true_full, hard_preds, average="binary", pos_label=positive_class_value
    )

    # Strict ">" keeps the first (lowest) threshold on ties.
    if score_weighted > best_f1_weighted:
        best_f1_weighted, best_threshold_weighted = score_weighted, thr
    if score_positive > best_f1_class1:
        best_f1_class1, best_threshold_class1 = score_positive, thr

# --- Final results ---
# The weighted-F1 optimum is the threshold kept for the final predictions.
final_threshold = best_threshold_weighted

report_lines = [
    "\n--- Resultados con Configuración Optimizada por Optuna ---",
    f"Mejor F1 OOF Ponderado (calculado aquí): {best_f1_weighted:.6f}",
    f"Umbral Óptimo para F1 Ponderado: {best_threshold_weighted:.2f}",
    f"\nMejor F1 OOF Clase Positiva ('Cancelada'): {best_f1_class1:.6f}",
    f"Umbral Óptimo para F1 Clase Positiva: {best_threshold_class1:.2f}",
    f"\n==> Umbral final a usar para predicciones: {final_threshold:.2f}",
]
print("\n".join(report_lines))

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ricardomelendez/Documents/hotel-cancellation-prediction/configs/best_config_optuna.yaml'