# 03_Validación y Permutation Importance

En este notebook validamos el pipeline final usando Stratified K-Fold, buscamos el umbral óptimo de decisión y calculamos la importancia de features por permutación **dentro de cada fold** para obtener un ranking robusto.

## Setup e Imports

In [21]:
import os, sys
import yaml
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score
from sklearn.inspection import permutation_importance

# aseguramos src en el path
ROOT = os.path.abspath(os.path.join(".."))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from src.utils import load_config
from src.full_model_pipeline import make_full_model_pipeline

# ── Load configuration and data ────────────────────────────────
cfg = load_config(os.path.join(ROOT, "configs", "best_config_optuna_100.yaml"))
df = pd.read_csv(os.path.join(ROOT, cfg["data_raw_path"], cfg["train_file"]))

# Map the string target to 0/1. Series.map yields NaN for any label that is
# not a key of `mapping`, so fail fast here instead of letting NaNs reach the
# stratified splitter and the model (same guard the other loading cells use).
mapping = {"Cancelada": 1, "No Cancelada": 0}
y = df[cfg["target_column"]].map(mapping)
if y.isna().any():
    unmapped = df.loc[y.isna(), cfg["target_column"]].unique()
    raise ValueError(f"Hay valores no mapeados en el target: {unmapped}")

# Features = everything except the identifier columns and the target.
X = df.drop(columns=cfg["id_columns"] + [cfg["target_column"]])
print(f"Datos: X.shape={X.shape}, y.shape={y.shape}")


Datos: X.shape=(31922, 17), y.shape=(31922,)


## K-Fold CV y Umbral Óptimo

In [22]:
# Stratified K-Fold driven by the config (defaults: 5 folds, seed 42).
n_splits = cfg.get("cv_n_splits", 5)
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=cfg.get("random_seed", 42),
)

# Out-of-fold probabilities: every row is scored by the pipeline that was
# fitted on the folds that do not contain it.
oof_probs = np.zeros(len(X))
for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    # A fresh pipeline per fold so nothing fitted leaks between folds.
    pipe = make_full_model_pipeline(cfg)
    pipe.fit(X_tr, y_tr)

    fold_probs = pipe.predict_proba(X_va)
    oof_probs[va_idx] = fold_probs[:, 1]  # column 1 = positive class
    print(f" Fold {fold}/{n_splits} hecho.")

# Scan candidate thresholds and keep the first one that reaches the highest
# weighted F1 on the OOF predictions (strict ">" means ties keep the lower one).
best_thr, best_f1 = 0.5, 0
for thr in np.arange(0.1, 0.9, 0.01):
    hard_preds = (oof_probs >= thr).astype(int)
    f1 = f1_score(y, hard_preds, average="weighted")
    if f1 > best_f1:
        best_f1, best_thr = f1, thr

print(f"\n→ Mejor F1 OOF (weighted): {best_f1:.4f} @ threshold={best_thr:.2f}")


Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
 Fold 1/5 hecho.
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
 Fold 2/5 hecho.
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
 Fold 3/5 hecho.
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin

## Permutation Importance con K-Fold

In [23]:
# Compute permutation importance inside every fold, then average across folds.
# Each fold's result is stored as a Series indexed by that fold's own
# transformed column names, so folds are aligned BY NAME when stacked. This
# avoids labelling every fold with the last fold's columns, which would
# mislabel (or crash) if a fold ever produced a different set/order of
# encoded columns.
importances = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    pipe = make_full_model_pipeline(cfg)
    pipe.fit(X_tr, y_tr)

    # All steps except the final estimator: cleaning + FE + encoding.
    preproc = pipe[:-1]
    X_va_proc = preproc.transform(X_va)

    # Permute the *transformed* columns and score the fitted model alone,
    # so importances are reported per engineered/encoded feature.
    r = permutation_importance(
        pipe.named_steps['model'],
        X_va_proc, y_va,
        n_repeats=5,
        random_state=42,
        scoring='f1_weighted'
    )
    importances.append(
        pd.Series(r.importances_mean, index=X_va_proc.columns, name=f"fold_{fold}")
    )
    print(f" Permutation fold {fold} listo.")

# One row per fold, one column per feature (union of names across folds);
# a feature missing in some fold is NaN there and is skipped by mean().
imp_df = pd.DataFrame(importances)
mean_imp = imp_df.mean().sort_values(ascending=False)

print("\n→ Top 20 Features (permutation importance):")
display(mean_imp.head(20))


Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
 Permutation fold 1 listo.
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
 Permutation fold 2 listo.
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
 Permutation fold 3 listo.
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_S

remainder__num_clean__Num_Solicitudes_Especiales         0.098208
remainder__num_clean__Tiempo_Antelación                  0.095903
target_enc__cat_clean__Segmento_Mercado                  0.039749
remainder__num_clean__Precio_Promedio_Por_Habitación     0.028141
remainder__remainder__Año_Llegada                        0.012418
remainder__Precio_por_Persona_Noche                      0.011205
remainder__num_clean__Num_Solicitudes_Especiales_miss    0.010813
remainder__Precio_Total_Estancia                         0.009896
remainder__remainder__Día_Llegada                        0.008966
target_enc__Antelacion_Bin                               0.008280
remainder__remainder__Mes_Llegada                        0.007879
remainder__Mes_cos                                       0.005556
remainder__remainder__Requiere_Estacionamiento           0.004520
remainder__Mes_sin                                       0.004059
remainder__Duracion_Total                                0.003823
target_enc

## Observaciones

1. **Umbral óptimo:** el mejor F1 OOF (weighted) es 0.8681 con threshold=0.47.
2. **Selección de features:** de las 20 features del ranking anterior, nos quedamos con las 12 cuya importancia por permutación es ≥ 0.005 y eliminamos las 8 que quedan por debajo de ese valor:


In [None]:
# ── 1. poner la raíz en sys.path ──────────────────────────────
import os, sys
ROOT = os.path.abspath(os.path.join("..")) # Ajusta si es necesario
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

import numpy as np
import yaml, pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from src.full_model_pipeline import make_full_model_pipeline # Asumo que toma dict cfg
from src.utils import load_config
import copy

# ── 2. Load the BEST config found by Optuna ────────────────────
BEST_CONFIG_PATH = os.path.join(ROOT, "configs", "best_config_optuna_100.yaml")
cfg_best = load_config(BEST_CONFIG_PATH)
print("Configuración Optimizada cargada.")

# ── 3. Read the training CSV and build X / y ───────────────────
train_path = os.path.join(ROOT, cfg_best["data_raw_path"], cfg_best["train_file"])
df_train_full = pd.read_csv(train_path)

# Encode the target as 0/1; labels absent from `mapping` become NaN and are
# rejected below, listing the offending raw values.
target_col = cfg_best["target_column"]
mapping = {"Cancelada": 1, "No Cancelada": 0}
y_true_full = df_train_full[target_col].map(mapping)
has_unmapped = y_true_full.isna().any()
if has_unmapped:
    raise ValueError(f"Target con NaNs tras mapping: {df_train_full[cfg_best['target_column']][y_true_full.isna()].unique()}")

# Features = every column that is neither an identifier nor the target.
non_feature_cols = cfg_best["id_columns"] + [target_col]
X_full = df_train_full.drop(columns=non_feature_cols)
print(f"Datos preparados: X_full.shape={X_full.shape}, y_true_full.shape={y_true_full.shape}")

# ── 4. Run CV ONCE to collect OOF probabilities ────────────────
print("\n--- Ejecutando CV con la configuración optimizada para obtener OOF Probs ---")

# make_full_model_pipeline is called with the config *dict*, exactly as in
# every other cell of this notebook. The previous version dumped the config
# to a temporary YAML and passed its path instead, which (a) does not match
# how the factory is used elsewhere and (b) left the temp file behind whenever
# the CV loop raised before the cleanup line was reached.

oof_probs = np.zeros(len(X_full))  # one positive-class probability per row

# Fold count and seed come from the config so the folds are reproducible.
n_splits = cfg_best.get("cv_n_splits", 5)
skf = StratifiedKFold(n_splits=n_splits,
                      shuffle=True,
                      random_state=cfg_best.get("random_seed", 42))

print("Iniciando validación cruzada...")
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_full, y_true_full)):
    X_train, X_val = X_full.iloc[tr_idx], X_full.iloc[va_idx]
    y_train, y_val = y_true_full.iloc[tr_idx], y_true_full.iloc[va_idx]

    # Report against the configured number of folds (was hard-coded to 5).
    print(f"  Entrenando Fold {fold+1}/{n_splits}...")

    # Fresh, unfitted pipeline per fold so no fitted state is carried over.
    # TODO: if n_estimators is large, consider early stopping through
    # model__eval_set / model__callbacks fit params.
    pipe = make_full_model_pipeline(cfg_best)
    pipe.fit(X_train, y_train)

    # Keep the probability of the positive class (1) for threshold tuning.
    oof_probs[va_idx] = pipe.predict_proba(X_val)[:, 1]

print("Validación cruzada completada. Calculando umbral óptimo...")

# Threshold optimisation over the complete set of OOF predictions.
# Two optima are tracked independently: weighted F1 and F1 of the positive class.
best_f1_weighted, best_threshold_weighted = 0, 0.5
best_f1_class1, best_threshold_class1 = 0, 0.5
positive_class_value = 1  # "Cancelada" is mapped to 1

for thr in np.arange(0.1, 0.9, 0.01):
    preds_at_thr = (oof_probs >= thr).astype(int)

    f1_w = f1_score(y_true_full, preds_at_thr, average="weighted")
    f1_pos = f1_score(y_true_full, preds_at_thr, pos_label=positive_class_value, average="binary")

    # Strict ">" keeps the first (lowest) threshold when several tie.
    if f1_w > best_f1_weighted:
        best_f1_weighted, best_threshold_weighted = f1_w, thr
    if f1_pos > best_f1_class1:
        best_f1_class1, best_threshold_class1 = f1_pos, thr

# --- Final results ---
print("\n--- Resultados con Configuración Optimizada por Optuna ---")
print(f"Mejor F1 OOF Ponderado (calculado aquí): {best_f1_weighted:.6f}")
print(f"Umbral Óptimo para F1 Ponderado: {best_threshold_weighted:.2f}")
print(f"\nMejor F1 OOF Clase Positiva ('Cancelada'): {best_f1_class1:.6f}")
print(f"Umbral Óptimo para F1 Clase Positiva: {best_threshold_class1:.2f}")

# The weighted-F1 optimum is the threshold carried forward to final predictions.
final_threshold = best_threshold_weighted
print(f"\n==> Umbral final a usar para predicciones: {final_threshold:.2f}")

# 03 — Evaluación Final del Modelo

**Objetivos:**
- Calcular métricas OOF (Out‑Of‑Fold) sobre el conjunto completo de entrenamiento.
- Optimizar el umbral de clasificación para maximizar F1-score ponderado y F1 para la clase “Cancelada”.
- Validar estabilidad del modelo usando Repeated Stratified K‑Fold.
- Extraer e interpretar la importancia de features (LightGBM).

In [7]:
import os, sys
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, make_scorer
import yaml

# Ask scikit-learn transformers to return pandas DataFrames, so transformed
# data keeps column names (later cells read `.columns` from pipeline output).
set_config(transform_output="pandas")

# Put the parent directory on sys.path (once) so the `src` package imports.
ROOT = os.path.abspath(os.path.join(".."))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from src.utils import load_config
from src.full_model_pipeline import make_full_model_pipeline

### Carga de datos y configuración óptima

In [9]:
# Load the tuned configuration.
BEST_CFG = os.path.join(ROOT, "configs", "best_config_optuna_100.yaml")
cfg = load_config(BEST_CFG)

# Read the raw training data from the location given in the config.
raw_train_path = os.path.join(ROOT, cfg['data_raw_path'], cfg['train_file'])
df = pd.read_csv(raw_train_path)

# Split into X and y. The target is encoded as 0/1; any label that is not a
# key of `mapping` turns into NaN and aborts the run.
target_col = cfg["target_column"]
mapping = {"Cancelada": 1, "No Cancelada": 0}

y = df[target_col].map(mapping)
if y.isna().any():
    raise ValueError("Encontrados valores inesperados en el target, revisar el mapeo")

# Features = every column that is neither an identifier nor the target.
X = df.drop(columns=cfg["id_columns"] + [target_col])

print(f"X.shape={X.shape}, y.shape={y.shape}")

X.shape=(31922, 17), y.shape=(31922,)


### Construcción y vista previa del pipeline
Creamos el pipeline completo y visualizamos su estructura.

In [10]:
# Build the full pipeline from the loaded config and print its text
# representation to inspect the step structure.
pipe = make_full_model_pipeline(cfg)
print(pipe)

Pipeline(steps=[('clean',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num_clean',
                                                  Pipeline(steps=[('imputer',
                                                                   NumericImputer(cols=['Num_Adultos',
                                                                                        'Num_Niños',
                                                                                        'Noches_Semana',
                                                                                        'Noches_Fin_Semana',
                                                                                        'Tiempo_Antelación',
                                                                                        'Precio_Promedio_Por_Habitación',
                                                                                        'Num_Solicitudes_Especiales']))]),
 

### OOF Predictions y Optimización de Umbral
1. Obtener probabilidades OOF con `StratifiedKFold(n_splits=5)`.
2. Explorar umbrales entre 0.1 y 0.9 para maximizar F1 ponderado y F1 clase positiva.

In [11]:
n = len(X)

# Stratified folds; fold count and seed are read from the config.
skf = StratifiedKFold(
    n_splits=cfg.get('cv_n_splits', 5),
    shuffle=True,
    random_state=cfg.get('random_seed', 42),
)

# Out-of-fold positive-class probabilities: `pipe` is re-fitted on each
# training split and used to score only the rows it was not fitted on.
oof_probs = np.zeros(n)
for tr, va in skf.split(X, y):
    X_fit, y_fit = X.iloc[tr], y.iloc[tr]
    pipe.fit(X_fit, y_fit)
    oof_probs[va] = pipe.predict_proba(X.iloc[va])[:, 1]

# Threshold optimisation: keep the first threshold reaching the best weighted F1.
best_f1, best_thr = 0, 0.5
for thr in np.arange(0.1, 0.9, 0.01):
    preds = (oof_probs >= thr).astype(int)
    f1w = f1_score(y, preds, average='weighted')
    if f1w > best_f1:
        best_f1, best_thr = f1w, thr

print(f"Mejor F1 ponderado OOF: {best_f1:.4f} @ thr={best_thr:.2f}")

Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Nu

### Validación Repetida para Estabilidad
Usamos `RepeatedStratifiedKFold` para medir la varianza de nuestro F1.

In [12]:
# Repeated stratified CV to estimate the spread of the weighted F1.
# Fold count and seed are read from the config with the same defaults the
# OOF cell uses, instead of being hard-coded, so both validations follow one
# source of truth.
rskf = RepeatedStratifiedKFold(
    n_splits=cfg.get('cv_n_splits', 5),
    n_repeats=2,
    random_state=cfg.get('random_seed', 42),
)
scorer = make_scorer(f1_score, average='weighted')

# NOTE(review): this scorer evaluates pipe.predict(), so the numbers reflect
# the estimator's default decision rule, not the tuned OOF threshold.
scores = cross_val_score(pipe, X, y,
                         cv=rskf, scoring=scorer, n_jobs=-1)
print(f"F1 weighted: {scores.mean():.4f} ± {scores.std():.4f}")

Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Nu

### Importancia de Features (LightGBM)
Entrenar en todo el conjunto y extraer `feature_importances_`.

In [14]:
import pandas as pd

# 1) Fit the whole pipeline on the full training set.
pipe.fit(X, y)

# 2) Split the fitted pipeline into "everything but the last step" and the
#    estimator stored under the 'model' step name.
preprocessor = pipe[:-1]
model = pipe.named_steps['model']

# 3) Run X through the preprocessing steps only. The result is read as a
#    DataFrame (transform_output="pandas"), which provides the feature names.
X_transformed = preprocessor.transform(X)
feature_names = X_transformed.columns

# 4) Label the model's importances with those names and show the top 20.
importances = pd.Series(model.feature_importances_, index=feature_names)
top_importances = importances.sort_values(ascending=False).head(20)
display(top_importances)

Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']


remainder__num_clean__Tiempo_Antelación                  9488
remainder__Precio_Total_Estancia                         7573
remainder__num_clean__Precio_Promedio_Por_Habitación     7200
remainder__Precio_por_Persona_Noche                      6600
remainder__remainder__Día_Llegada                        6575
remainder__remainder__Mes_Llegada                        2730
remainder__Duracion_Total                                2088
remainder__Mes_sin                                       1897
remainder__Mes_cos                                       1873
remainder__num_clean__Num_Solicitudes_Especiales         1669
remainder__Total_Huespedes                                665
onehot__cat_clean__Segmento_Mercado_Offline               565
remainder__remainder__Año_Llegada                         558
onehot__cat_clean__Segmento_Mercado_Online                487
remainder__num_clean__Num_Solicitudes_Especiales_miss     443
onehot__Season_Bucket_Invierno                            395
remainde

## 4. Permutation Importance

Aquí vamos a:
1. Dividir el conjunto completo en un hold-out (por ejemplo 80/20 estratificado).
2. Ajustar el pipeline final (cleaning → FE → encoding → modelo) en el 80%.
3. Calcular `permutation_importance` sobre el 20% de validación usando `scoring='f1_weighted'`.
4. Mostrar las 20 features con mayor importancia promedio.


In [16]:
import os, sys
import numpy as np
import pandas as pd
import yaml
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, make_scorer
from sklearn.inspection import permutation_importance

# --- Asegurar que 'src' está en sys.path ---
ROOT = os.path.abspath(os.path.join(".."))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from src.utils import load_config
from src.full_model_pipeline import make_full_model_pipeline

# 1.1 Load the best configuration.
BEST_CONFIG_PATH = os.path.join(ROOT, "configs", "best_config_optuna_100.yaml")
cfg = load_config(BEST_CONFIG_PATH)

# 1.2 Read the complete training CSV.
train_csv_path = os.path.join(ROOT, cfg["data_raw_path"], cfg["train_file"])
df_train_full = pd.read_csv(train_csv_path)

# 1.3 Encode the target as 0/1; labels missing from `mapping` become NaN
#     and abort the run.
target_col = cfg["target_column"]
mapping = {"Cancelada": 1, "No Cancelada": 0}
y_true_full = df_train_full[target_col].map(mapping)
if y_true_full.isna().any():
    raise ValueError("Hay valores no mapeados en el target")

# 1.4 Build X_full by dropping the identifier columns and the target.
X_full = df_train_full.drop(columns=cfg["id_columns"] + [target_col])

# 1.5 Build the complete pipeline from that configuration.
pipe = make_full_model_pipeline(cfg)


In [17]:
# 2.1 Stratified 80/20 hold-out split.
holdout_parts = train_test_split(
    X_full,
    y_true_full,
    test_size=0.2,
    random_state=42,
    stratify=y_true_full,
)
X_train_hi, X_val_hi, y_train_hi, y_val_hi = holdout_parts

# 2.2 Fit the pipeline on the 80% part.
pipe.fit(X_train_hi, y_train_hi)

# 2.3 Permutation importance of the WHOLE pipeline: the raw input columns
#     are permuted, so the scores are reported per original feature.
weighted_f1_scorer = make_scorer(f1_score, average='weighted')
result = permutation_importance(
    pipe,
    X_val_hi,
    y_val_hi,
    scoring=weighted_f1_scorer,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)

# 2.4 Show the (up to) 20 features with the largest mean importance.
importances = pd.Series(result.importances_mean, index=X_val_hi.columns)
ranked_importances = importances.sort_values(ascending=False)
top20 = ranked_importances.head(20)
display(top20)


Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Nu

Tiempo_Antelación                     0.151652
Num_Solicitudes_Especiales            0.100900
Precio_Promedio_Por_Habitación        0.066883
Segmento_Mercado                      0.049269
Mes_Llegada                           0.045362
Noches_Semana                         0.018356
Num_Adultos                           0.018088
Año_Llegada                           0.014045
Noches_Fin_Semana                     0.013118
Día_Llegada                           0.012530
Num_Niños                             0.004043
Requiere_Estacionamiento              0.003570
Tipo_Plan_Comidas                     0.002817
Tipo_Habitación_Reservada             0.002396
Huésped_Recurrente                    0.001140
Num_Reservas_Previas_No_Canceladas    0.000687
Num_Cancelaciones_Previas            -0.000034
dtype: float64

In [20]:
from sklearn.inspection import permutation_importance
from src.full_model_pipeline import make_full_model_pipeline

from sklearn.model_selection import train_test_split

# 1) Stratified 80/20 hold-out split.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_full,
    y_true_full,
    test_size=0.2,
    random_state=42,
    stratify=y_true_full,
)

# 2) Fit a fresh full pipeline on the training part.
pipe = make_full_model_pipeline(cfg)
pipe.fit(X_tr, y_tr)

# 3) Transform the validation part with every step except the final one
#    (cleaning + FE + encoding).
preproc = pipe[:-1]
X_val_proc = preproc.transform(X_val)

# 4) Permutation importance on the fitted 'model' step alone, permuting the
#    already-transformed columns.
fitted_model = pipe.named_steps['model']
r = permutation_importance(
    fitted_model,
    X_val_proc,
    y_val,
    n_repeats=10,
    random_state=42,
    scoring='f1_weighted',
)

# Label the mean importances with the transformed column names; show top 20.
importances = pd.Series(r.importances_mean, index=X_val_proc.columns)
ranked = importances.sort_values(ascending=False)
display(ranked.head(20))


Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']
Dropped columns after FE: ['num_clean__Num_Adultos', 'num_clean__Num_Niños', 'num_clean__Noches_Semana', 'num_clean__Noches_Fin_Semana']


remainder__num_clean__Tiempo_Antelación                  0.133444
remainder__num_clean__Num_Solicitudes_Especiales         0.101946
remainder__num_clean__Precio_Promedio_Por_Habitación     0.039574
onehot__cat_clean__Segmento_Mercado_Online               0.018995
remainder__Precio_Total_Estancia                         0.014405
remainder__remainder__Año_Llegada                        0.014045
onehot__cat_clean__Segmento_Mercado_Offline              0.012923
remainder__remainder__Día_Llegada                        0.012530
remainder__Precio_por_Persona_Noche                      0.011498
remainder__num_clean__Num_Solicitudes_Especiales_miss    0.010944
remainder__remainder__Mes_Llegada                        0.007755
onehot__Season_Bucket_Invierno                           0.006937
remainder__Mes_cos                                       0.006224
remainder__Total_Huespedes                               0.003953
remainder__Duracion_Total                                0.003933
remainder_