In [24]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, roc_auc_score, classification_report
)

import mlflow
import mlflow.sklearn

import matplotlib.pyplot as plt

# Single seed reused by the train/test split, the MLP estimators and the
# randomized searches further down, so results are repeatable.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100


In [25]:
# End the currently active MLflow run, if there is one.
# NOTE(review): presumably here to clear a run left open by an earlier
# execution of the notebook before new autologged runs are created — confirm.
import mlflow
mlflow.end_run()


In [26]:
# Path to the CSV, relative to the notebook's folder.
data_path = "../data/listings_clean_con_recomendable.csv"

df = pd.read_csv(data_path)

# Print the shape separately and leave the frame preview as the cell's last
# expression: returning the tuple (shape, head) makes Jupyter fall back to the
# plain-text repr of the DataFrame, which wraps and truncates wide frames.
print("Shape df:", df.shape)
df.head()


((20233, 14),
        id   price  price_per_guest  accommodates        room_type  \
 0   35797  3799.0        1899.5000           2.0  Entire home/apt   
 1   56074   585.0         292.5000           2.0  Entire home/apt   
 2   67703  1696.0         424.0000           4.0  Entire home/apt   
 3   70644  1004.0         502.0000           2.0  Entire home/apt   
 4  165772  4071.0         254.4375          16.0  Entire home/apt   
 
   neighbourhood_cleansed  minimum_nights  availability_365  \
 0  Cuajimalpa de Morelos             1.0             364.0   
 1             Cuauhtémoc            15.0             338.0   
 2             Cuauhtémoc             2.0             267.0   
 3               Coyoacán             3.0             211.0   
 4         Miguel Hidalgo             2.0             177.0   
 
    estimated_occupancy_l365d  estimated_revenue_l365d host_is_superhost  \
 0                        0.0                      0.0                 f   
 1                       30.0   

In [27]:
# Drop every row that has at least one missing value (in any column) and work
# on an explicit copy so the column assignments below act on an independent
# frame.
df = df.dropna().copy()
print("Filas después de eliminar NA:", df.shape[0])

# Superhost flag as 0/1 ("t" -> 1, anything else -> 0).
# Encode only while the column still holds the raw text flags: re-running this
# cell once the column is already numeric would compare integers against "t"
# and silently turn every flag into 0.
if not pd.api.types.is_numeric_dtype(df["host_is_superhost"]):
    df["host_is_superhost"] = (df["host_is_superhost"] == "t").astype(int)

# REGRESSION target: log(1 + price); predictions are mapped back with expm1.
df["price_log"] = np.log1p(df["price"])
target_reg = "price_log"

# CLASSIFICATION target: the dataset's own 'recomendable' column, cast to int.
target_clf = "recomendable"
df[target_clf] = df[target_clf].astype(int)

print("Targets definidos: ", target_reg, "y", target_clf)



Filas después de eliminar NA: 16521
Targets definidos:  price_log y recomendable


In [28]:
# ===== Shared feature matrix and targets =====

# Identifier plus every column that is, or directly encodes, a target.
cols_a_excluir = [
    "id",
    "price",        # original regression target
    "price_log",    # transformed regression target
    "recomendable", # classification target
]

# Every remaining column is a candidate feature; original column order is kept.
# NOTE(review): price_per_guest stays in X. In the previewed rows it equals
# price / accommodates, so it must never be fed to the price model.
feature_cols = df.columns[~df.columns.isin(cols_a_excluir)].tolist()

X = df.loc[:, feature_cols].copy()
y_reg, y_clf = df[target_reg].copy(), df[target_clf].copy()

print("Columnas disponibles en X:", feature_cols)
for etiqueta, datos in (("X", X), ("y_reg", y_reg), ("y_clf", y_clf)):
    print(f"Shape {etiqueta}:", datos.shape)



Columnas disponibles en X: ['price_per_guest', 'accommodates', 'room_type', 'neighbourhood_cleansed', 'minimum_nights', 'availability_365', 'estimated_occupancy_l365d', 'estimated_revenue_l365d', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']
Shape X: (16521, 11)
Shape y_reg: (16521,)
Shape y_clf: (16521,)


In [29]:
# ===== Feature sets and preprocessing =====


def _build_preprocessor(numeric_cols, categorical_cols):
    """Return a ColumnTransformer that scales numeric columns and one-hot
    encodes categorical ones.

    Columns listed in neither group are dropped (ColumnTransformer's default
    ``remainder``). ``handle_unknown="ignore"`` makes categories unseen during
    fit encode as all zeros instead of raising.
    """
    return ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ]
    )


# ---- REGRESSION: price predicted from listing/host traits.
features_reg = [
    "accommodates",
    "room_type",
    "neighbourhood_cleansed",
    "minimum_nights",
    "availability_365",
    "host_is_superhost",
    "host_response_rate",
    "host_acceptance_rate",
]
categorical_features_reg = ["room_type", "neighbourhood_cleansed"]
# Every non-categorical feature is treated as numeric (order preserved).
numeric_features_reg = [
    col for col in features_reg if col not in categorical_features_reg
]
preprocessor_reg = _build_preprocessor(numeric_features_reg, categorical_features_reg)

# ---- CLASSIFICATION: recommendation from historical performance
#      (occupancy/revenue) plus some listing/host traits.
# NOTE(review): how `recomendable` was derived is not visible in this notebook;
# if it was built from occupancy/revenue these features leak the label — confirm.
features_clf = [
    "room_type",
    "neighbourhood_cleansed",
    "availability_365",
    "estimated_occupancy_l365d",
    "estimated_revenue_l365d",
    "host_is_superhost",
    "host_response_rate",
    "host_acceptance_rate",
]
categorical_features_clf = ["room_type", "neighbourhood_cleansed"]
numeric_features_clf = [
    col for col in features_clf if col not in categorical_features_clf
]
preprocessor_clf = _build_preprocessor(numeric_features_clf, categorical_features_clf)

print("Features REGRESIÓN:", features_reg)
print("Features CLASIFICACIÓN:", features_clf)


Features REGRESIÓN: ['accommodates', 'room_type', 'neighbourhood_cleansed', 'minimum_nights', 'availability_365', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']
Features CLASIFICACIÓN: ['room_type', 'neighbourhood_cleansed', 'availability_365', 'estimated_occupancy_l365d', 'estimated_revenue_l365d', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']


In [30]:
# One shared 80/20 split so both tasks are evaluated on the same held-out rows.
# Stratifying on the classification label keeps its class balance equal in
# train and test.
split_parts = train_test_split(
    X, y_reg, y_clf,
    test_size=0.2,
    stratify=y_clf,
    random_state=RANDOM_STATE,
)
X_train, X_test = split_parts[0], split_parts[1]
y_reg_train, y_reg_test = split_parts[2], split_parts[3]
y_clf_train, y_clf_test = split_parts[4], split_parts[5]

X_train.shape, X_test.shape


((13216, 11), (3305, 11))

In [31]:
# ===== Baseline REGRESSION model (price_log) =====

# Route the autologged sklearn run to the regression experiment.
mlflow.sklearn.autolog()
mlflow.set_experiment("etapa4b_andes_regresion")

# Two hidden layers (64 -> 32); early stopping holds out part of the training
# data internally and stops when the validation score stops improving.
mlp_reg_base = MLPRegressor(
    hidden_layer_sizes=(64, 32),
    early_stopping=True,
    max_iter=200,
    random_state=RANDOM_STATE,
)
regressor_base = Pipeline(
    steps=[("preprocess", preprocessor_reg), ("model", mlp_reg_base)]
)
regressor_base.fit(X_train, y_reg_train)

# Predictions come out on the log1p scale the model was trained on.
y_reg_pred_log = regressor_base.predict(X_test)

# Undo log1p so the error metrics are expressed in price units.
y_reg_test_price, y_reg_pred_price = np.expm1(y_reg_test), np.expm1(y_reg_pred_log)

mae = mean_absolute_error(y_reg_test_price, y_reg_pred_price)
mse = mean_squared_error(y_reg_test_price, y_reg_pred_price)
rmse = np.sqrt(mse)
# R2 is deliberately computed on the log scale, unlike MAE/RMSE above.
r2 = r2_score(y_reg_test, y_reg_pred_log)

print("===== Resultados REGRESIÓN (modelo base) =====")
for etiqueta, valor in (
    ("MAE (precio):", mae),
    ("RMSE (precio):", rmse),
    ("R2 (sobre log(price)):", r2),
):
    print(etiqueta, valor)


2025/12/02 07:24:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1499c57716ff44558fa24ab96b99f7f0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


===== Resultados REGRESIÓN (modelo base) =====
MAE (precio): 523.288030103396
RMSE (precio): 948.3780021339622
R2 (sobre log(price)): 0.5222143200799635


In [None]:
# === Save the 4B regression model (Andes – MLP with pipeline) ===
import joblib

# Persist the whole fitted pipeline (preprocessing + MLPRegressor).
joblib.dump(regressor_base, "modelo_regresion_andes.pkl")

# Persist the columns the pipeline actually reads. X_train carries every
# candidate feature (including price_per_guest and the occupancy/revenue
# columns), but the ColumnTransformer only selects `features_reg` by name and
# drops the rest. Saving X_train.columns would make consumers supply columns
# the model ignores, one of which is derived from the price being predicted.
joblib.dump(list(features_reg), "columnas_regresion_andes.pkl")

print("Guardados:")
print(" - modelo_regresion_andes.pkl")
print(" - columnas_regresion_andes.pkl")

In [32]:
# ===== Baseline CLASSIFICATION model (recomendable) =====

# Route the autologged sklearn run to the classification experiment.
mlflow.sklearn.autolog()
mlflow.set_experiment("etapa4b_andes_clasificacion")

# Same architecture as the baseline regressor: two hidden layers (64 -> 32).
mlp_clf_base = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    early_stopping=True,
    max_iter=200,
    random_state=RANDOM_STATE,
)
classifier_base = Pipeline(
    steps=[("preprocess", preprocessor_clf), ("model", mlp_clf_base)]
)
classifier_base.fit(X_train, y_clf_train)

# Second predict_proba column = probability of the positive class (1).
y_clf_proba = classifier_base.predict_proba(X_test)[:, 1]
# Hard labels from an explicit 0.5 probability cut-off.
y_clf_pred = (y_clf_proba >= 0.5).astype(int)

acc = accuracy_score(y_clf_test, y_clf_pred)
auc = roc_auc_score(y_clf_test, y_clf_proba)

print("===== Resultados CLASIFICACIÓN (modelo base) =====")
for etiqueta, valor in (("Accuracy:", acc), ("ROC-AUC:", auc)):
    print(etiqueta, valor)
print("\nReporte de clasificación:\n")
print(classification_report(y_clf_test, y_clf_pred))


2025/12/02 07:25:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'db582ab20b90483cad411ab51b8d597a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


===== Resultados CLASIFICACIÓN (modelo base) =====
Accuracy: 0.7319213313161876
ROC-AUC: 0.806878036335966

Reporte de clasificación:

              precision    recall  f1-score   support

           0       0.77      0.64      0.70      1602
           1       0.71      0.81      0.76      1703

    accuracy                           0.73      3305
   macro avg       0.74      0.73      0.73      3305
weighted avg       0.74      0.73      0.73      3305



In [33]:
from sklearn.model_selection import RandomizedSearchCV

# ===== Hyperparameter search - REGRESSION =====

# Discrete candidates; keys use the "<pipeline step>__<parameter>" convention
# so they reach the MLP inside the pipeline.
param_dist_reg = {
    "model__hidden_layer_sizes": [(32, 16), (64, 32), (128, 64), (64, 64, 32)],
    "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "model__learning_rate_init": [1e-4, 5e-4, 1e-3],
    "model__batch_size": [32, 64, 128],
}

# Architecture and regularisation are left at their defaults here because the
# search overrides them per candidate.
mlp_reg_tuning = MLPRegressor(
    early_stopping=True,
    max_iter=200,
    random_state=RANDOM_STATE,
)
regressor_tuning = Pipeline(
    steps=[("preprocess", preprocessor_reg), ("model", mlp_reg_tuning)]
)

mlflow.sklearn.autolog()
mlflow.set_experiment("etapa4b_andes_regresion")

# 10 sampled candidates x 3 folds, ranked by (negated) MSE on the log target.
random_search_reg = RandomizedSearchCV(
    regressor_tuning,
    param_distributions=param_dist_reg,
    n_iter=10,
    cv=3,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE,
)
random_search_reg.fit(X_train, y_reg_train)

# best_estimator_ is the winning pipeline refitted on the full training set.
best_reg_model = random_search_reg.best_estimator_
print("Mejores hiperparámetros (regresión):")
print(random_search_reg.best_params_)

# Evaluate the tuned model on the held-out test set.
y_reg_pred_log_best = best_reg_model.predict(X_test)
# Back to price units for MAE/RMSE.
y_reg_pred_price_best, y_reg_test_price = (
    np.expm1(y_reg_pred_log_best),
    np.expm1(y_reg_test),
)

mae_best = mean_absolute_error(y_reg_test_price, y_reg_pred_price_best)
mse_best = mean_squared_error(y_reg_test_price, y_reg_pred_price_best)
rmse_best = np.sqrt(mse_best)
# R2 stays on the log scale, matching the baseline cell.
r2_best = r2_score(y_reg_test, y_reg_pred_log_best)

print("\n===== Resultados REGRESIÓN (modelo ajustado) =====")
for etiqueta, valor in (
    ("MAE (precio):", mae_best),
    ("RMSE (precio):", rmse_best),
    ("R2 (sobre log(price)):", r2_best),
):
    print(etiqueta, valor)


2025/12/02 07:25:17 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'edef663f49d4422b89427a47bbae4c29', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 3 folds for each of 10 candidates, totalling 30 fits


2025/12/02 07:26:59 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


Mejores hiperparámetros (regresión):
{'model__learning_rate_init': 0.0001, 'model__hidden_layer_sizes': (64, 64, 32), 'model__batch_size': 32, 'model__alpha': 0.01}

===== Resultados REGRESIÓN (modelo ajustado) =====
MAE (precio): 525.8263173478442
RMSE (precio): 989.5974740288053
R2 (sobre log(price)): 0.5234282815392212


In [None]:
# === Save the 4B classification model (Andes – MLP with pipeline) ===
import joblib

# Persist the whole fitted pipeline (preprocessing + MLPClassifier).
joblib.dump(classifier_base, "modelo_clasificacion_andes.pkl")

# Persist the columns the pipeline actually reads. X_train carries every
# candidate feature, but the ColumnTransformer only selects `features_clf` by
# name and drops the rest (e.g. price_per_guest, accommodates, minimum_nights).
# Saving X_train.columns would make consumers supply inputs the model never
# uses.
joblib.dump(list(features_clf), "columnas_clasificacion_andes.pkl")

print("Guardados:")
print(" - modelo_clasificacion_andes.pkl")
print(" - columnas_clasificacion_andes.pkl")


In [34]:
# End the currently active MLflow run, if there is one, before the next
# experiment's search is launched.
# NOTE(review): presumably guards against a run left open by the previous
# search — confirm it is still needed with autologging.
import mlflow
mlflow.end_run()


In [35]:
from sklearn.model_selection import RandomizedSearchCV

# ===== Hyperparameter search - CLASSIFICATION =====

# Discrete candidates; keys use the "<pipeline step>__<parameter>" convention
# so they reach the MLP inside the pipeline.
param_dist_clf = {
    "model__hidden_layer_sizes": [(32, 16), (64, 32), (128, 64), (64, 64, 32)],
    "model__alpha": [1e-5, 1e-4, 1e-3, 1e-2],
    "model__learning_rate_init": [1e-4, 5e-4, 1e-3],
    "model__batch_size": [32, 64, 128],
}

# Architecture and regularisation are left at their defaults here because the
# search overrides them per candidate.
mlp_clf_tuning = MLPClassifier(
    early_stopping=True,
    max_iter=200,
    random_state=RANDOM_STATE,
)
clf_tuning = Pipeline(
    steps=[("preprocess", preprocessor_clf), ("model", mlp_clf_tuning)]
)

mlflow.sklearn.autolog()
mlflow.set_experiment("etapa4b_andes_clasificacion")

# 10 sampled candidates x 3 folds, ranked by ROC-AUC.
random_search_clf = RandomizedSearchCV(
    clf_tuning,
    param_distributions=param_dist_clf,
    n_iter=10,
    cv=3,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2,
    random_state=RANDOM_STATE,
)
random_search_clf.fit(X_train, y_clf_train)

# best_estimator_ is the winning pipeline refitted on the full training set.
best_clf_model = random_search_clf.best_estimator_
print("Mejores hiperparámetros (clasificación):")
print(random_search_clf.best_params_)

# Positive-class probabilities, then hard labels at the 0.5 cut-off.
y_clf_proba_best = best_clf_model.predict_proba(X_test)[:, 1]
y_clf_pred_best = (y_clf_proba_best >= 0.5).astype(int)

acc_best = accuracy_score(y_clf_test, y_clf_pred_best)
auc_best = roc_auc_score(y_clf_test, y_clf_proba_best)

print("\n===== Resultados CLASIFICACIÓN (modelo ajustado) =====")
for etiqueta, valor in (("Accuracy:", acc_best), ("ROC-AUC:", auc_best)):
    print(etiqueta, valor)
print("\nReporte de clasificación (modelo ajustado):\n")
print(classification_report(y_clf_test, y_clf_pred_best))


2025/12/02 07:27:00 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '96c0f3d5af314143852a3379da113a2a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 3 folds for each of 10 candidates, totalling 30 fits


2025/12/02 07:28:03 INFO mlflow.sklearn.utils: Logging the 5 best runs, 5 runs will be omitted.


Mejores hiperparámetros (clasificación):
{'model__learning_rate_init': 0.0005, 'model__hidden_layer_sizes': (64, 64, 32), 'model__batch_size': 32, 'model__alpha': 0.001}

===== Resultados CLASIFICACIÓN (modelo ajustado) =====
Accuracy: 0.7252647503782148
ROC-AUC: 0.8082012501988487

Reporte de clasificación (modelo ajustado):

              precision    recall  f1-score   support

           0       0.72      0.70      0.71      1602
           1       0.73      0.75      0.74      1703

    accuracy                           0.73      3305
   macro avg       0.73      0.72      0.72      3305
weighted avg       0.73      0.73      0.73      3305



In [36]:
# Summary of the key Stage 4b metrics: one row per model, with the metrics
# that do not apply to a task left empty.

columnas_metricas = ("MAE", "RMSE", "R2_log_price", "Accuracy", "ROC_AUC")

# (model label, task type, metric values in `columnas_metricas` order)
filas_resumen = (
    ("Regresión - base", "regresión", (mae, rmse, r2, None, None)),
    ("Regresión - ajustado", "regresión", (mae_best, rmse_best, r2_best, None, None)),
    ("Clasificación - base", "clasificación", (None, None, None, acc, auc)),
    ("Clasificación - ajustado", "clasificación", (None, None, None, acc_best, auc_best)),
)

resumen_resultados = [
    {"modelo": modelo, "tipo": tipo, **dict(zip(columnas_metricas, valores))}
    for modelo, tipo, valores in filas_resumen
]

resumen_df = pd.DataFrame(resumen_resultados)
resumen_df


Unnamed: 0,modelo,tipo,MAE,RMSE,R2_log_price,Accuracy,ROC_AUC
0,Regresión - base,regresión,523.28803,948.378002,0.522214,,
1,Regresión - ajustado,regresión,525.826317,989.597474,0.523428,,
2,Clasificación - base,clasificación,,,,0.731921,0.806878
3,Clasificación - ajustado,clasificación,,,,0.725265,0.808201
