In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, roc_auc_score, classification_report
)

import mlflow
import mlflow.sklearn

import matplotlib.pyplot as plt

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

pd.set_option("display.max_columns", 100)


In [2]:
# Ruta al CSV desde la carpeta del notebook
data_path = "../data/listings_clean_core_eda.csv"

df = pd.read_csv(data_path)

df.shape, df.head()


((20234, 13),
        id   price  price_per_guest  accommodates        room_type  \
 0   35797  3799.0        1899.5000           2.0  Entire home/apt   
 1   56074   585.0         292.5000           2.0  Entire home/apt   
 2   67703  1696.0         424.0000           4.0  Entire home/apt   
 3   70644  1004.0         502.0000           2.0  Entire home/apt   
 4  165772  4071.0         254.4375          16.0  Entire home/apt   
 
   neighbourhood_cleansed  minimum_nights  availability_365  \
 0  Cuajimalpa de Morelos             1.0             364.0   
 1             Cuauhtémoc            15.0             338.0   
 2             Cuauhtémoc             2.0             267.0   
 3               Coyoacán             3.0             211.0   
 4         Miguel Hidalgo             2.0             177.0   
 
    estimated_occupancy_l365d  estimated_revenue_l365d host_is_superhost  \
 0                        0.0                      0.0                 f   
 1                       30.0   

In [3]:
# Limpieza básica: quitar filas con NA
df = df.dropna().copy()
print("Filas después de eliminar NA:", df.shape[0])

# Convertir host_is_superhost a 0/1 (False/True)
df["host_is_superhost"] = (df["host_is_superhost"] == "t").astype(int)

# --------
# Target de REGRESIÓN: precio en log
# --------
df["price_log"] = np.log1p(df["price"])  # log(1 + price) para estabilizar

# --------
# Target de CLASIFICACIÓN: alta ocupación
#     alta_ocupación = ocupación > mediana del histórico
# --------
umbral_ocupacion = df["estimated_occupancy_l365d"].median()
df["high_occupancy"] = (df["estimated_occupancy_l365d"] > umbral_ocupacion).astype(int)

print("Umbral de alta ocupación:", umbral_ocupacion)

target_reg = "price_log"
target_clf = "high_occupancy"


Filas después de eliminar NA: 16521
Umbral de alta ocupación: 90.0


In [4]:
# Columnas que NO serán features (las quitamos de X)
cols_a_excluir = [
    "id",
    "price",
    "price_log",
    "estimated_occupancy_l365d",
    "high_occupancy",
]

feature_cols = [c for c in df.columns if c not in cols_a_excluir]

X = df[feature_cols].copy()
y_reg = df[target_reg].copy()
y_clf = df[target_clf].copy()

print("Features:", feature_cols)
print("Tamaño de X:", X.shape)
print("Tamaño de y_reg:", y_reg.shape)
print("Tamaño de y_clf:", y_clf.shape)


Features: ['price_per_guest', 'accommodates', 'room_type', 'neighbourhood_cleansed', 'minimum_nights', 'availability_365', 'estimated_revenue_l365d', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']
Tamaño de X: (16521, 10)
Tamaño de y_reg: (16521,)
Tamaño de y_clf: (16521,)


In [5]:
# Identificar columnas numéricas y categóricas
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object", "category"]).columns.tolist()

print("Columnas numéricas:", numeric_features)
print("Columnas categóricas:", categorical_features)

# Transformador de columnas: escalar numéricas y one-hot a categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


Columnas numéricas: ['price_per_guest', 'accommodates', 'minimum_nights', 'availability_365', 'estimated_revenue_l365d', 'host_is_superhost', 'host_response_rate', 'host_acceptance_rate']
Columnas categóricas: ['room_type', 'neighbourhood_cleansed']


In [6]:
X_train, X_test, y_reg_train, y_reg_test, y_clf_train, y_clf_test = train_test_split(
    X,
    y_reg,
    y_clf,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y_clf  # estratificar por la variable de clasificación
)

X_train.shape, X_test.shape


((13216, 10), (3305, 10))

In [8]:
# ===== Modelo base de REGRESIÓN (price_log) =====

regressor_base = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", MLPRegressor(
        hidden_layer_sizes=(64, 32),  # 2 capas ocultas
        max_iter=200,
        random_state=RANDOM_STATE,
        early_stopping=True
    ))
])

# Registrar en MLflow
mlflow.sklearn.autolog()
mlflow.set_experiment("etapa4b_andes_regresion")

with mlflow.start_run(run_name="mlp_regresion_base"):
    regressor_base.fit(X_train, y_reg_train)

    # Predicción en escala log
    y_reg_pred_log = regressor_base.predict(X_test)

    # Volver a precio original
    y_reg_test_price = np.expm1(y_reg_test)
    y_reg_pred_price = np.expm1(y_reg_pred_log)

    mae = mean_absolute_error(y_reg_test_price, y_reg_pred_price)

    mse = mean_squared_error(y_reg_test_price, y_reg_pred_price)
    rmse = np.sqrt(mse)

    r2 = r2_score(y_reg_test, y_reg_pred_log)

    mlflow.log_metric("mae_test_precio", mae)
    mlflow.log_metric("rmse_test_precio", rmse)
    mlflow.log_metric("r2_test_logprice", r2)

print("===== Resultados REGRESIÓN (modelo base) =====")
print("MAE (precio):", mae)
print("RMSE (precio):", rmse)
print("R2 (sobre log(price)):", r2)




===== Resultados REGRESIÓN (modelo base) =====
MAE (precio): 40.20372850865186
RMSE (precio): 188.52443674727164
R2 (sobre log(price)): 0.9963083732137799
