# PASO 3 (simple y claro)

Objetivo: entrenar un primer modelo base, medir en validación y (opcional) chequear test. Guardar métricas y el modelo.

In [12]:
# Celda 1 — Imports y paths
import os, json, joblib
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Column configuration: target, station identifier and timestamp columns.
Y_COL  = "num_bikes_available"
ID_COL = "station_id"
TS_COL = "ts_local"

# Input splits (the test split is optional).
TRAIN = "../data/splits/train.parquet"
VAL   = "../data/splits/val.parquet"
TEST  = "../data/splits/test.parquet"

# Output locations. Note: paths go up one level, for the case where the
# notebook lives in notebooks/.
OUT_DIR = "../models/03D"
os.makedirs("../reports", exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

In [13]:
# Celda 2 — Utilidades: expandir JSON-like y construir X/y

def _looks_like_json_dict(s: str) -> bool:
    s = str(s).strip()
    return s.startswith("{") and s.endswith("}")

def expand_json_like_columns(df: pd.DataFrame, exclude: list[str]) -> pd.DataFrame:
    """Expand object columns that hold JSON-like dicts into numeric columns.

    A column is treated as JSON-like when at least 60% of its first 50
    non-null values, rendered as strings, start with "{" and end with "}".
    Each such column is replaced by one column per dict key, named
    ``<col>_<key>``. Values are coerced with ``pd.to_numeric(errors="coerce")``
    and missing / non-numeric entries become 0.0.

    Args:
        df: Input frame. It is not modified; the work happens on a copy.
        exclude: Column names that must never be inspected or expanded.

    Returns:
        A new DataFrame with the detected columns expanded. A detected column
        whose rows yield no keys at all is left in place unchanged.
    """
    def _parse(x):
        # Best effort: a value that is not (or does not decode to) a dict
        # contributes no keys instead of aborting the whole expansion.
        try:
            d = json.loads(x) if isinstance(x, str) else x
        except Exception:
            return {}
        return d if isinstance(d, dict) else {}

    df2 = df.copy()
    obj_cols = [c for c in df2.columns if c not in exclude and df2[c].dtype == "object"]

    for col in obj_cols:
        sample = df2[col].dropna().astype(str).head(50)
        if len(sample) == 0:
            continue
        if sample.map(_looks_like_json_dict).mean() < 0.6:
            continue

        # Build the expanded frame in a single constructor call from the list
        # of dicts. The previous `.apply(pd.Series)` created one Series per
        # row, which is extremely slow on large splits.
        exp = pd.DataFrame(df2[col].apply(_parse).tolist(), index=df2.index)
        if exp.shape[1] == 0:
            continue

        exp = exp.add_prefix(f"{col}_")
        for c in exp.columns:
            exp[c] = pd.to_numeric(exp[c], errors="coerce").fillna(0.0)
        df2 = pd.concat([df2.drop(columns=[col]), exp], axis=1)
    return df2

def make_X_y(df: pd.DataFrame, y_col: str, id_col: str, ts_col: str):
    """Build the numeric feature matrix X and the float target y from one split.

    Steps, in order:
      1. drop the target, id and timestamp columns (those that are present);
      2. expand JSON-like object columns into numeric columns;
      3. one-hot encode object columns with at most 20 distinct values
         (first level dropped);
      4. keep only numeric columns and fill missing values with 0.0.

    The encoding is derived from `df` alone, so two different splits may
    produce different column sets; callers that feed several splits to the
    same model should align them to the training columns.

    Args:
        df: Raw split.
        y_col: Name of the target column (must exist in `df`).
        id_col: Name of the identifier column, excluded from the features.
        ts_col: Name of the timestamp column, excluded from the features.

    Returns:
        Tuple ``(X, y)``: numeric feature DataFrame and float target Series.

    Raises:
        KeyError: if `y_col` is not a column of `df`.
    """
    # Remove the columns that are not features.
    drop_cols = [c for c in (y_col, id_col, ts_col) if c in df.columns]
    X = df.drop(columns=drop_cols)

    # Expand JSON-like columns.
    X = expand_json_like_columns(X, exclude=[])

    # One-hot encode low-cardinality (<= 20 categories) object columns.
    low_card = [c for c in X.columns if X[c].dtype == "object" and X[c].nunique(dropna=True) <= 20]
    if low_card:
        # dtype=float is required: since pandas 2.0 get_dummies emits bool
        # columns by default, and the numeric-only filter below would then
        # silently discard every one-hot feature.
        X = pd.get_dummies(X, columns=low_card, drop_first=True, dtype=float)

    # Keep numeric columns only; missing values become 0.0.
    X = X.select_dtypes(include=["number"]).fillna(0.0)

    y = df[y_col].astype(float)
    return X, y

In [14]:
# Cell 3 — Load data and build X/y
df_tr = pd.read_parquet(TRAIN)
df_va = pd.read_parquet(VAL)

Xtr, ytr = make_X_y(df_tr, Y_COL, ID_COL, TS_COL)
Xva, yva = make_X_y(df_va, Y_COL, ID_COL, TS_COL)

# Each split is encoded on its own, so one-hot / JSON-expanded columns can
# differ (in set or order) between train and validation. Align validation to
# the training layout: columns unseen in training are dropped and columns
# missing from validation are added as 0.0.
Xva = Xva.reindex(columns=Xtr.columns, fill_value=0.0)

print("Shapes →", Xtr.shape, Xva.shape)
print("Ejemplo de columnas:", list(Xtr.columns)[:10])

Shapes → (152463, 22) (32774, 22)
Ejemplo de columnas: ['num_bikes_disabled', 'num_docks_available', 'num_docks_disabled', 'last_reported', 'is_installed', 'is_renting', 'is_returning', '_file_last_updated', 'lat', 'lon']


In [15]:
# Cell 4 — Quick baseline (naive lag-1 prediction, when the feature exists)
baseline_rmse = baseline_mae = baseline_r2 = None
if "y_lag1" in Xva.columns:
    # Missing lag values were already replaced by 0.0 when the feature matrix
    # was built, so they count as a prediction of 0 here.
    yhat_base = Xva["y_lag1"].values
    # RMSE is computed as sqrt(MSE): the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    baseline_rmse = float(np.sqrt(mean_squared_error(yva, yhat_base)))
    baseline_mae  = mean_absolute_error(yva, yhat_base)
    baseline_r2   = r2_score(yva, yhat_base)
    print(f"Baseline lag1 → RMSE={baseline_rmse:.4f} | MAE={baseline_mae:.4f} | R2={baseline_r2:.4f}")
else:
    print("No se encontró 'y_lag1' en features de validación: salto baseline.")

Baseline lag1 → RMSE=0.7243 | MAE=0.2318 | R2=0.9790




In [16]:
# Cell 5 — Train the base model (random forest)
SEED = 42
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=SEED,  # fixed seed so the forest is reproducible
    n_jobs=-1           # use all available cores
)
rf.fit(Xtr, ytr)

pred_val = rf.predict(Xva)
# RMSE is computed as sqrt(MSE): the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6.
rmse = float(np.sqrt(mean_squared_error(yva, pred_val)))
mae  = mean_absolute_error(yva, pred_val)
r2   = r2_score(yva, pred_val)

# NOTE(review): a validation R2 of ~1.0 is suspicious. The feature matrix keeps
# same-timestamp columns such as num_docks_available / num_bikes_disabled, from
# which the target may be directly derivable — confirm there is no target
# leakage before trusting these metrics.
print(f"VAL → RMSE={rmse:.4f} | MAE={mae:.4f} | R2={r2:.4f}")
if baseline_rmse is not None:
    print(f"(Comparado baseline lag1 RMSE={baseline_rmse:.4f})")

VAL → RMSE=0.0267 | MAE=0.0004 | R2=1.0000
(Comparado baseline lag1 RMSE=0.7243)




In [17]:
# Cell 6 — Collect metrics per split and (optionally) evaluate on TEST
rows = [{"split":"val","rmse":rmse,"mae":mae,"r2":r2}]
if baseline_rmse is not None:
    rows.append({"split":"baseline_lag1","rmse":baseline_rmse,"mae":baseline_mae,"r2":baseline_r2})

if os.path.exists(TEST):
    df_te = pd.read_parquet(TEST)
    Xt, yt = make_X_y(df_te, Y_COL, ID_COL, TS_COL)
    # The test split is encoded independently of train; align it to the
    # columns the model was fitted on (unseen columns dropped, missing ones
    # added as 0.0) so predict() receives the expected layout.
    Xt = Xt.reindex(columns=Xtr.columns, fill_value=0.0)
    pred_t = rf.predict(Xt)
    # RMSE is computed as sqrt(MSE): the `squared=False` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse_t = float(np.sqrt(mean_squared_error(yt, pred_t)))
    mae_t  = mean_absolute_error(yt, pred_t)
    r2_t   = r2_score(yt, pred_t)
    rows.append({"split":"test","rmse":rmse_t,"mae":mae_t,"r2":r2_t})
    print(f"TEST → RMSE={rmse_t:.4f} | MAE={mae_t:.4f} | R2={r2_t:.4f}")

TEST → RMSE=0.3729 | MAE=0.0197 | R2=0.9946




In [18]:
# Cell 7 — Persist metrics, the fitted model and the feature importances

# Metrics table: one row per evaluated split.
rep = pd.DataFrame(rows)
rep.to_csv("../reports/metrics_by_split.csv", index=False)
print(rep)

# Serialized model.
model_path = os.path.join(OUT_DIR, "best_model.pkl")
joblib.dump(rf, model_path)
print(f"Modelo guardado en: {model_path}")

# Feature importances, sorted from most to least important (only when the
# estimator exposes them).
fi_path = "../reports/feature_importance_rf.csv"
if not hasattr(rf, "feature_importances_"):
    print("El modelo no expone feature_importances_.")
else:
    fi = pd.Series(rf.feature_importances_, index=Xtr.columns)
    fi = fi.sort_values(ascending=False)
    fi.to_csv(fi_path, header=["importance"])
    print(f"Feature importance guardada en: {fi_path}")

           split      rmse       mae        r2
0            val  0.026665  0.000376  0.999972
1  baseline_lag1  0.724330  0.231800  0.979021
2           test  0.372856  0.019723  0.994630
Modelo guardado en: ../models/03D/best_model.pkl
Feature importance guardada en: ../reports/feature_importance_rf.csv
