In [1]:
# Celda 1 — Imports, paths y config
import os, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pycaret.regression import setup, compare_models, finalize_model, save_model, predict_model
import mlflow

# Paths
TRAIN = "../data/splits/train.parquet"
VAL   = "../data/splits/val.parquet"
TEST  = "../data/splits/test.parquet"

EXPORT_DIR  = "../models/03A_pycaret"
REPORTS_DIR = "../reports"
os.makedirs(EXPORT_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Columnas clave
Y_COL  = "num_bikes_available"
ID_COL = "station_id"
TS_COL = "ts_local"

# MLflow local
MLFLOW_TRACKING_URI = "mlruns"
EXPERIMENT_NAME     = "ecobici_pycaret_automl"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2025/10/24 21:54:10 INFO mlflow.tracking.fluent: Experiment with name 'ecobici_pycaret_automl' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/ri1965/Desktop/ecobici-automl/notebooks/mlruns/512746952719777776', creation_time=1761353650328, experiment_id='512746952719777776', last_update_time=1761353650328, lifecycle_stage='active', name='ecobici_pycaret_automl', tags={}>

In [2]:
# Celda 2 — Cargar splits (sin más preprocesos: ya vienen listos del Paso 1)
df_tr = pd.read_parquet(TRAIN)
df_va = pd.read_parquet(VAL)
df_te = pd.read_parquet(TEST) if os.path.exists(TEST) else pd.DataFrame()

print("train:", df_tr.shape, "val:", df_va.shape, "test:", df_te.shape if not df_te.empty else "(no)")
df_tr[[ID_COL, TS_COL, Y_COL]].head()

train: (152463, 30) val: (32774, 30) test: (32712, 30)


Unnamed: 0,station_id,ts_local,num_bikes_available
0,2,2025-10-03 08:54:39-03:00,7
1,2,2025-10-03 08:57:40-03:00,7
2,2,2025-10-03 09:00:41-03:00,7
3,2,2025-10-03 09:03:42-03:00,7
4,2,2025-10-03 09:06:42-03:00,5


In [6]:
# Celda 3 — Configurar experimento en PyCaret (forzando solo features numéricas)

# 1) orden temporal y tipado del target
df_tr = df_tr.sort_values(TS_COL).reset_index(drop=True).copy()
df_tr[Y_COL] = pd.to_numeric(df_tr[Y_COL], errors="coerce")

# 2) quedarnos SOLO con columnas numéricas (excluyendo ID y TS)
num_feats = (
    df_tr.select_dtypes(include=["number"])
         .columns.drop([c for c in [Y_COL, ID_COL] if c in df_tr.columns], errors="ignore")
)
data_for_pycaret = df_tr[[Y_COL]].join(df_tr[num_feats]).copy()

print(f"Features numéricas usadas: {len(num_feats)}")
# 3) setup sin shuffle, validación temporal
s = setup(
    data=data_for_pycaret,
    target=Y_COL,
    session_id=42,
    fold_strategy="timeseries",
    fold=3,
    data_split_shuffle=False,
    fold_shuffle=False,
    # ya pasamos solo numéricas → no necesitamos ignorar más columnas
    ignore_features=None,
    normalize=False,
    log_experiment=True,
    experiment_name=EXPERIMENT_NAME,
    html=False,
    verbose=False
)
print("Setup OK.")

Features numéricas usadas: 20
Setup OK.


In [8]:
# Celda 4 — Comparar y elegir mejor modelo (AutoML)
best = compare_models(sort="RMSE")   # cambia la métrica si querés
best_final = finalize_model(best)    # refit final sobre todo el train

save_path = os.path.join(EXPORT_DIR, "pycaret_best_model")
_ = save_model(best_final, save_path)  # PyCaret agrega sufijo
print("Modelo guardado en:", save_path)

                                                                                

                                    Model     MAE      MSE    RMSE      R2  \
lr                      Linear Regression  0.0119   0.0122  0.0929  0.9995   
ridge                    Ridge Regression  0.0118   0.0122  0.0929  0.9995   
br                         Bayesian Ridge  0.0119   0.0122  0.0929  0.9995   
xgboost         Extreme Gradient Boosting  0.1580   0.1421  0.3757  0.9940   
lightgbm  Light Gradient Boosting Machine  0.1914   0.2186  0.4672  0.9908   
et                  Extra Trees Regressor  0.2289   0.2318  0.4813  0.9903   
rf                Random Forest Regressor  0.2625   0.2627  0.5124  0.9890   
gbr           Gradient Boosting Regressor  0.2318   0.3369  0.5783  0.9859   
omp           Orthogonal Matching Pursuit  0.2341   0.3610  0.5986  0.9849   
lar                Least Angle Regression  0.4155   0.9977  0.6099  0.9595   
llar         Lasso Least Angle Regression  0.3809   0.4157  0.6418  0.9826   
lasso                    Lasso Regression  0.3809   0.4157  0.64

In [9]:
# Celda 5 — Evaluación externa en VALIDATION
pred_val = predict_model(best_final, data=df_va.copy())
y_true = df_va[Y_COL].astype(float).values
y_hat  = pred_val["prediction_label"].astype(float).values

rmse = mean_squared_error(y_true, y_hat, squared=False)
mae  = mean_absolute_error(y_true, y_hat)
r2   = r2_score(y_true, y_hat)

print(f"VALIDACIÓN → RMSE={rmse:.4f} | MAE={mae:.4f} | R2={r2:.4f}")

# Guardar/actualizar tabla de métricas
rep_path = os.path.join(REPORTS_DIR, "automl_metrics_by_split.csv")
row = pd.DataFrame([{"framework":"pycaret", "split":"val", "rmse":rmse, "mae":mae, "r2":r2}])
if os.path.exists(rep_path):
    prev = pd.read_csv(rep_path)
    row  = pd.concat([prev, row], ignore_index=True)
row.to_csv(rep_path, index=False)
row.tail(3)

               Model     MAE     MSE    RMSE      R2   RMSLE    MAPE
0  Linear Regression  0.1899  0.0375  0.1936  0.9985  0.0701  0.0583
VALIDACIÓN → RMSE=0.1936 | MAE=0.1899 | R2=0.9985


Unnamed: 0,framework,split,rmse,mae,r2
0,pycaret,val,0.193582,0.189921,0.998502


In [10]:
# Celda 6 — (Opcional) Evaluación externa en TEST
if not df_te.empty:
    pred_test = predict_model(best_final, data=df_te.copy())
    yt = df_te[Y_COL].astype(float).values
    yh = pred_test["prediction_label"].astype(float).values

    rmse_t = mean_squared_error(yt, yh, squared=False)
    mae_t  = mean_absolute_error(yt, yh)
    r2_t   = r2_score(yt, yh)

    print(f"TEST → RMSE={rmse_t:.4f} | MAE={mae_t:.4f} | R2={r2_t:.4f}")

    rep = pd.read_csv(rep_path) if os.path.exists(rep_path) else pd.DataFrame()
    rep = pd.concat([rep, pd.DataFrame([{"framework":"pycaret","split":"test","rmse":rmse_t,"mae":mae_t,"r2":r2_t}])], ignore_index=True)
    rep.to_csv(rep_path, index=False)
    rep.tail(5)
else:
    print("No hay TEST → omitido.")

               Model     MAE     MSE    RMSE      R2   RMSLE    MAPE
0  Linear Regression  0.1895  0.0367  0.1916  0.9986  0.0742  0.0622
TEST → RMSE=0.1916 | MAE=0.1895 | R2=0.9986


### Celda 7 — Nota de integración con tu pipeline
Salida clave:
- Modelo PyCaret guardado en: models/03A_pycaret/pycaret_best_model*.pkl
- Métricas en: reports/automl_metrics_by_split.csv
- Runs en MLflow (usa: make mlflow-ui)

Próximo:
- (Etapa 7) Comparar contra baseline y FLAML (si corresponde), y registrar en MLflow Model Registry.
- (Etapa 8) Integrar el mejor modelo en src/predict_batch.py para el dashboard.


In [14]:
# Celda 4c — Resumen PyCaret AutoML (muestra el estimador interno, métricas y tiempo)

import os, re, datetime as dt
import pandas as pd
from pprint import pprint
from pycaret.regression import pull, get_metrics

summary = {}

# --- 1) Modelo ganador: mostrar el ESTIMADOR interno (no el Pipeline)
try:
    winner_step = best_final.steps[-1][1]          # último step del pipeline
    summary["best_model_class"] = type(winner_step).__name__
except Exception:
    summary["best_model_class"] = type(best_final).__name__

# --- 2) Tabla de comparación (si la guardaste justo tras compare_models)
#     Sugerencia en Celda 4 del notebook: después de compare_models(...) hacé:
#         compare_results = pull().copy()
compare_results = globals().get("compare_results", None)

# fallback: intentar que el último pull sea la tabla de comparación
if compare_results is None:
    try:
        df_last = pull()
        if isinstance(df_last, pd.DataFrame) and {"Model", "RMSE"}.issubset(df_last.columns):
            compare_results = df_last.copy()
    except Exception:
        compare_results = None

# --- 3) Métricas agregadas de la comparación (si existen)
if isinstance(compare_results, pd.DataFrame) and not compare_results.empty:
    summary["models_evaluated"] = int(compare_results["Model"].nunique())
    summary["best_RMSE"] = float(compare_results["RMSE"].min()) if "RMSE" in compare_results.columns else None
    summary["best_MAE"]  = float(compare_results["MAE"].min())  if "MAE"  in compare_results.columns else None
    summary["best_R2"]   = float(compare_results["R2"].max())   if "R2"   in compare_results.columns else None
else:
    summary.update({"models_evaluated": None, "best_RMSE": None, "best_MAE": None, "best_R2": None})

# --- 4) (Opcional) tiempo total estimado leyendo un log plano si existiera
#     Si no generaste un log local, esto quedará en None (MLflow ya tiene tiempos exactos en la UI).
log_path = os.path.join(EXPORT_DIR, f"{EXPERIMENT_NAME}.log") if "EXPORT_DIR" in globals() else None
ts_pat = re.compile(r"(?P<mm>\d{2})-(?P<dd>\d{2}) (?P<hh>\d{2}):(?P<mi>\d{2}):(?P<ss>\d{2})")
year = dt.datetime.now().year
t0 = t1 = None

if log_path and os.path.exists(log_path):
    with open(log_path, "r") as f:
        lines = f.readlines()
    for ln in lines:
        m = ts_pat.search(ln)
        if m:
            t0 = dt.datetime(year, int(m["mm"]), int(m["dd"]), int(m["hh"]), int(m["mi"]), int(m["ss"]))
            break
    for ln in reversed(lines):
        m = ts_pat.search(ln)
        if m:
            t1 = dt.datetime(year, int(m["mm"]), int(m["dd"]), int(m["hh"]), int(m["mi"]), int(m["ss"]))
            break

summary["wall_time_sec"] = (t1 - t0).total_seconds() if (t0 and t1) else None

print("🧾 Resumen PyCaret AutoML")
pprint(summary)

# (Opcional) mostrar también en DataFrame
pd.DataFrame([summary])

🧾 Resumen PyCaret AutoML
{'best_MAE': 0.1895,
 'best_R2': 0.9986,
 'best_RMSE': 0.1916,
 'best_model_class': 'LinearRegression',
 'models_evaluated': 1,
 'wall_time_sec': None}


Unnamed: 0,best_model_class,models_evaluated,best_RMSE,best_MAE,best_R2,wall_time_sec
0,LinearRegression,1,0.1916,0.1895,0.9986,
