In [1]:
# Celda 1 — Imports, paths y config
import os, warnings, joblib
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from flaml import AutoML
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow

# Paths
TRAIN = "../data/splits/train.parquet"
VAL   = "../data/splits/val.parquet"
TEST  = "../data/splits/test.parquet"  # opcional

EXPORT_DIR  = "../models/06_flaml"
REPORTS_DIR = "../reports"
os.makedirs(EXPORT_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)

# Columnas clave
Y_COL  = "num_bikes_available"
ID_COL = "station_id"
TS_COL = "ts_local"

# MLflow
MLFLOW_TRACKING_URI = "mlruns"
EXPERIMENT_NAME     = "ecobici_automl_flaml"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Config AutoML
TIME_BUDGET_SEC = 600   # podés subir/bajar
METRIC          = "rmse"
SEED            = 42

print("Entorno OK")

2025/10/24 23:18:32 INFO mlflow.tracking.fluent: Experiment with name 'ecobici_automl_flaml' does not exist. Creating a new experiment.


Entorno OK


In [2]:
# Celda 2 — Cargar splits
def load_df(p):
    assert os.path.exists(p), f"No existe {p}"
    return pd.read_parquet(p)

df_tr = load_df(TRAIN)
df_va = load_df(VAL)
df_te = load_df(TEST) if os.path.exists(TEST) else pd.DataFrame()

print("train:", df_tr.shape, "| val:", df_va.shape, "| test:", df_te.shape if not df_te.empty else "(no)")
df_tr[[ID_COL, TS_COL, Y_COL]].head(3)

train: (152463, 30) | val: (32774, 30) | test: (32712, 30)


Unnamed: 0,station_id,ts_local,num_bikes_available
0,2,2025-10-03 08:54:39-03:00,7
1,2,2025-10-03 08:57:40-03:00,7
2,2,2025-10-03 09:00:41-03:00,7


In [3]:
# Celda 3 — Preparar X, y (solo numéricas; coherente con baseline)
def num_only(df: pd.DataFrame) -> pd.DataFrame:
    return df.select_dtypes(include=["number"]).copy()

def make_X_y(df: pd.DataFrame, y_col: str, id_col: str, ts_col: str):
    y = pd.to_numeric(df[y_col], errors="coerce").astype(float).values
    X = df.drop(columns=[c for c in [y_col, id_col, ts_col] if c in df.columns], errors="ignore")
    X = num_only(X).fillna(0.0)
    return X, y

Xtr, ytr = make_X_y(df_tr, Y_COL, ID_COL, TS_COL)
Xva, yva = make_X_y(df_va, Y_COL, ID_COL, TS_COL)

Xte = yte = None
if not df_te.empty:
    Xte, yte = make_X_y(df_te, Y_COL, ID_COL, TS_COL)

Xtr.shape, Xva.shape

((152463, 20), (32774, 20))

In [9]:
# Celda 4 — Entrenamiento FLAML + logs a archivo (para monitorear con Celda 4b)
import os, sys, time, logging, joblib
from flaml import AutoML
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow

# --- Configuración de logs
log_path = os.path.join(EXPORT_DIR, "flaml.log")
for name in ["flaml", "flaml.automl"]:
    lg = logging.getLogger(name)
    lg.handlers = []
    lg.propagate = False
    fh = logging.FileHandler(log_path, mode="w")
    fh.setLevel(logging.INFO)
    lg.addHandler(fh)
    lg.setLevel(logging.INFO)

# --- Entrenamiento AutoML
automl = AutoML()

with mlflow.start_run(run_name="flaml_automl"):
    automl.fit(
        X_train=Xtr,
        y_train=ytr,
        task="regression",
        time_budget=TIME_BUDGET_SEC,   # segundos de límite total
        metric=METRIC,
        eval_method="holdout",
        X_val=Xva,
        y_val=yva,
        seed=SEED,
        log_file_name=log_path,        # los logs se guardan aquí
        n_jobs=-1,
    )

    # --- Evaluación en VALIDACIÓN ---
    yhat_va = np.clip(automl.predict(Xva), 0, None)
    rmse = mean_squared_error(yva, yhat_va, squared=False)
    mae  = mean_absolute_error(yva, yhat_va)
    r2   = r2_score(yva, yhat_va)

    mlflow.log_param("time_budget_sec", TIME_BUDGET_SEC)
    mlflow.log_param("metric", METRIC)
    mlflow.log_param("seed", SEED)
    mlflow.log_param("best_estimator", automl.best_estimator)
    mlflow.log_metric("val_rmse", rmse)
    mlflow.log_metric("val_mae",  mae)
    mlflow.log_metric("val_r2",   r2)

    MODEL_PATH = os.path.join(EXPORT_DIR, "flaml_automl.pkl")
    joblib.dump(automl, MODEL_PATH)
    mlflow.log_artifact(MODEL_PATH)

rmse, mae, r2, automl.best_estimator

[flaml.automl.logger: 10-24 23:49:16] {1679} INFO - task = regression
[flaml.automl.logger: 10-24 23:49:16] {1687} INFO - Data split method: uniform
[flaml.automl.logger: 10-24 23:49:16] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 10-24 23:49:16] {1788} INFO - Minimizing error metric: rmse
[flaml.automl.logger: 10-24 23:49:16] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 10-24 23:49:16] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 10-24 23:49:16] {2344} INFO - Estimated sufficient time budget=3771s. Estimated necessary time budget=27s.
[flaml.automl.logger: 10-24 23:49:17] {2391} INFO -  at 0.4s,	estimator lgbm's best error=3.5214,	best estimator lgbm's best error=3.5214
[flaml.automl.logger: 10-24 23:49:17] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 10-24 23:49:17] {2391} INFO -  at 0.6s,	estimator lgbm's best error=3.5214,	best est

[flaml.automl.logger: 10-24 23:49:22] {2218} INFO - iteration 34, current learner xgb_limitdepth
[flaml.automl.logger: 10-24 23:49:22] {2391} INFO -  at 6.1s,	estimator xgb_limitdepth's best error=0.7428,	best estimator rf's best error=0.7235
[flaml.automl.logger: 10-24 23:49:22] {2218} INFO - iteration 35, current learner extra_tree
[flaml.automl.logger: 10-24 23:49:22] {2391} INFO -  at 6.3s,	estimator extra_tree's best error=0.7535,	best estimator rf's best error=0.7235
[flaml.automl.logger: 10-24 23:49:22] {2218} INFO - iteration 36, current learner xgb_limitdepth
[flaml.automl.logger: 10-24 23:49:22] {2391} INFO -  at 6.4s,	estimator xgb_limitdepth's best error=0.7428,	best estimator rf's best error=0.7235
[flaml.automl.logger: 10-24 23:49:22] {2218} INFO - iteration 37, current learner xgb_limitdepth
[flaml.automl.logger: 10-24 23:49:23] {2391} INFO -  at 6.4s,	estimator xgb_limitdepth's best error=0.6930,	best estimator xgb_limitdepth's best error=0.6930
[flaml.automl.logger: 10

[flaml.automl.logger: 10-24 23:50:08] {2218} INFO - iteration 68, current learner lgbm
[flaml.automl.logger: 10-24 23:50:09] {2391} INFO -  at 52.5s,	estimator lgbm's best error=0.3272,	best estimator lgbm's best error=0.3272
[flaml.automl.logger: 10-24 23:50:09] {2218} INFO - iteration 69, current learner lgbm
[flaml.automl.logger: 10-24 23:50:09] {2391} INFO -  at 52.7s,	estimator lgbm's best error=0.3272,	best estimator lgbm's best error=0.3272
[flaml.automl.logger: 10-24 23:50:09] {2218} INFO - iteration 70, current learner lgbm
[flaml.automl.logger: 10-24 23:50:13] {2391} INFO -  at 57.3s,	estimator lgbm's best error=0.3272,	best estimator lgbm's best error=0.3272
[flaml.automl.logger: 10-24 23:50:13] {2218} INFO - iteration 71, current learner lgbm
[flaml.automl.logger: 10-24 23:50:18] {2391} INFO -  at 61.7s,	estimator lgbm's best error=0.2052,	best estimator lgbm's best error=0.2052
[flaml.automl.logger: 10-24 23:50:18] {2218} INFO - iteration 72, current learner lgbm
[flaml.au

[flaml.automl.logger: 10-24 23:52:59] {2218} INFO - iteration 104, current learner xgboost
[flaml.automl.logger: 10-24 23:53:08] {2391} INFO -  at 232.3s,	estimator xgboost's best error=0.2982,	best estimator lgbm's best error=0.1956
[flaml.automl.logger: 10-24 23:53:08] {2218} INFO - iteration 105, current learner lgbm
[flaml.automl.logger: 10-24 23:53:23] {2391} INFO -  at 247.2s,	estimator lgbm's best error=0.1956,	best estimator lgbm's best error=0.1956
[flaml.automl.logger: 10-24 23:53:23] {2218} INFO - iteration 106, current learner xgboost
[flaml.automl.logger: 10-24 23:53:28] {2391} INFO -  at 252.2s,	estimator xgboost's best error=0.2982,	best estimator lgbm's best error=0.1956
[flaml.automl.logger: 10-24 23:53:28] {2218} INFO - iteration 107, current learner xgb_limitdepth
[flaml.automl.logger: 10-24 23:54:31] {2391} INFO -  at 314.6s,	estimator xgb_limitdepth's best error=0.3500,	best estimator lgbm's best error=0.1956
[flaml.automl.logger: 10-24 23:54:31] {2218} INFO - iter

(0.19552743623345384, 0.05782265137792971, 0.9984712642404889, 'lgbm')

In [12]:
# Celda 4c — Resumen de iteraciones FLAML (conteo, mejor modelo y tiempo total)
import os, re, datetime as dt
import pandas as pd
from pprint import pprint

log_path = os.path.join(EXPORT_DIR, "flaml.log")

summary = {
    "best_estimator": getattr(automl, "best_estimator", None),
    "best_loss_rmse": getattr(automl, "best_loss", None),  # ya está en escala de la métrica (rmse)
    "models_evaluated": list(getattr(automl, "best_config_per_estimator", {}).keys()),
}

# --- contar iteraciones y estimar duración leyendo el log ---
iters = 0
t0 = t1 = None
ts_pat = re.compile(r"(?P<mm>\d{2})-(?P<dd>\d{2}) (?P<hh>\d{2}):(?P<mi>\d{2}):(?P<ss>\d{2})")
iter_pat = re.compile(r"\biteration\s+\d+", re.IGNORECASE)
year = dt.datetime.now().year

if os.path.exists(log_path):
    with open(log_path, "r") as f:
        lines = f.readlines()
    # timestamps (primero y último)
    for line in lines:
        m = ts_pat.search(line)
        if m:
            t0 = dt.datetime(year, int(m["mm"]), int(m["dd"]), int(m["hh"]), int(m["mi"]), int(m["ss"]))
            break
    for line in reversed(lines):
        m = ts_pat.search(line)
        if m:
            t1 = dt.datetime(year, int(m["mm"]), int(m["dd"]), int(m["hh"]), int(m["mi"]), int(m["ss"]))
            break
    # iteraciones (conteo de líneas con "iteration k")
    iters = sum(1 for ln in lines if iter_pat.search(ln))

duration = (t1 - t0).total_seconds() if (t0 and t1) else None

summary.update({
    "iterations_seen": iters,
    "wall_time_sec": duration,
})

print("📝 Resumen FLAML")
pprint(summary)

# (opcional) tabla bonita
pd.DataFrame([summary])

📝 Resumen FLAML
{'best_estimator': 'lgbm',
 'best_loss_rmse': 0.19555331710017126,
 'iterations_seen': 0,
 'models_evaluated': ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth'],
 'wall_time_sec': None}


Unnamed: 0,best_estimator,best_loss_rmse,models_evaluated,iterations_seen,wall_time_sec
0,lgbm,0.195553,"[lgbm, rf, xgboost, extra_tree, xgb_limitdepth]",0,


In [10]:
# Celda 5 — (Opcional) Evaluación en TEST
test_metrics = {}
if Xte is not None:
    yhat_te = automl.predict(Xte)
    yhat_te = np.clip(yhat_te, 0, None)

    rmse_t = mean_squared_error(yte, yhat_te, squared=False)
    mae_t  = mean_absolute_error(yte, yhat_te)
    r2_t   = r2_score(yte, yhat_te)

    test_metrics = {"test_rmse": rmse_t, "test_mae": mae_t, "test_r2": r2_t}
    print(f"TEST → RMSE={rmse_t:.4f} | MAE={mae_t:.4f} | R2={r2_t:.4f}")
else:
    print("No hay TEST → omitido.")

TEST → RMSE=0.4185 | MAE=0.0690 | R2=0.9932


In [11]:
# Celda 6 — Actualizar benchmark común (reports/automl_bench.csv)
bench_path = os.path.join(REPORTS_DIR, "automl_bench.csv")
row = {
    "framework": "flaml",
    "model_path": MODEL_PATH,
    "val_rmse": rmse,
    "val_mae": mae,
    "val_r2": r2,
    **test_metrics
}
bench = pd.DataFrame([row])

if os.path.exists(bench_path):
    prev = pd.read_csv(bench_path)
    bench = pd.concat([prev, bench], ignore_index=True)

bench.to_csv(bench_path, index=False)
bench.tail(5)

Unnamed: 0,framework,model_path,val_rmse,val_mae,val_r2,test_rmse,test_mae,test_r2
0,flaml,../models/06_flaml/flaml_automl.pkl,0.195527,0.057823,0.998471,0.418451,0.069021,0.993236


### Celda 7 — Nota final e integración


- Modelo FLAML: {MODEL_PATH}
- Bench actualizado: {bench_path}
- Runs en MLflow → usá: make mlflow-ui

Próximo:
- Notebook de comparativa (baseline RF vs PyCaret vs FLAML) leyendo reports/automl_bench.csv
- Elegir ganador por menor val_rmse (empate → MAE, luego R²).
