In [76]:
import os, mlflow
from dotenv import load_dotenv

load_dotenv(override=True)  # Carga las variables del archivo .env
EXPERIMENT_NAME = "/Users/priscila.cervantes@iteso.mx/nyc-taxi-experiments"

mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

2025/10/28 21:33:41 INFO mlflow.tracking.fluent: Experiment with name '/Users/priscila.cervantes@iteso.mx/nyc-taxi-experiments' does not exist. Creating a new experiment.


## Train Model

In [77]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer

In [78]:
def read_dataframe(filename):

    df = pd.read_parquet(filename)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [79]:
df_train = read_dataframe('C:/Users/prisc/apps/nyc-taxi-predictions-2025/data/green_tripdata_2025-01.parquet')
df_val = read_dataframe('C:/Users/prisc/apps/nyc-taxi-predictions-2025/data/green_tripdata_2025-02.parquet')

In [80]:
def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)

df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

X_val = preprocess(df_val, dv)

In [81]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [82]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

## Optimización de hiperparámetros usando Optuna

In [83]:
import math
import optuna
import pathlib
import xgboost as xgb
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature

In [84]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

## Función objetivo

In [85]:
# ------------------------------------------------------------
# Definir la función objetivo para Optuna
#    - Recibe un `trial`, que se usa para proponer hiperparámetros.
#    - Entrena un modelo con esos hiperparámetros.
#    - Calcula la métrica de validación (RMSE) y la retorna (Optuna la minimizará).
#    - Abrimos un run anidado de MLflow para registrar cada trial.
# ------------------------------------------------------------
def objective(trial: optuna.trial.Trial):
    # Hiperparámetros MUESTREADOS por Optuna en CADA trial.
    # Nota: usamos log=True para emular rangos log-uniformes (similar a loguniform).
    params = {
        "max_depth": trial.suggest_int("max_depth", 4, 100),
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-3), 1.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha",   math.exp(-5), math.exp(-1), log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", math.exp(-6), math.exp(-1), log=True),
        "min_child_weight": trial.suggest_float("min_child_weight", math.exp(-1), math.exp(3), log=True),
        "objective": "reg:squarederror",  
        "seed": 42,                      
    }

    # Run anidado para dejar rastro de cada trial en MLflow
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "xgboost")  # etiqueta informativa
        mlflow.log_params(params)                  # registra hiperparámetros del trial

        # Entrenamiento con early stopping en el conjunto de validación
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=100,
            evals=[(valid, "validation")],
            early_stopping_rounds=10,
        )

        # Predicción y métrica en validación
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Registrar la métrica principal
        mlflow.log_metric("rmse", rmse)

        # La "signature" describe la estructura esperada de entrada y salida del modelo:
        # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
        # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
        signature = infer_signature(X_val, y_pred)

        # Guardar el modelo del trial como artefacto en MLflow.
        mlflow.xgboost.log_model(
            booster,
            name="model",
            input_example=X_val[:5],
            signature=signature
        )

    # Optuna minimiza el valor retornado
    return rmse

## Flujo de búsqueda

In [86]:
mlflow.xgboost.autolog(log_models=False)

# ------------------------------------------------------------
# Crear el estudio de Optuna
#    - Usamos TPE (Tree-structured Parzen Estimator) como sampler.
#    - direction="minimize" porque queremos minimizar el RMSE.
# ------------------------------------------------------------
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)

# ------------------------------------------------------------
# Ejecutar la optimización (n_trials = número de intentos)
#    - Cada trial ejecuta la función objetivo con un set distinto de hiperparámetros.
#    - Abrimos un run "padre" para agrupar toda la búsqueda.
# ------------------------------------------------------------
with mlflow.start_run(run_name="XGBoost Hyperparameter Optimization (Optuna)", nested=True):
    study.optimize(objective, n_trials=10)

    # --------------------------------------------------------
    # Recuperar y registrar los mejores hiperparámetros
    # --------------------------------------------------------
    best_params = study.best_params
    # Asegurar tipos/campos fijos (por claridad y consistencia)
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["seed"] = 42
    best_params["objective"] = "reg:squarederror"

    mlflow.log_params(best_params)

    # Etiquetas del run "padre" (metadatos del experimento)
    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "xgboost",
        "feature_set_version": 1,
    })

    # --------------------------------------------------------
    # 7) Entrenar un modelo FINAL con los mejores hiperparámetros
    #    (normalmente se haría sobre train+val o con CV; aquí mantenemos el patrón original)
    # --------------------------------------------------------
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=100,
        evals=[(valid, "validation")],
        early_stopping_rounds=10,
    )

    # Evaluar y registrar la métrica final en validación
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # --------------------------------------------------------
    # 8) Guardar artefactos adicionales (p. ej. el preprocesador)
    # --------------------------------------------------------
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # La "signature" describe la estructura esperada de entrada y salida del modelo:
    # incluye los nombres, tipos y forma (shape) de las variables de entrada y el tipo de salida.
    # MLflow la usa para validar datos en inferencia y documentar el modelo en el Model Registry.
    # Si X_val es la matriz dispersa (scipy.sparse) salida de DictVectorizer:
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)

    # Para que las longitudes coincidan, usa el mismo slice en y_pred
    signature = infer_signature(input_example, y_val[:5])

    # Guardar el modelo del trial como artefacto en MLflow.
    mlflow.xgboost.log_model(
        booster,
        name="model",
        input_example=input_example,
        signature=signature
    )

[I 2025-10-28 21:34:23,417] A new study created in memory with name: no-name-54c6faa4-9dde-4b5f-82e5-960ea75f07cf


[0]	validation-rmse:5.72427
[1]	validation-rmse:5.57860
[2]	validation-rmse:5.56409
[3]	validation-rmse:5.56982
[4]	validation-rmse:5.57347
[5]	validation-rmse:5.55585
[6]	validation-rmse:5.55736
[7]	validation-rmse:5.55253
[8]	validation-rmse:5.55232
[9]	validation-rmse:5.53322
[10]	validation-rmse:5.53156
[11]	validation-rmse:5.53006
[12]	validation-rmse:5.52808
[13]	validation-rmse:5.52782
[14]	validation-rmse:5.52451
[15]	validation-rmse:5.52324
[16]	validation-rmse:5.52269
[17]	validation-rmse:5.52229
[18]	validation-rmse:5.52366
[19]	validation-rmse:5.52900
[20]	validation-rmse:5.52988
[21]	validation-rmse:5.52969
[22]	validation-rmse:5.52872
[23]	validation-rmse:5.53195
[24]	validation-rmse:5.52894
[25]	validation-rmse:5.52702
[26]	validation-rmse:5.53066


  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:37:23,247] Trial 0 finished with value: 5.5334711429183985 and parameters: {'max_depth': 40, 'learning_rate': 0.8625543817410922, 'reg_alpha': 0.12593061066249622, 'reg_lambda': 0.049454235173237264, 'min_child_weight': 0.6866535292359801}. Best is trial 0 with value: 5.5334711429183985.


🏃 View run secretive-skink-307 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/cdecf9bb570b46d7ac946c6d11d2e2fa
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.77707
[1]	validation-rmse:8.47452
[2]	validation-rmse:8.19669
[3]	validation-rmse:7.94182
[4]	validation-rmse:7.70901
[5]	validation-rmse:7.49664
[6]	validation-rmse:7.30245
[7]	validation-rmse:7.12626
[8]	validation-rmse:6.96581
[9]	validation-rmse:6.81994
[10]	validation-rmse:6.68775
[11]	validation-rmse:6.56832
[12]	validation-rmse:6.46039
[13]	validation-rmse:6.36272
[14]	validation-rmse:6.27449
[15]	validation-rmse:6.19394
[16]	validation-rmse:6.12203
[17]	validation-rmse:6.05669
[18]	validation-rmse:5.99838
[19]	validation-rmse:5.94487
[20]	validation-rmse:5.89709
[21]	validation-rmse:5.85472
[22]	validation-rmse:5.81565
[23]	validation-rmse:5.78117
[24]	validation-rmse:5.74956
[25]	validation-rmse:5.72125
[26

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:38:04,749] Trial 1 finished with value: 5.410463874732254 and parameters: {'max_depth': 19, 'learning_rate': 0.059264241587996896, 'reg_alpha': 0.21539205131792016, 'reg_lambda': 0.05006540936006931, 'min_child_weight': 6.248180561354165}. Best is trial 1 with value: 5.410463874732254.


🏃 View run nervous-whale-511 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/83d76e5a10c041babbd32e3b3dea126b
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:5.85130
[1]	validation-rmse:5.74043
[2]	validation-rmse:5.72146
[3]	validation-rmse:5.71927
[4]	validation-rmse:5.70906
[5]	validation-rmse:5.70183
[6]	validation-rmse:5.68997
[7]	validation-rmse:5.67941
[8]	validation-rmse:5.67714
[9]	validation-rmse:5.67594
[10]	validation-rmse:5.67850
[11]	validation-rmse:5.67582
[12]	validation-rmse:5.67678
[13]	validation-rmse:5.66184
[14]	validation-rmse:5.65662
[15]	validation-rmse:5.65596
[16]	validation-rmse:5.65571
[17]	validation-rmse:5.65327
[18]	validation-rmse:5.64906
[19]	validation-rmse:5.65130
[20]	validation-rmse:5.64476
[21]	validation-rmse:5.64460
[22]	validation-rmse:5.64547
[23]	validation-rmse:5.64525
[24]	validation-rmse:5.64959
[25]	validation-rmse:5.64695
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:38:32,372] Trial 2 finished with value: 5.609300918544036 and parameters: {'max_depth': 5, 'learning_rate': 0.9136840519292247, 'reg_alpha': 0.18820387978911576, 'reg_lambda': 0.007166739666045858, 'min_child_weight': 0.7613210498541186}. Best is trial 1 with value: 5.410463874732254.


🏃 View run rambunctious-pig-750 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/c52bf9ce66ba43a98e3728d5cc35341d
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.42254
[1]	validation-rmse:7.85630
[2]	validation-rmse:7.39452
[3]	validation-rmse:7.01544
[4]	validation-rmse:6.70564
[5]	validation-rmse:6.45556
[6]	validation-rmse:6.25581
[7]	validation-rmse:6.09283
[8]	validation-rmse:5.96479
[9]	validation-rmse:5.85869
[10]	validation-rmse:5.77467
[11]	validation-rmse:5.70770
[12]	validation-rmse:5.65378
[13]	validation-rmse:5.61140
[14]	validation-rmse:5.57614
[15]	validation-rmse:5.54766
[16]	validation-rmse:5.52424
[17]	validation-rmse:5.50393
[18]	validation-rmse:5.48481
[19]	validation-rmse:5.47111
[20]	validation-rmse:5.45949
[21]	validation-rmse:5.45074
[22]	validation-rmse:5.44255
[23]	validation-rmse:5.43633
[24]	validation-rmse:5.42916
[25]	validation-rmse:5.42227
[2

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:39:15,506] Trial 3 finished with value: 5.362717739173819 and parameters: {'max_depth': 21, 'learning_rate': 0.12402485733085497, 'reg_alpha': 0.054969638498598095, 'reg_lambda': 0.02148769342025257, 'min_child_weight': 1.1792947151892554}. Best is trial 3 with value: 5.362717739173819.


🏃 View run merciful-wasp-230 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/56d1062fb8524baf967bce9817735caf
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.66900
[1]	validation-rmse:8.27724
[2]	validation-rmse:7.92728
[3]	validation-rmse:7.61583
[4]	validation-rmse:7.33855
[5]	validation-rmse:7.09228
[6]	validation-rmse:6.87486
[7]	validation-rmse:6.68312
[8]	validation-rmse:6.51512
[9]	validation-rmse:6.36795
[10]	validation-rmse:6.23874
[11]	validation-rmse:6.12574
[12]	validation-rmse:6.02624
[13]	validation-rmse:5.93991
[14]	validation-rmse:5.86418
[15]	validation-rmse:5.79827
[16]	validation-rmse:5.74190
[17]	validation-rmse:5.69254
[18]	validation-rmse:5.65012
[19]	validation-rmse:5.61385
[20]	validation-rmse:5.58229
[21]	validation-rmse:5.55495
[22]	validation-rmse:5.53061
[23]	validation-rmse:5.50968
[24]	validation-rmse:5.49174
[25]	validation-rmse:5.47625
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:40:56,902] Trial 4 finished with value: 5.350864717798953 and parameters: {'max_depth': 63, 'learning_rate': 0.07565903471570516, 'reg_alpha': 0.021678779375600917, 'reg_lambda': 0.015480241912324163, 'min_child_weight': 2.2802382585441565}. Best is trial 4 with value: 5.350864717798953.


🏃 View run puzzled-snipe-422 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/b1e94feef25e4e2fa49cc01e302b3efb
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.58919
[1]	validation-rmse:8.14344
[2]	validation-rmse:7.75791
[3]	validation-rmse:7.41843
[4]	validation-rmse:7.12684
[5]	validation-rmse:6.87995
[6]	validation-rmse:6.66664
[7]	validation-rmse:6.49156
[8]	validation-rmse:6.33790
[9]	validation-rmse:6.21237
[10]	validation-rmse:6.09043
[11]	validation-rmse:6.00246
[12]	validation-rmse:5.92540
[13]	validation-rmse:5.85087
[14]	validation-rmse:5.79602
[15]	validation-rmse:5.74859
[16]	validation-rmse:5.71014
[17]	validation-rmse:5.67349
[18]	validation-rmse:5.64618
[19]	validation-rmse:5.62571
[20]	validation-rmse:5.60377
[21]	validation-rmse:5.58592
[22]	validation-rmse:5.57306
[23]	validation-rmse:5.55814
[24]	validation-rmse:5.54897
[25]	validation-rmse:5.54108
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:43:49,759] Trial 5 finished with value: 5.479673192795726 and parameters: {'max_depth': 80, 'learning_rate': 0.0906292152736207, 'reg_alpha': 0.05270408847118816, 'reg_lambda': 0.04793414660944966, 'min_child_weight': 0.4429943118354462}. Best is trial 4 with value: 5.350864717798953.


🏃 View run unleashed-pig-427 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/61b3921c0a414e2991ff508374243ec2
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.65885
[1]	validation-rmse:8.26292
[2]	validation-rmse:7.91356
[3]	validation-rmse:7.60600
[4]	validation-rmse:7.33639
[5]	validation-rmse:7.10061
[6]	validation-rmse:6.89529
[7]	validation-rmse:6.71671
[8]	validation-rmse:6.56176
[9]	validation-rmse:6.42779
[10]	validation-rmse:6.31135
[11]	validation-rmse:6.21121
[12]	validation-rmse:6.12498
[13]	validation-rmse:6.05098
[14]	validation-rmse:5.98728
[15]	validation-rmse:5.93214
[16]	validation-rmse:5.88454
[17]	validation-rmse:5.84362
[18]	validation-rmse:5.80836
[19]	validation-rmse:5.77731
[20]	validation-rmse:5.74992
[21]	validation-rmse:5.72638
[22]	validation-rmse:5.70614
[23]	validation-rmse:5.68871
[24]	validation-rmse:5.67341
[25]	validation-rmse:5.66015
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:44:47,757] Trial 6 finished with value: 5.490218802659253 and parameters: {'max_depth': 62, 'learning_rate': 0.08304043435235499, 'reg_alpha': 0.008740449782948887, 'reg_lambda': 0.28491274207986833, 'min_child_weight': 17.505727836123448}. Best is trial 4 with value: 5.350864717798953.


🏃 View run bouncy-deer-590 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/5349cc150e954fdc93c72271141641b7
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.39542
[1]	validation-rmse:7.80634
[2]	validation-rmse:7.32181
[3]	validation-rmse:6.92746
[4]	validation-rmse:6.60765
[5]	validation-rmse:6.35062
[6]	validation-rmse:6.14572
[7]	validation-rmse:5.98047
[8]	validation-rmse:5.84935
[9]	validation-rmse:5.74742
[10]	validation-rmse:5.66779
[11]	validation-rmse:5.60712
[12]	validation-rmse:5.55894
[13]	validation-rmse:5.52208
[14]	validation-rmse:5.49258
[15]	validation-rmse:5.46956
[16]	validation-rmse:5.45197
[17]	validation-rmse:5.43876
[18]	validation-rmse:5.42803
[19]	validation-rmse:5.41909
[20]	validation-rmse:5.41089
[21]	validation-rmse:5.40521
[22]	validation-rmse:5.40092
[23]	validation-rmse:5.39787
[24]	validation-rmse:5.39264
[25]	validation-rmse:5.38880
[26]	va

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:46:36,958] Trial 7 finished with value: 5.363914469142168 and parameters: {'max_depth': 82, 'learning_rate': 0.12416316985362412, 'reg_alpha': 0.009958672056108932, 'reg_lambda': 0.0758623422350637, 'min_child_weight': 2.1395809133199974}. Best is trial 4 with value: 5.350864717798953.


🏃 View run caring-deer-207 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/393545a68493428399ee4cd76ef3041c
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:7.93485
[1]	validation-rmse:7.12335
[2]	validation-rmse:6.57205
[3]	validation-rmse:6.21207
[4]	validation-rmse:5.96818
[5]	validation-rmse:5.80612
[6]	validation-rmse:5.69974
[7]	validation-rmse:5.62792
[8]	validation-rmse:5.57476
[9]	validation-rmse:5.53568
[10]	validation-rmse:5.50954
[11]	validation-rmse:5.49210
[12]	validation-rmse:5.47949
[13]	validation-rmse:5.46654
[14]	validation-rmse:5.45983
[15]	validation-rmse:5.45624
[16]	validation-rmse:5.45286
[17]	validation-rmse:5.45058
[18]	validation-rmse:5.44780
[19]	validation-rmse:5.44653
[20]	validation-rmse:5.44299
[21]	validation-rmse:5.44208
[22]	validation-rmse:5.44028
[23]	validation-rmse:5.43868
[24]	validation-rmse:5.43611
[25]	validation-rmse:5.43490
[26]	va

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:47:16,190] Trial 8 finished with value: 5.372302733772477 and parameters: {'max_depth': 15, 'learning_rate': 0.21992487468175848, 'reg_alpha': 0.007731550026907306, 'reg_lambda': 0.23377457337376373, 'min_child_weight': 1.0357439143907545}. Best is trial 4 with value: 5.350864717798953.


🏃 View run dazzling-fox-511 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/5b83d3d44e4549fd8c878ac8ce1f5e35
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.39575
[1]	validation-rmse:7.81543
[2]	validation-rmse:7.34295
[3]	validation-rmse:6.94812
[4]	validation-rmse:6.64996
[5]	validation-rmse:6.40028
[6]	validation-rmse:6.20838
[7]	validation-rmse:6.04823
[8]	validation-rmse:5.93009
[9]	validation-rmse:5.83558
[10]	validation-rmse:5.75569
[11]	validation-rmse:5.69848
[12]	validation-rmse:5.65635
[13]	validation-rmse:5.61594
[14]	validation-rmse:5.59049
[15]	validation-rmse:5.56877
[16]	validation-rmse:5.55140
[17]	validation-rmse:5.53337
[18]	validation-rmse:5.52224
[19]	validation-rmse:5.51339
[20]	validation-rmse:5.50408
[21]	validation-rmse:5.50166
[22]	validation-rmse:5.49717
[23]	validation-rmse:5.48976
[24]	validation-rmse:5.48665
[25]	validation-rmse:5.48327
[26]	v

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)
[I 2025-10-28 21:49:15,162] Trial 9 finished with value: 5.460277869585555 and parameters: {'max_depth': 68, 'learning_rate': 0.1268351874747755, 'reg_alpha': 0.05394836382863035, 'reg_lambda': 0.03814164293595655, 'min_child_weight': 0.7706028272535065}. Best is trial 4 with value: 5.350864717798953.


🏃 View run likeable-boar-889 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/5ed4949aba144c1cb8e4f76b06998993
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042
[0]	validation-rmse:8.66900
[1]	validation-rmse:8.27724
[2]	validation-rmse:7.92728
[3]	validation-rmse:7.61583
[4]	validation-rmse:7.33855
[5]	validation-rmse:7.09228
[6]	validation-rmse:6.87486
[7]	validation-rmse:6.68312
[8]	validation-rmse:6.51512
[9]	validation-rmse:6.36795
[10]	validation-rmse:6.23874
[11]	validation-rmse:6.12574
[12]	validation-rmse:6.02624
[13]	validation-rmse:5.93991
[14]	validation-rmse:5.86418
[15]	validation-rmse:5.79827
[16]	validation-rmse:5.74190
[17]	validation-rmse:5.69254
[18]	validation-rmse:5.65012
[19]	validation-rmse:5.61385
[20]	validation-rmse:5.58229
[21]	validation-rmse:5.55495
[22]	validation-rmse:5.53061
[23]	validation-rmse:5.50968
[24]	validation-rmse:5.49174
[25]	validation-rmse:5.47625
[26]	

  xgb_model.save_model(model_data_path)


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


🏃 View run XGBoost Hyperparameter Optimization (Optuna) at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/1c88711fc4ae4d0db27b73ef9f0f68ea
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042


## Registrar modelo en 'Model Registry'

### Método 1: Manual

In [87]:
model_name = "workspace.default.nyc-taxi-model"

In [90]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="workspace.default.nyc-taxi-model"
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '11' of model 'workspace.default.nyc-taxi-model'.


### Método 2: Automático

In [89]:
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    order_by=["metrics.rmse ASC"],
    output_format="list"
)

# Obtener el mejor run
if len(runs) > 0:
    best_run = runs[0]
    print("🏆 Champion Run encontrado:")
    print(f"Run ID: {best_run.info.run_id}")
    print(f"RMSE: {best_run.data.metrics['rmse']}")
    print(f"Params: {best_run.data.params}")
else:
    print("⚠️ No se encontraron runs con métrica RMSE.")

🏆 Champion Run encontrado:
Run ID: b1e94feef25e4e2fa49cc01e302b3efb
RMSE: 5.350864717798953
Params: {'custom_metric': 'None', 'early_stopping_rounds': '10', 'learning_rate': '0.07565903471570516', 'max_depth': '63', 'maximize': 'None', 'min_child_weight': '2.2802382585441565', 'num_boost_round': '100', 'objective': 'reg:squarederror', 'reg_alpha': '0.021678779375600917', 'reg_lambda': '0.015480241912324163', 'seed': '42', 'verbose_eval': 'True'}


In [91]:
run_id = best_run.info.run_id

In [92]:
result = mlflow.register_model(
    model_uri=f"runs:/{best_run.info.run_id}/model",
    name=model_name
)

Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '12' of model 'workspace.default.nyc-taxi-model'.


## Asignar alias

In [93]:
from mlflow import MlflowClient

client = MlflowClient()

In [94]:
model_version = result.version
new_alias = "Champion"

client.set_registered_model_alias(
    name=model_name,
    alias=new_alias,
    version=result.version
)

In [95]:
from datetime import datetime

date = datetime.today()

client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=[], creation_timestamp=1761709949240, current_stage=None, deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description=('The model version 12 was transitioned to Champion on 2025-10-28 '
 '21:53:19.804600'), last_updated_timestamp=1761709996374, metrics=[<Metric: dataset_digest='', dataset_name='', key='best_iteration', model_id='m-5746a243e69c4ab996b82f3d19df9f3e', run_id='b1e94feef25e4e2fa49cc01e302b3efb', step=0, timestamp=1761709214676, value=96.0>,
 <Metric: dataset_digest='', dataset_name='', key='rmse', model_id='m-5746a243e69c4ab996b82f3d19df9f3e', run_id='b1e94feef25e4e2fa49cc01e302b3efb', step=0, timestamp=1761709232081, value=5.350864717798953>,
 <Metric: dataset_digest='', dataset_name='', key='stopped_iteration', model_id='m-5746a243e69c4ab996b82f3d19df9f3e', run_id='b1e94feef25e4e2fa49cc01e302b3

## Obteniendo modelos de model registry

In [96]:
import mlflow.pyfunc

model_version_uri = f"models:/{model_name}@Champion"

champion_version = mlflow.pyfunc.load_model(model_version_uri)
champion_version.predict(X_val)

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


array([ 5.711086, 26.219116, 28.056229, ..., 21.681288, 15.960737,
       22.701014], shape=(44218,), dtype=float32)