## Tarea 

In [220]:
import os, mlflow
from dotenv import load_dotenv
import pickle
import pandas as pd
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_extraction import DictVectorizer

## Configuración de MLflow

In [221]:
# Load settings (e.g. Databricks credentials) from a local .env file.
load_dotenv(override=True)

# Experiment path in the Databricks workspace. It can be overridden with the
# MLFLOW_EXPERIMENT_NAME environment variable (for instance from .env) so the
# notebook is not tied to a single user's workspace folder; the default keeps
# the original location.
EXPERIMENT_NAME = os.environ.get(
    "MLFLOW_EXPERIMENT_NAME",
    "/Users/priscila.cervantes@iteso.mx/nyc-taxi-experiments",
)

# Send all tracking calls to Databricks and select (or create) the experiment.
mlflow.set_tracking_uri("databricks")
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

## Pre-procesamiento

In [222]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Steps performed:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: dropoff minus pickup time, in minutes.
      3. Keep only trips whose duration is between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.
            It must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
            ``PULocationID`` and ``DOLocationID`` columns.

    Returns:
        A new DataFrame with the filtered rows, the extra ``duration`` column
        and string-typed location ID columns.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so that the
    # column assignment below does not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [223]:
# Load the January 2025 file as the training frame and the February 2025 file
# as the validation frame; both go through the same cleaning in read_dataframe.
df_train = read_dataframe("../data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("../data/green_tripdata_2025-02.parquet")


In [224]:
def preprocess(df, dv):
    """Turn a trips DataFrame into model features with an already-fitted vectorizer.

    A ``PU_DO`` column (pickup and dropoff location IDs joined by ``_``) is
    added to ``df`` in place. One dictionary per row, holding ``PU_DO`` and
    ``trip_distance``, is then handed to ``dv.transform``.

    Args:
        df: DataFrame with string ``PULocationID`` / ``DOLocationID`` columns
            and a ``trip_distance`` column. It is modified in place.
        dv: Object exposing ``transform(list_of_dicts)``.

    Returns:
        Whatever ``dv.transform`` returns for the row dictionaries.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)

# Build the combined pickup/dropoff feature on the training frame.
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

# Fit the vectorizer on the training records only.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Validation data reuses the fitted vectorizer (transform only) via preprocess().
X_val = preprocess(df_val, dv)

In [225]:
# Regression target: the 'duration' column, extracted as arrays for the
# training and validation frames.
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [226]:
# Wrap the training / validation data as MLflow dataset objects.
# NOTE(review): X_train / X_val come from DictVectorizer and are presumably
# scipy sparse matrices; if so, `.data` is only the flat array of stored
# values, not the feature matrix — confirm this is what should be recorded.
# NOTE(review): neither object is referenced again in the visible cells (there
# is no mlflow.log_input call), so they do not appear to be attached to a run.
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

## Optuna

In [227]:
import math
import optuna
import pathlib
from optuna.samplers import TPESampler
from mlflow.models.signature import infer_signature
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor


## Objetivo: Gradient Boosting

In [None]:
def objective_gb(trial: optuna.trial.Trial):
    """Optuna objective for GradientBoostingRegressor.

    Samples one hyperparameter set from ``trial``, trains on the notebook-level
    ``X_train`` / ``y_train``, scores on ``X_val`` / ``y_val`` and records the
    parameters, the RMSE and the fitted model in a nested MLflow run.

    Args:
        trial: Optuna trial used to suggest the hyperparameters.

    Returns:
        Validation RMSE of the fitted model (the value Optuna minimises).
    """
    search_space = {
        "learning_rate": trial.suggest_float("learning_rate", math.exp(-7), 0.3, log=True),
        "max_leaf_nodes": trial.suggest_int("max_leaf_nodes", 10, 80),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        # Fixed seed: not tuned, only there to make each trial repeatable.
        "random_state": 42,
    }

    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "gradient_boosting")
        mlflow.log_params(search_space)

        regressor = GradientBoostingRegressor(**search_space)
        regressor.fit(X_train, y_train)

        val_predictions = regressor.predict(X_val)
        val_rmse = root_mean_squared_error(y_val, val_predictions)
        mlflow.log_metric("rmse", val_rmse)

        model_signature = infer_signature(X_val, val_predictions)
        mlflow.sklearn.log_model(
            regressor,
            "model",
            input_example=X_val[:5],
            signature=model_signature,
        )

    return val_rmse

In [None]:
# Enable scikit-learn autologging with model logging switched off
# (log_models=False); models are logged explicitly with log_model below.
mlflow.sklearn.autolog(log_models=False)

sampler = TPESampler(seed=42)
study_gb = optuna.create_study(direction="minimize", sampler=sampler)

with mlflow.start_run(run_name="GradientBoost Hyperparameter Optimization (Optuna)", nested=True):
    # Every trial opens its own nested run inside objective_gb.
    study_gb.optimize(objective_gb, n_trials=10)

    best_params = study_gb.best_params

    mlflow.log_params(best_params)

    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "gradient_boosting",
        "feature_set_version": 1,
    })

    # study.best_params only contains the values suggested by Optuna, so the
    # fixed random_state=42 used in every trial has to be supplied again here.
    # Without it the refit is unseeded and not comparable with the winning trial.
    final_model = GradientBoostingRegressor(**best_params, random_state=42)
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so that inference can
    # rebuild the same feature columns.
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # Dense, column-named example of the first five validation rows, used for
    # both the model signature and the stored input example.
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
    signature = infer_signature(input_example, y_val[:5])

    mlflow.sklearn.log_model(final_model, "model", input_example=input_example, signature=signature)

[I 2025-10-29 17:43:47,580] A new study created in memory with name: no-name-8ad069aa-e3b4-4466-bbf3-bdb7a0161403


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:45:18 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run able-carp-560 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/dd755a030cfb408d99c391e2139149ad
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042


[I 2025-10-29 17:45:23,872] Trial 0 finished with value: 6.496399816815654 and parameters: {'learning_rate': 0.007993270448118463, 'max_leaf_nodes': 77, 'max_depth': 10, 'min_samples_leaf': 12}. Best is trial 0 with value: 6.496399816815654.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:45:48 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run powerful-shrike-845 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/d2c2bed14c9740a9930060356d9486c2
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042


[I 2025-10-29 17:45:53,462] Trial 1 finished with value: 8.125801155365895 and parameters: {'learning_rate': 0.0022525064230539864, 'max_leaf_nodes': 21, 'max_depth': 3, 'min_samples_leaf': 18}. Best is trial 0 with value: 6.496399816815654.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:46:11 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:46:13,749] Trial 2 finished with value: 5.716773865305269 and parameters: {'learning_rate': 0.029720416526464566, 'max_leaf_nodes': 60, 'max_depth': 3, 'min_samples_leaf': 20}. Best is trial 2 with value: 5.716773865305269.


🏃 View run awesome-shoat-180 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/42363216d3ef4103b01cf8870625bae6
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:46:30 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:46:35,932] Trial 3 finished with value: 5.538795759490097 and parameters: {'learning_rate': 0.11359227064780915, 'max_leaf_nodes': 25, 'max_depth': 4, 'min_samples_leaf': 4}. Best is trial 3 with value: 5.538795759490097.


🏃 View run welcoming-shrike-691 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/37b5819c899f421faf217a720d0d1a10
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:47:14 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:47:18,492] Trial 4 finished with value: 7.059639544338837 and parameters: {'learning_rate': 0.005318288777514629, 'max_leaf_nodes': 47, 'max_depth': 7, 'min_samples_leaf': 6}. Best is trial 3 with value: 5.538795759490097.


🏃 View run legendary-grouse-982 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/b67875be0ad74656bba76cdae3bb5124
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:47:38 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:47:40,669] Trial 5 finished with value: 5.601087544815117 and parameters: {'learning_rate': 0.03162890116844962, 'max_leaf_nodes': 19, 'max_depth': 5, 'min_samples_leaf': 8}. Best is trial 3 with value: 5.538795759490097.


🏃 View run silent-newt-328 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/769bae520e2540a29136db3eb1489f7a
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:47:58 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:48:00,987] Trial 6 finished with value: 6.066710078751305 and parameters: {'learning_rate': 0.012821831585029154, 'max_leaf_nodes': 65, 'max_depth': 4, 'min_samples_leaf': 11}. Best is trial 3 with value: 5.538795759490097.


🏃 View run righteous-frog-4 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/3ede0ec7d9534ef1b03c0f293fa08fe3
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:48:18 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:48:21,153] Trial 7 finished with value: 5.656789400850453 and parameters: {'learning_rate': 0.02825883723693529, 'max_leaf_nodes': 13, 'max_depth': 9, 'min_samples_leaf': 4}. Best is trial 3 with value: 5.538795759490097.


🏃 View run persistent-bat-947 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/ddb7a33d49144fcdab5ba07d8d9542fb
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:48:53 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:48:56,258] Trial 8 finished with value: 8.43511209542287 and parameters: {'learning_rate': 0.001329490892086439, 'max_leaf_nodes': 77, 'max_depth': 12, 'min_samples_leaf': 17}. Best is trial 3 with value: 5.538795759490097.


🏃 View run selective-cod-140 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/651fc0d1adaa440096adf964ae21d061
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:49:16 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:49:18,728] Trial 9 finished with value: 7.11299850468749 and parameters: {'learning_rate': 0.00532975339252159, 'max_leaf_nodes': 16, 'max_depth': 9, 'min_samples_leaf': 9}. Best is trial 3 with value: 5.538795759490097.


🏃 View run merciful-crane-16 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/3bdae6f2a8634648aed8c6bf475a277e
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:50:24 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run GradientBoost Hyperparameter Optimization (Optuna) at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/2bda52049fb34f2296200f8767ad4d5e
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042


## Objetivo: Random Forest

In [None]:
def objective_rf(trial: optuna.trial.Trial):
    """Optuna objective for RandomForestRegressor.

    Samples one hyperparameter set from ``trial``, trains on the notebook-level
    ``X_train`` / ``y_train``, scores on ``X_val`` / ``y_val`` and records the
    parameters, the RMSE and the fitted model in a nested MLflow run.

    Args:
        trial: Optuna trial used to suggest the hyperparameters.

    Returns:
        Validation RMSE of the fitted model (the value Optuna minimises).
    """
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 30, 80),
        "max_depth": trial.suggest_int("max_depth", 5, 40),
        "min_samples_split": trial.suggest_int("min_samples_split", 5, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    }

    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "random_forest")
        mlflow.log_params(search_space)

        # n_jobs / random_state are fixed settings and are not part of the
        # logged search space.
        forest = RandomForestRegressor(**search_space, n_jobs=-1, random_state=42)
        forest.fit(X_train, y_train)

        val_predictions = forest.predict(X_val)
        val_rmse = root_mean_squared_error(y_val, val_predictions)
        mlflow.log_metric("rmse", val_rmse)

        model_signature = infer_signature(X_val, val_predictions)
        mlflow.sklearn.log_model(
            forest,
            "model",
            input_example=X_val[:5],
            signature=model_signature,
        )

    return val_rmse

In [None]:
# Enable scikit-learn autologging with model logging switched off
# (log_models=False); models are logged explicitly with log_model below.
# One call is enough: the same configuration stays active inside the run.
mlflow.sklearn.autolog(log_models=False)

study_rf = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
with mlflow.start_run(run_name="RandomForest Hyperparameter Optimization (Optuna)", nested=True):
    # Every trial opens its own nested run inside objective_rf.
    study_rf.optimize(objective_rf, n_trials=10)

    best_params_rf = study_rf.best_params

    mlflow.log_params(best_params_rf)

    mlflow.set_tags({
        "project": "NYC Taxi Time Prediction Project",
        "optimizer_engine": "optuna",
        "model_family": "random_forest",
        "feature_set_version": 1,
    })

    # Refit with the best suggested values plus the same fixed settings
    # (n_jobs, random_state) that every trial used.
    final_model = RandomForestRegressor(**best_params_rf, n_jobs=-1, random_state=42)
    final_model.fit(X_train, y_train)
    y_pred = final_model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer next to the model so that inference can
    # rebuild the same feature columns.
    pathlib.Path("preprocessor").mkdir(exist_ok=True)
    with open("preprocessor/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("preprocessor/preprocessor.b", artifact_path="preprocessor")

    # Dense, column-named example of the first five validation rows, used for
    # both the model signature and the stored input example.
    feature_names = dv.get_feature_names_out()
    input_example = pd.DataFrame(X_val[:5].toarray(), columns=feature_names)
    signature = infer_signature(input_example, y_val[:5])

    mlflow.sklearn.log_model(final_model, "model", input_example=input_example, signature=signature)

[I 2025-10-29 17:50:34,549] A new study created in memory with name: no-name-a8b08761-37e4-4ad8-aa28-a30faf17b879


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:50:52 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:50:57,261] Trial 0 finished with value: 7.192333875814888 and parameters: {'n_estimators': 49, 'max_depth': 39, 'min_samples_split': 16, 'max_features': 'sqrt'}. Best is trial 0 with value: 7.192333875814888.


🏃 View run handsome-squid-71 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/4f8f5dcd5e3341e3b5aecaf9aa3e6040
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:51:36 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:51:47,163] Trial 1 finished with value: 5.481438540048821 and parameters: {'n_estimators': 32, 'max_depth': 36, 'min_samples_split': 14, 'max_features': None}. Best is trial 1 with value: 5.481438540048821.


🏃 View run charming-pug-47 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/b6c324c8961249469b3515b7e9950abc
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:52:04 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:52:10,420] Trial 2 finished with value: 5.580631009827177 and parameters: {'n_estimators': 72, 'max_depth': 12, 'min_samples_split': 7, 'max_features': None}. Best is trial 1 with value: 5.481438540048821.


🏃 View run skillful-elk-543 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/6843faba5b4147828aed134369c1c48f
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:52:27 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:52:32,728] Trial 3 finished with value: 5.553659267170364 and parameters: {'n_estimators': 52, 'max_depth': 15, 'min_samples_split': 14, 'max_features': None}. Best is trial 1 with value: 5.481438540048821.


🏃 View run worried-penguin-719 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/5163cb8f7a824e649feb0f3d5635d49c
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:52:47 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:52:51,356] Trial 4 finished with value: 8.502400742570257 and parameters: {'n_estimators': 53, 'max_depth': 33, 'min_samples_split': 8, 'max_features': 'log2'}. Best is trial 1 with value: 5.481438540048821.


🏃 View run able-rat-204 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/d0fa224e3e3849af8c06e21188efd38d
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:53:04 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:53:07,152] Trial 5 finished with value: 8.822185495406224 and parameters: {'n_estimators': 60, 'max_depth': 11, 'min_samples_split': 6, 'max_features': 'log2'}. Best is trial 1 with value: 5.481438540048821.


🏃 View run adventurous-ant-505 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/4ea3e8bbf34a4e179cd0e2f34cc0627e
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:53:20 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:53:23,327] Trial 6 finished with value: 5.618797622479943 and parameters: {'n_estimators': 45, 'max_depth': 8, 'min_samples_split': 15, 'max_features': None}. Best is trial 1 with value: 5.481438540048821.


🏃 View run sassy-foal-640 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/b2a0a717f14e4641a723cef20f5d228a
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:53:37 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:53:41,925] Trial 7 finished with value: 7.1585742399315935 and parameters: {'n_estimators': 31, 'max_depth': 37, 'min_samples_split': 9, 'max_features': 'sqrt'}. Best is trial 1 with value: 5.481438540048821.


🏃 View run classy-tern-873 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/5930b6a4f2ea482fadaa811502034eb8
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:53:55 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:53:57,488] Trial 8 finished with value: 8.811958929776562 and parameters: {'n_estimators': 57, 'max_depth': 11, 'min_samples_split': 20, 'max_features': 'log2'}. Best is trial 1 with value: 5.481438540048821.


🏃 View run secretive-donkey-396 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/d713c337dc79450395c4e312a2156e73
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:54:20 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.
[I 2025-10-29 17:54:42,608] Trial 9 finished with value: 5.489059947767261 and parameters: {'n_estimators': 60, 'max_depth': 38, 'min_samples_split': 6, 'max_features': None}. Best is trial 1 with value: 5.481438540048821.


🏃 View run wise-snake-768 at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/dae65bf704e948cbbd83db373afc820a
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042




Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025/10/29 17:55:08 INFO mlflow.models.model: Found the following environment variables used during model inference: [DATABRICKS_HOST, DATABRICKS_TOKEN]. Please check if you need to set them when deploying the model. To disable this message, set environment variable `MLFLOW_RECORD_ENV_VARS_IN_MODEL_LOGGING` to `false`.


🏃 View run RandomForest Hyperparameter Optimization (Optuna) at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042/runs/875f5296d77a4fdcb24212b19fd5cb41
🧪 View experiment at: https://dbc-35a009eb-7ad0.cloud.databricks.com/ml/experiments/3999049734856042


## registro

In [232]:
# Name under which the selected model is registered.
# NOTE(review): the three dot-separated parts look like catalog.schema.model — confirm.
model_name = "workspace.default.nyc-taxi-model"

In [233]:
# Find the run with the lowest logged "rmse" in the experiment.
# - filter_string keeps only runs that actually logged an rmse metric.
# - max_results=1 avoids downloading every run when only the best is needed.
# NOTE: the search spans every run stored in the experiment, not only the runs
# created by this notebook session.
runs = mlflow.search_runs(
    experiment_names=[EXPERIMENT_NAME],
    filter_string="metrics.rmse >= 0",
    order_by=["metrics.rmse ASC"],
    max_results=1,
    output_format="list",
)

# Stop here when nothing matches: the following cells need `best_run`, and
# merely printing a warning would let them fail later with a NameError or
# silently reuse a stale `best_run` left in the kernel.
if not runs:
    raise RuntimeError("⚠️ No se encontraron runs con métrica RMSE.")

# Get the best run
best_run = runs[0]
print("🏆 Champion Run encontrado:")
print(f"Run ID: {best_run.info.run_id}")
print(f"Validation RMSE: {best_run.data.metrics.get('rmse')}")
print(f"Params: {best_run.data.params}")

🏆 Champion Run encontrado:
Run ID: b1e94feef25e4e2fa49cc01e302b3efb
Validation RMSE: 5.350864717798953
Params: {'custom_metric': 'None', 'early_stopping_rounds': '10', 'learning_rate': '0.07565903471570516', 'max_depth': '63', 'maximize': 'None', 'min_child_weight': '2.2802382585441565', 'num_boost_round': '100', 'objective': 'reg:squarederror', 'reg_alpha': '0.021678779375600917', 'reg_lambda': '0.015480241912324163', 'seed': '42', 'verbose_eval': 'True'}


In [234]:
# Keep the ID of the selected run.
# NOTE(review): run_id is not referenced again in the visible cells.
run_id = best_run.info.run_id

In [235]:
# Register the "model" artifact of the selected run under model_name; `result`
# carries the version that was created (read later through result.version).
result = mlflow.register_model(
    model_uri=f"runs:/{best_run.info.run_id}/model",
    name=model_name
)


Registered model 'workspace.default.nyc-taxi-model' already exists. Creating a new version of this model...


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

Created version '13' of model 'workspace.default.nyc-taxi-model'.


## alias

In [236]:
from mlflow import MlflowClient

# Client used below for the registry operations (aliases, version metadata).
client = MlflowClient()

In [237]:
# Point the "Challenger" alias at the version that was just registered.
new_alias = "Challenger"
model_version = result.version

client.set_registered_model_alias(name=model_name, alias=new_alias, version=model_version)

In [238]:
from datetime import datetime

# Timestamp embedded in the description text below.
date = datetime.today()

# Record on the model version which alias it received and when. As the last
# expression of the cell, the call's return value is displayed as cell output.
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=[], creation_timestamp=1761782155235, current_stage=None, deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description=('The model version 13 was transitioned to Challenger on 2025-10-29 '
 '17:56:19.400383'), last_updated_timestamp=1761782179258, metrics=[<Metric: dataset_digest='', dataset_name='', key='best_iteration', model_id='m-5746a243e69c4ab996b82f3d19df9f3e', run_id='b1e94feef25e4e2fa49cc01e302b3efb', step=0, timestamp=1761709214676, value=96.0>,
 <Metric: dataset_digest='', dataset_name='', key='rmse', model_id='m-5746a243e69c4ab996b82f3d19df9f3e', run_id='b1e94feef25e4e2fa49cc01e302b3efb', step=0, timestamp=1761709232081, value=5.350864717798953>,
 <Metric: dataset_digest='', dataset_name='', key='stopped_iteration', model_id='m-5746a243e69c4ab996b82f3d19df9f3e', run_id='b1e94feef25e4e2fa49cc01e302

## Validación con datos de marzo

In [239]:
# March 2025 trips, read from the same relative data directory as the
# January / February files so the notebook does not depend on one machine's
# absolute path.
# NOTE: this rebinds df_val, which previously held the February frame.
df_val = read_dataframe("../data/green_tripdata_2025-03.parquet")

In [240]:
client = MlflowClient()

model_name = "workspace.default.nyc-taxi-model"  
# Get the model versions currently behind the 'Champion' and 'Challenger'
# aliases and print their version numbers and originating run IDs.
champ = client.get_model_version_by_alias(model_name, "Champion")
chall = client.get_model_version_by_alias(model_name, "Challenger")
print("Champion:", champ.version, "run_id:", champ.run_id)
print("Challenger:", chall.version, "run_id:", chall.run_id)

Champion: 12 run_id: b1e94feef25e4e2fa49cc01e302b3efb
Challenger: 13 run_id: b1e94feef25e4e2fa49cc01e302b3efb


In [241]:
# Cargar el preprocesador
with open("preprocessor/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

# Preprocesar el dataset de marzo
X_march = preprocess(df_val, dv)
y_march = df_val['duration'].values
X_march_arr = X_march.toarray()

# Cargar Champion
champion_uri = f"models:/{model_name}@Champion"
champion_model = mlflow.pyfunc.load_model(champion_uri)

# Predecir
champion_preds = champion_model.predict(X_march_arr)
champion_rmse = root_mean_squared_error(y_march, champion_preds)
print(f"Champion RMSE (marzo): {champion_rmse:.6f}")

Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]

  model.load_model(xgb_model_path)


Champion RMSE (marzo): 23.951918


In [None]:
# Promote the current Challenger to Champion.
# The version is looked up through the "Challenger" alias instead of being
# hard-coded, so the promotion always targets whatever version currently holds
# that alias.
client = MlflowClient()

model_name = "workspace.default.nyc-taxi-model"
challenger = client.get_model_version_by_alias(model_name, "Challenger")

client.set_registered_model_alias(
    name=model_name,
    alias="Champion",
    version=challenger.version
)