In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import root_mean_squared_error
import mlflow
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
import os
import pickle

### Preparar datos y definir features

In [2]:
def read_dataframe(path):
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    return df

df_train = read_dataframe("../data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("../data/green_tripdata_2025-02.parquet")

df_train["PU_DO"] = df_train["PULocationID"] + "_" + df_train["DOLocationID"]
df_val["PU_DO"] = df_val["PULocationID"] + "_" + df_val["DOLocationID"]

categorical = ["PU_DO"]
numerical = ["trip_distance"]

dv = DictVectorizer()
X_train = dv.fit_transform(df_train[categorical + numerical].to_dict(orient="records"))
X_val = dv.transform(df_val[categorical + numerical].to_dict(orient="records"))

y_train = df_train["duration"].values
y_val = df_val["duration"].values

### Tracking con MLFlow

In [None]:
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

#  Use filesystem backend instead of SQLite to avoid migration issues
mlflow.set_tracking_uri("./mlruns")

# Crear o usar el experimento local
mlflow.set_experiment("class-nyc-taxi-experiment")

# Verificar configuración
print("Tracking URI:", mlflow.get_tracking_uri())
print("Experiment info:", mlflow.get_experiment_by_name("class-nyc-taxi-experiment"))

# Ejecutar un experimento con MLflow
with mlflow.start_run(run_name="lasso_alpha_0.1"):
    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    # Registrar resultados en MLflow
    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(model, "model")

    print(f"Run finalizado. RMSE = {rmse:.4f}")

### Clase 2: Segundo registro de experimentos

In [4]:
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment("nyc-taxi-model-registry-example")


<Experiment: artifact_location=('file:///c:/Users/samys/Documents/Ingeniería y Ciencia de Datos/Quinto '
 'Semestre/Proyecto Ciencia de '
 'Datos/Repositorios/nyc-taxi-predictions-2025/notebooks/mlruns/737742726769045930'), creation_time=1760662432626, experiment_id='737742726769045930', last_update_time=1760662432626, lifecycle_stage='active', name='nyc-taxi-model-registry-example', tags={}>

In [5]:
mlflow.sklearn.autolog()

In [6]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

### Nested Runs

In [7]:
models = [
    
    {"model": GradientBoostingRegressor,
     "params": {"n_estimators": 100, "learning_rate": 0.3, "max_depth": 25, "random_state": 42},
     },
    
    {"model": ExtraTreesRegressor,
     "params": {"n_estimators": 100, "max_depth": 15, "random_state": 42},
     },
    
    {"model": LinearSVR,
     "params": {"C": 1.0, "epsilon": 0}, 
     },

]

In [8]:
# Asegurarte de que mlruns esté dentro de notebooks
mlflow.set_tracking_uri("./mlruns")
mlflow.set_experiment("class-nyc-taxi-experiment")

with mlflow.start_run(run_name="Nested Runs"):
    for model in models:
        model_class = model["model"]
        model_name = model_class.__name__
        params = model["params"]

        with mlflow.start_run(run_name=model_name, nested=True):
            # Entrenamiento del modelo
            ml_model = model_class(**params)
            ml_model.fit(X_train, y_train)

            # Predicción y cálculo del RMSE
            y_pred = ml_model.predict(X_val)
            rmse = root_mean_squared_error(y_val, y_pred)
            mlflow.log_metric("rmse", rmse)

            # Crear carpeta 'models' dentro de notebooks si no existe
            os.makedirs("models", exist_ok=True)

            # Guardar el preprocesador (dv) en esa carpeta
            preproc_path = "models/preprocessor.b"
            with open(preproc_path, "wb") as f_out:
                pickle.dump(dv, f_out)

            # Registrar el preprocesador como artefacto en MLflow
            mlflow.log_artifact(preproc_path, artifact_path="preprocessor")

            print(f"Run '{model_name}' finalizado. RMSE = {rmse:.4f}")




Run 'GradientBoostingRegressor' finalizado. RMSE = 5.4560




Run 'ExtraTreesRegressor' finalizado. RMSE = 8.4630




Run 'LinearSVR' finalizado. RMSE = 510.8559


-------------------------------------------------------------------------------------------------------------

### Registrar modelos a través de código

In [22]:
from sklearn.ensemble import RandomForestRegressor

# Configuramos MLflow para que use una ruta corta en C:
mlflow.set_tracking_uri("file:///C:/mlruns")  # <-- ruta corta para artefactos

# Iniciamos un experimento (opcional, nombre corto también)
mlflow.set_experiment("nyc-taxi-experiment")

# Entrenamos el modelo
ml_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    random_state=42
)
ml_model.fit(X_train, y_train)

# Logueamos el modelo en MLflow usando un nombre corto
mlflow.sklearn.log_model(
    sk_model=ml_model,              # tu modelo entrenado
    artifact_path="model",           # carpeta dentro de mlruns
    registered_model_name="nyc_taxi-model" )

# Hacemos predicciones y calculamos RMSE
y_pred = ml_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE: {rmse:.4f}")


2025/10/16 20:52:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'afe69b28ebbc435983d97a74443e55ce', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
Successfully registered model 'nyc_taxi-model'.
Created version '1' of model 'nyc_taxi-model'.


RMSE: 5.5688


In [None]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

In [16]:
from mlflow import MlflowClient

client = MlflowClient(tracking_uri="file:///C:/mlruns")

In [None]:
client.create_registered_model(name="nyc-taxi-model")

In [18]:
run_id = input("Ingrese el run_id")
run_uri = f"runs:/{run_id}/model"

result = client.create_model_version(
    name="nyc-taxi-model",
    source=run_uri,
    run_id=run_id
)

## Asignar aliases y descripciones

In [None]:
from mlflow import MlflowClient

client = MlflowClient(tracking_uri="file:///C:/mlruns")

# create "champion" alias for version 1 of model "example-model"
client.set_registered_model_alias(
    name="nyc-taxi-model", 
    alias="champion",
    version=10
)

# set the "challenger" alias to version 2
client.set_registered_model_alias(
    name="nyc-taxi-model", 
    alias="challenger",
    version=7
)

In [None]:
# get a model version by alias
client.get_model_version_by_alias(
    name="nyc-taxi-model",
    alias="champion"
)

In [None]:
# delete the alias
client.delete_registered_model_alias(
    name="nyc-taxi-model", 
    alias="challenger"
)

In [None]:
client.update_model_version(
    name="nyc-taxi-model",
    version=1,
    description="This model version is a scikit-learn random forest containing 100 decision trees",
)

## Obtener modelos

In [None]:
# Por registros

import mlflow.pyfunc

model_name = "nyc-taxi-model"
model_version = 1

model_uri = f"models:/{model_name}/{model_version}"

model = mlflow.pyfunc.load_model(
    model_uri=model_uri,
)

model.predict(X_val)

In [None]:
# Por alias

import mlflow.pyfunc

model_name = "nyc-taxi-model"
alias = "champion"

model_uri = f"models:/{model_name}@{alias}"

champion_version = mlflow.pyfunc.load_model(
    model_uri=model_uri
)

champion_version.predict(X_val)

## Comparación de versiones y elección de un nuevo modelo champion

In [None]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    df['duration'] = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df


def preprocess(df, dv):
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    categorical = ['PU_DO']
    numerical = ['trip_distance']
    train_dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(train_dicts)


def test_model(name, alias, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}@{alias}")
    y_pred = model.predict(X_test)
    return {"rmse": root_mean_squared_error(y_test, y_pred)}

In [None]:
df = read_dataframe("data/green_tripdata_2025-03.parquet")

In [None]:
run_id = input("Ingrese el run_id")

client.download_artifacts(
    run_id=run_id, 
    path='preprocessor', 
    dst_path='.'
)