In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import root_mean_squared_error

### Preparar datos y definir features

In [2]:
def read_dataframe(path):
    """Load a trip-record parquet file and prepare it for modelling.

    Adds a ``duration`` column (``lpep_dropoff_datetime`` minus
    ``lpep_pickup_datetime``, expressed in minutes), keeps only the trips
    whose duration lies between 1 and 60 minutes inclusive, and casts the
    ``PULocationID`` / ``DOLocationID`` columns to strings.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, so callers can add or
        modify columns without touching the frame that was read from disk.
    """
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns on the
    # bare boolean-mask selection raises pandas' SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    location_cols = ["PULocationID", "DOLocationID"]
    df[location_cols] = df[location_cols].astype(str)
    return df

# Load the two monthly files: January for training, February for validation.
df_train = read_dataframe("../data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("../data/green_tripdata_2025-02.parquet")

# Combine pick-up and drop-off location IDs into one feature on both splits.
for frame in (df_train, df_val):
    frame["PU_DO"] = frame["PULocationID"] + "_" + frame["DOLocationID"]

categorical = ["PU_DO"]
numerical = ["trip_distance"]
feature_columns = categorical + numerical

# Fit the vectorizer on the training records only, then reuse the fitted
# vectorizer to transform the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(
    df_train[feature_columns].to_dict(orient="records")
)
X_val = dv.transform(
    df_val[feature_columns].to_dict(orient="records")
)

# Regression target: the ``duration`` column produced by read_dataframe.
y_train = df_train["duration"].to_numpy()
y_val = df_val["duration"].to_numpy()

### Tracking con MLflow

In [5]:
import mlflow
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import numpy as np

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``.

    NOTE(review): this definition shadows the ``root_mean_squared_error``
    imported from ``sklearn.metrics`` in the notebook's first cell; from this
    cell onwards the name refers to this local implementation.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Single source of truth for the experiment name, so the set/get calls below
# cannot get out of sync.
EXPERIMENT_NAME = "class-nyc-taxi-experiment"

# Track to the local file-based store under ./mlruns (relative to the working
# directory) instead of a SQLite backend, to avoid migration issues.
mlflow.set_tracking_uri("./mlruns")

# Create the local experiment, or reuse it if it already exists.
mlflow.set_experiment(EXPERIMENT_NAME)

# Sanity-check the configuration.
# NOTE: the Experiment repr includes the absolute artifact location on the
# local machine; clear this cell's output before sharing the notebook.
print("Tracking URI:", mlflow.get_tracking_uri())
print("Experiment info:", mlflow.get_experiment_by_name(EXPERIMENT_NAME))

# Lasso regularisation strength. The run name is derived from it so the logged
# parameter and the run label cannot drift apart.
alpha = 0.1

# Relies on X_train, y_train, X_val and y_val built in the data-preparation
# cell earlier in the notebook.
with mlflow.start_run(run_name=f"lasso_alpha_{alpha}"):
    mlflow.log_param("alpha", alpha)

    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    # Record the validation metric and the fitted model in MLflow.
    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(model, "model")

    print(f"Run finalizado. RMSE = {rmse:.4f}")


Tracking URI: ./mlruns
Experiment info: <Experiment: artifact_location=('file:///c:/Users/samys/Documents/Ingeniería y Ciencia de Datos/Quinto '
 'Semestre/Proyecto Ciencia de '
 'Datos/Repositorios/nyc-taxi-predictions-2025/notebooks/mlruns/564663416245393530'), creation_time=1760661099517, experiment_id='564663416245393530', last_update_time=1760661099517, lifecycle_stage='active', name='class-nyc-taxi-experiment', tags={}>




Run finalizado. RMSE = 8.9926
