In [54]:
!python -V

Python 3.13.9


In [55]:
!which python

/home/ubuntu/anaconda3/envs/.venv/bin/python


In [85]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

import mlflow
import os

# ----------------------------------------------------------------------
# 💡 CRITICAL: Ensure this matches your persistent MLflow server endpoint
# ----------------------------------------------------------------------
MLFLOW_TRACKING_URI = "http://127.0.0.1:5001" 
EXPERIMENT_NAME = "nyc-taxi-duration-prediction"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

print(f"MLflow Tracking URI set to: {mlflow.get_tracking_uri()}")
print(f"MLflow Experiment set to: {EXPERIMENT_NAME}")

2025/10/29 06:56:18 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-duration-prediction' does not exist. Creating a new experiment.


MLflow Tracking URI set to: http://127.0.0.1:5001
MLflow Experiment set to: nyc-taxi-duration-prediction


In [86]:
# Assuming your data is located here on the VM:
data_path_train = '~/notebooks/data/green_tripdata_2021-01.parquet'
data_path_val = '~/notebooks/data/green_tripdata_2021-02.parquet'

df_train = read_dataframe(data_path_train)
df_val = read_dataframe(data_path_val)

# Feature Engineering: Combine pickup and dropoff
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'

# Prepare feature matrices
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

print(f"Train matrix size: {X_train.shape[0]} rows, {X_train.shape[1]} features")
print(f"Validation matrix size: {X_val.shape[0]} rows, {X_val.shape[1]} features")

# Save DictVectorizer artifact locally for logging
with open('../models/dv.bin', 'wb') as f_out:
    pickle.dump(dv, f_out)

Train matrix size: 73908 rows, 13221 features
Validation matrix size: 61921 rows, 13221 features


In [87]:
# Assuming your data is located here on the VM:
data_path_train = '~/notebooks/data/green_tripdata_2021-01.parquet'
data_path_val = '~/notebooks/data/green_tripdata_2021-02.parquet'

df_train = read_dataframe(data_path_train)
df_val = read_dataframe(data_path_val)

# Feature Engineering: Combine pickup and dropoff
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

categorical = ['PU_DO']
numerical = ['trip_distance']
target = 'duration'

# Prepare feature matrices
dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

print(f"Train matrix size: {X_train.shape[0]} rows, {X_train.shape[1]} features")
print(f"Validation matrix size: {X_val.shape[0]} rows, {X_val.shape[1]} features")

# Save DictVectorizer artifact locally for logging
with open('../models/dv.bin', 'wb') as f_out:
    pickle.dump(dv, f_out)

Train matrix size: 73908 rows, 13221 features
Validation matrix size: 61921 rows, 13221 features


In [88]:
# Define Hyperparameters
ALPHA = 0.5
MODEL_NAME = "LassoRegression"

with mlflow.start_run(run_name=f"{MODEL_NAME}-alpha-{ALPHA}") as run:
    
    # --- MLflow Tags ---
    mlflow.set_tag("developer", "shirangi")
    mlflow.set_tag("model_type", MODEL_NAME)
    mlflow.set_tag("training_data_month", "2021-01")
    mlflow.set_tag("validation_data_month", "2021-02")

    # --- MLflow Parameters ---
    mlflow.log_param("alpha", ALPHA)
    mlflow.log_param("features", categorical + numerical)
    mlflow.log_param("target", target)
    # Log the full data paths for reproducibility
    mlflow.log_param("train-data-path", data_path_train) 
    mlflow.log_param("valid-data-path", data_path_val)

    # --- Training ---
    lr = Lasso(alpha=ALPHA)
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # --- MLflow Metrics ---
    mlflow.log_metric("rmse", rmse)
    
    # Log feature count (useful metric for sparse models)
    mlflow.log_metric("feature_count", X_train.shape[1])
    
    # --- MLflow Artifacts ---
    # Log the DictVectorizer as an artifact
    mlflow.log_artifact(local_path="../models/dv.bin", artifact_path="preprocessor")

    # Log the model using the native MLflow format
    # This automatically includes the model code, environment, and dependencies.
    mlflow.sklearn.log_model(
        sk_model=lr, 
        artifact_path="model",
        # Log training set metrics and parameters
        input_example=X_train[0].toarray()[0],
        registered_model_name=MODEL_NAME # This registers the model if using a database backend (Scenario 2/3)
    )
    
    print(f"Lasso RMSE: {rmse}")
    print(f"Run ID: {run.info.run_id}")

  "inputs": [
    0.0,
    0.0,
    0.0,
    0.0.... Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: Expected 2D array, got 1D array instead:
array=[0.   0.   0.   ... 0.   0.   1.01].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Successfully registered model 'LassoRegression'.
2025/10/29 06:56:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LassoRegression, version 1
Created version '1' of model 'LassoRegression'.


Lasso RMSE: 12.212582619743364
Run ID: 6e55c89511ce4dd29915c6ea1ec53035
🏃 View run LassoRegression-alpha-0.5 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/6e55c89511ce4dd29915c6ea1ec53035
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553


In [91]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from datetime import datetime

# Prepare DMatrix objects (assuming this part is correct from previous steps)
# train = xgb.DMatrix(X_train, label=y_train)
# valid = xgb.DMatrix(X_val, label=y_val)

def objective(params):
    with mlflow.start_run(run_name="XGBoost-Tuning-" + datetime.now().strftime("%H%M%S")):
        
        # --- MLflow Tags ---
        mlflow.set_tag("developer", "shirangi")
        mlflow.set_tag("model_type", "xgboost")
        mlflow.set_tag("optimization_algorithm", "Hyperopt TPE")
        
        # Log parameters *before* training
        mlflow.log_params(params)
        
        # --- Training ---
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        
        y_pred = booster.predict(valid)
        
        # 💡 CORRECTED LINE: Use np.sqrt() instead of unsupported squared=False
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        
        # --- MLflow Metrics ---
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("best_iteration", booster.best_iteration)
        
        # --- MLflow Artifacts ---
        # Log model using the native MLflow format
        mlflow.xgboost.log_model(
            xgb_model=booster, 
            artifact_path="model", 
            registered_model_name="XGBoostModel"
        )

    return {'loss': rmse, 'status': STATUS_OK}

# Define the search space (kept the same)
search_space = hp.choice('hyperparameters', [
    {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 10, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
        'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
        'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
        'objective': 'reg:linear', # Note: XGBoost will issue a deprecation warning, but it's common in this context
        'seed': 42
    }
])

In [None]:
# Run the hyperparameter optimization
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50, # Number of trials to run
    trials=Trials()
)

print("\nBest Hyperopt Parameters:")
print(best_result)

  0%|                                                                                                                                                                                                       | 0/50 [00:00<?, ?trial/s, best loss=?]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Successfully registered model 'XGBoostModel'.
2025/10/29 07:03:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 1

Created version '1' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070310 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/ac9bf39d509d446c9a260195d313d22d                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

  2%|███▍                                                                                                                                                                          | 1/50 [00:33<27:36, 33.80s/trial, best loss: 6.622337818536132]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:04:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 2

Created version '2' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070343 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/2e50b73d77654cf889ef8a2b9493c74f                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

  4%|██████▉                                                                                                                                                                       | 2/50 [01:05<26:17, 32.86s/trial, best loss: 6.607523656548425]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:04:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 3

Created version '3' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070416 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/04f3f2c253bc4b2cbe423a17aacd2f33                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

  6%|██████████▍                                                                                                                                                                   | 3/50 [01:44<27:46, 35.46s/trial, best loss: 6.607523656548425]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:05:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 4

Created version '4' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070454 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/0e56ea5df09c4e8cb15e237150109be7                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

  8%|█████████████▉                                                                                                                                                                | 4/50 [02:21<27:33, 35.94s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:06:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 5

Created version '5' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070531 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/f7dd5b8849624a168dec22cf5dd3ad80                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

 10%|█████████████████▍                                                                                                                                                            | 5/50 [02:58<27:11, 36.25s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:06:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 6

Created version '6' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070608 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/bd5d692d956f43758afeb1c8055f85d8                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

 12%|████████████████████▉                                                                                                                                                         | 6/50 [03:36<27:09, 37.04s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:07:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 7

Created version '7' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070646 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/008a1944006446b3a63e5e3c6c3b4208                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

 14%|████████████████████████▎                                                                                                                                                     | 7/50 [04:14<26:49, 37.43s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:07:56 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 8

Created version '8' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070724 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/96f87a51339e43d98865b0cc4851cbca                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

 16%|███████████████████████████▊                                                                                                                                                  | 8/50 [04:46<25:01, 35.74s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:08:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 9

Created version '9' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070757 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/5485c27e551e49efbcb31f02d38dfb34                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

 18%|███████████████████████████████▎                                                                                                                                              | 9/50 [05:22<24:25, 35.74s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()


  xgb_model.save_model(model_data_path)


Registered model 'XGBoostModel' already exists. Creating a new version of this model...
2025/10/29 07:09:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoostModel, version 10

Created version '10' of model 'XGBoostModel'.


🏃 View run XGBoost-Tuning-070832 at: http://127.0.0.1:5001/#/experiments/127727476320843553/runs/154be7b78a524ba7a6a511b25b68c563                                                                                                                 

🧪 View experiment at: http://127.0.0.1:5001/#/experiments/127727476320843553                                                                                                                                                                      

 20%|██████████████████████████████████▌                                                                                                                                          | 10/50 [05:57<23:40, 35.51s/trial, best loss: 6.329518011051609]

  self.starting_round = model.num_boosted_rounds()

