**Q1 --> Install MLflow**

In [2]:
# Print the version of the MLflow CLI available in the Poetry-managed environment.
!poetry run mlflow --version

mlflow, version 2.22.1


**Q2 --> Download and preprocess the data**

In [13]:
import os
import pandas as pd

# Root folder for every data artifact of this notebook (relative to the
# directory the kernel was started in).
DATA_DIR = f'{os.getcwd()}/data'

# Green-taxi trip files for January, February and March 2023.
parquet_files = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet"
]

# DataFrame.to_parquet does not create missing parent directories, so make
# sure the input folder exists before the first write (no-op when it does).
input_dir = f'{DATA_DIR}/input'
os.makedirs(input_dir, exist_ok=True)

for file in parquet_files:
    print(f'File to download -> {file}')
    parquet_name = os.path.basename(file)
    # pandas reads the remote parquet straight from the URL.
    df = pd.read_parquet(file)

    path = f'{input_dir}/{parquet_name}'
    df.to_parquet(path)
    print(f'File downloaded successfully in -> data/input/{parquet_name}')

File to download -> https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
File downloaded successfully in -> data/input/green_tripdata_2023-01.parquet
File to download -> https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
File downloaded successfully in -> data/input/green_tripdata_2023-02.parquet
File to download -> https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet
File downloaded successfully in -> data/input/green_tripdata_2023-03.parquet


In [18]:
# Folder the download cell writes the raw parquet files into.
TAXI_DATA_FOLDER = DATA_DIR + '/input'
# Folder handed to the scripts as the destination / data path.
DEST_PATH = DATA_DIR + '/output'

In [19]:
!poetry run python preprocess_data.py --raw_data_path {TAXI_DATA_FOLDER} --dest_path {DEST_PATH}

In [21]:
# Show what the preprocessing step produced. os.listdir returns entries in
# arbitrary order, so sort them to keep the displayed output stable across runs.
sorted(os.listdir(DEST_PATH))

['dv.pkl', 'test.pkl', 'train.pkl', 'val.pkl']

**Q3 --> Train a model with autolog**

In [23]:
# MLflow experiment name passed to train.py and used to look the run up afterwards.
EXPERIMENT_NAME = "homework-2-rf-regression"

In [25]:
!poetry run python train.py --data_path {DEST_PATH} --experiment_name {EXPERIMENT_NAME}

RMSE --> 5.431162180141208




In [38]:
import mlflow


# Resolve the experiment whose name was passed to train.py.
# get_experiment_by_name returns None for an unknown name, so fail with a
# readable message instead of an AttributeError on the next line.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(f"MLflow experiment '{EXPERIMENT_NAME}' was not found")
experiment_id = experiment.experiment_id

# Request newest-first explicitly so that the first row is the most recent
# run rather than depending on the default ordering.
runs_df = mlflow.search_runs(
    experiment_ids=[experiment_id],
    order_by=["attributes.start_time DESC"],
)
if runs_df.empty:
    raise ValueError(f"MLflow experiment '{EXPERIMENT_NAME}' has no runs")
# Positional access: does not depend on the frame's index labels.
run_id = runs_df.iloc[0]["run_id"]

# Fetch the full run record to read the logged parameters.
run = mlflow.get_run(run_id)
print(f"Min samples split parameter -> {run.data.params['min_samples_split']}")

Min samples split parameter -> 2


**Q4 --> Launch the tracking server locally**

In [None]:
# Start a local MLflow tracking server: SQLite file mlflow.db as backend store,
# ./artifacts as default artifact root, listening on port 5000.
# NOTE(review): this cell has no execution count; a server process presumably keeps
# the cell busy until interrupted — consider running it in a separate terminal.
# NOTE(review): a later cell sets the tracking URI to port 5001 while this command
# uses 5000 — confirm which port is intended.
!poetry run mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root file:./artifacts --port 5000

**Q5 --> Tune model hyperparameters**

In [43]:
!poetry run python hpo.py --data_path {DEST_PATH}


  0%|          | 0/15 [00:00<?, ?trial/s, best loss=?]
                                                      
🏃 View run popular-toad-709 at: http://127.0.0.1:5000/#/experiments/3/runs/1fea3372fa1149129382f43d0510b099


  0%|          | 0/15 [00:12<?, ?trial/s, best loss=?]
                                                      
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3


  0%|          | 0/15 [00:12<?, ?trial/s, best loss=?]
  7%|▋         | 1/15 [00:12<02:55, 12.54s/trial, best loss: 5.370086069268862]
                                                                               
🏃 View run intelligent-ray-551 at: http://127.0.0.1:5000/#/experiments/3/runs/41a3ce154cb14c609116a92a583ca5d4


  7%|▋         | 1/15 [00:13<02:55, 12.54s/trial, best loss: 5.370086069268862]
                                                                               
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/3


  7%|▋         | 1/15 [00:13<02:55, 12.54s/trial, best lo

  import pkg_resources
2025/06/10 00:23:41 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.


In [51]:
# Point the MLflow client of this kernel at the local tracking server.
# NOTE(review): the server-launch cell in this notebook uses --port 5000 but this
# URI targets port 5001 — confirm which port the server actually listens on.
mlflow.set_tracking_uri("http://127.0.0.1:5001")

In [76]:
# Experiment name taken from the hpo.py log output shown above.
hpo_experiment_name = 'random-forest-hyperopt'

# get_experiment_by_name returns None for an unknown name; surface that
# clearly instead of failing with an AttributeError below.
experiment = mlflow.get_experiment_by_name(hpo_experiment_name)
if experiment is None:
    raise ValueError(f"MLflow experiment '{hpo_experiment_name}' was not found")
experiment_id = experiment.experiment_id

runs_df = mlflow.search_runs(experiment_ids=[experiment_id])
# Without runs (or without any logged rmse) there is nothing to rank.
if runs_df.empty or 'metrics.rmse' not in runs_df.columns:
    raise ValueError(f"No runs with an 'rmse' metric in '{hpo_experiment_name}'")

# Lowest validation RMSE first; sort_values places missing metrics last.
best_run = runs_df.sort_values(by='metrics.rmse').head(1)
run_id = best_run['run_id'].iloc[0]
run = mlflow.get_run(run_id)

print(f'Parameters -> {run.data.params}')
print(f'RMSE calculated -> {run.data.metrics}')

Parameters -> {'max_depth': '19', 'min_samples_leaf': '2', 'min_samples_split': '2', 'n_estimators': '11', 'random_state': '42'}
RMSE calculated -> {'rmse': 5.335419588556921}


**Q6 --> Promote the best model to the model registry**

In [81]:
!poetry run python register_model.py --data_path {DEST_PATH}

🏃 View run gifted-moose-826 at: http://127.0.0.1:5001/#/experiments/4/runs/ccd86de70e6e48b7b18442c268bf4aa6
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/4
🏃 View run dapper-lark-591 at: http://127.0.0.1:5001/#/experiments/4/runs/d656e812f228403a8ff0a2f2c91867f9
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/4
🏃 View run handsome-snake-877 at: http://127.0.0.1:5001/#/experiments/4/runs/1c6cafef3d9148acb06dd421918100c3
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/4
🏃 View run thoughtful-hawk-667 at: http://127.0.0.1:5001/#/experiments/4/runs/30e9be00676946f99c4913324c8b173f
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/4
🏃 View run nosy-vole-536 at: http://127.0.0.1:5001/#/experiments/4/runs/903ab4b10a0842c6a6582efe1175b313
🧪 View experiment at: http://127.0.0.1:5001/#/experiments/4
Modelo registrado: random-forest-regressor, versión 1


2025/06/10 00:49:55 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'random-forest-regressor'.
2025/06/10 00:51:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random-forest-regressor, version 1
Created version '1' of model 'random-forest-regressor'.


In [87]:
# Experiment name taken from the register_model.py log output shown above.
best_models_experiment_name = 'random-forest-best-models'

# get_experiment_by_name returns None for an unknown name; surface that
# clearly instead of failing with an AttributeError below.
experiment = mlflow.get_experiment_by_name(best_models_experiment_name)
if experiment is None:
    raise ValueError(f"MLflow experiment '{best_models_experiment_name}' was not found")
experiment_id = experiment.experiment_id

runs_df = mlflow.search_runs(experiment_ids=[experiment_id])
# Without runs (or without any logged test_rmse) there is nothing to rank.
if runs_df.empty or 'metrics.test_rmse' not in runs_df.columns:
    raise ValueError(f"No runs with a 'test_rmse' metric in '{best_models_experiment_name}'")

# Lowest test RMSE first; sort_values places missing metrics last.
best_run = runs_df.sort_values(by='metrics.test_rmse').head(1)
run_id = best_run['run_id'].iloc[0]
run = mlflow.get_run(run_id)

print(f'Parameters -> {run.data.params}')
# Double quotes inside the single-quoted f-string: reusing the outer quote
# character is only valid syntax from Python 3.12 onwards.
print(f'RMSE calculated -> {run.data.metrics["test_rmse"]}')

Parameters -> {'bootstrap': 'True', 'ccp_alpha': '0.0', 'criterion': 'squared_error', 'max_depth': '19', 'max_features': '1.0', 'max_leaf_nodes': 'None', 'max_samples': 'None', 'min_impurity_decrease': '0.0', 'min_samples_leaf': '2', 'min_samples_split': '2', 'min_weight_fraction_leaf': '0.0', 'monotonic_cst': 'None', 'n_estimators': '11', 'n_jobs': 'None', 'oob_score': 'False', 'random_state': '42', 'verbose': '0', 'warm_start': 'False'}
RMSE calculated -> 5.567408012462019
