# Q1. Install MLflow

In [1]:
# Q1: print the version of the installed MLflow command-line tool
! mlflow --version

mlflow, version 2.13.0


# Q2. Download and preprocess the data

In [2]:
links = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet"
]

# download data to data subdirectory replace if exists
for link in links:
    ! wget -N -P data $link

--2024-05-24 14:46:44--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.161.3.141, 18.161.3.209, 18.161.3.119, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.161.3.141|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘data/green_tripdata_2023-01.parquet’ not modified on server. Omitting download.

--2024-05-24 14:46:45--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.161.3.209, 18.161.3.141, 18.161.3.58, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.161.3.209|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘data/green_tripdata_2023-02.parquet’ not modified on server. Omitting download.

--2024-05-24 14:46:46--  https://d37ci6vzurychx

In [3]:
# download preprocess_data.py from the course repository
# NOTE(review): the captured output below reports "Last-modified header missing",
# so -N cannot skip the download here and the script is fetched again on every run
! wget -N https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/preprocess_data.py

--2024-05-24 14:46:49--  https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/preprocess_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2510 (2.5K) [text/plain]
Saving to: ‘preprocess_data.py’


Last-modified header missing -- time-stamps turned off.
2024-05-24 14:46:49 (38.5 MB/s) - ‘preprocess_data.py’ saved [2510/2510]



In [4]:
# run the preprocessing script with ./data as the raw-data path and ./output as
# the destination path (the next cell counts the files found in ./output)
! python preprocess_data.py --raw_data_path data --dest_path ./output

In [5]:
# Q2
# count the regular files (sub-directories are not counted) in the output directory
import os

output_dir = './output'
num_files = sum(
    os.path.isfile(os.path.join(output_dir, entry))
    for entry in os.listdir(output_dir)
)
print(f"Number of files in output directory: {num_files}")

Number of files in output directory: 4


# Q3. Train a model with autolog

In [6]:
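# download train.py (left commented out; presumably so the modified train.py run in the next cell is not overwritten - confirm before re-enabling)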
# ! wget -N https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/train.py

In [7]:
# run the locally modified train.py as a script; the captured output below
# shows MLflow creating and migrating its SQLite tables
! python train.py

2024/05/24 14:47:06 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/05/24 14:47:06 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [10]:
# Q3
# Read the min_samples_split parameter logged by the most recently finished run.
import mlflow
import pandas as pd

# search_runs() returns one DataFrame row per run (not per experiment).
runs = mlflow.search_runs()

# Keep only completed runs; fail with an actionable message instead of the
# bare IndexError that indexing an empty result would raise.
finished_runs = runs[runs['status'] == 'FINISHED']
if finished_runs.empty:
    raise RuntimeError(
        "No FINISHED MLflow runs found - run train.py first and check that the "
        "notebook uses the same tracking URI and experiment as the script."
    )

# Most recent finished run (kept as a one-row DataFrame).
last_run = finished_runs.sort_values(by='end_time', ascending=False).head(1)

# Get run_id
run_id = last_run['run_id'].iloc[0]

# Fetch the full run record and its logged parameters (a dict of name -> string value).
run = mlflow.get_run(run_id)
params = run.data.params

# Fail clearly if the parameter was never logged for this run.
if 'min_samples_split' not in params:
    raise KeyError(
        f"Run {run_id} has no 'min_samples_split' parameter; "
        f"logged params: {sorted(params)}"
    )

# Get min_samples_split
min_samples_split = params['min_samples_split']
print(f"min_samples_split: {min_samples_split}")

min_samples_split: 2


# Q4. Launch the tracking server locally

Launch MLflow locally, using a `hw2.sqlite` database as the backend store and
a folder called `artifacts` as the artifact store:
```bash
mlflow ui --backend-store-uri sqlite:///hw2.sqlite --artifacts-destination ./artifacts
```
References:
https://stackoverflow.com/questions/75057477/mlflow-server-difference-between-default-artifact-root-and-artifacts
https://mlflow.org/docs/latest/cli.html#mlflow-artifacts

In [11]:
# ! mlflow ui --backend-store-uri sqlite:///hw2.sqlite --artifacts-destination ./artifacts

In [12]:
# Q4 answer: artifacts-destination

# Q5. Tune model hyperparameters

In [13]:
# download hpo.py
# ! wget -N https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/hpo.py

In [15]:
# Q5: run the hyperparameter search script; the progress bar in the output
# below reports the best loss over the 15 trials
! python hpo.py

100%|██████████| 15/15 [01:00<00:00,  4.01s/trial, best loss: 5.335419588556921]


# Q6. Promote the best model to the model registry

In [21]:
# download register_model.py
# ! wget -N https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/register_model.py

In [16]:
# Q6: run the model-registration script; the output below reports the test RMSE
# and the creation of version 1 of the registered model 'best_run_model'
! python register_model.py

2024/05/24 14:54:14 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-model' does not exist. Creating a new experiment.
Model ID: d54dd1936e5740cea9822d52fa1fb343 | Test RMSE: 5.567
Successfully registered model 'best_run_model'.
2024/05/24 14:54:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: best_run_model, version 1
Created version '1' of model 'best_run_model'.
