Q2

In [15]:
import os
import requests
import pandas as pd

# Create a directory to store the data (pure Python, so it does not depend on a
# POSIX `mkdir -p` being available in the shell)
os.makedirs("taxi_data", exist_ok=True)

urls = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet"
]

# Download the files
for url in urls:
    filename = url.split('/')[-1]
    # Stream the body to disk in chunks instead of buffering the whole file in
    # memory; the timeout keeps a stalled connection from hanging the cell.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx so an error page is never saved as ".parquet".
        response.raise_for_status()
        with open(f"taxi_data/{filename}", "wb") as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    print(f"Downloaded {filename}")

# Run the preprocessing script
!python preprocess_data.py --raw_data_path ./taxi_data --dest_path ./output

# Check the contents
output_files = os.listdir('./output')
print(f"Files in output folder: {output_files}")
print(f"Number of files in output folder: {len(output_files)}")

Downloaded green_tripdata_2023-01.parquet
Downloaded green_tripdata_2023-02.parquet
Downloaded green_tripdata_2023-03.parquet
Files in output folder: ['dv.pkl', 'train.pkl', 'test.pkl', 'val.pkl']
Number of files in output folder: 4


Q3

In [4]:
!pip install mlflow





In [None]:
!python train.py

import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Get experiments
experiments = client.search_experiments()

for exp in experiments:
    print(f"Experiment ID: {exp.experiment_id}, Name: {exp.name}")
    
    # Get runs for this experiment
    runs = client.search_runs(experiment_ids=[exp.experiment_id])
    
    if runs:
        latest_run = runs[0]
        if 'min_samples_split' in latest_run.data.params:
            min_samples_split = latest_run.data.params['min_samples_split']
            print(f"The value of min_samples_split is: {min_samples_split}")
            break
    else:
        print("No runs")

!mlflow ui

2024/11/03 13:13:18 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
Experiment ID: 247086600695916755, Name: random-forest-hyperopt
The value of min_samples_split is: 9
[2024-11-03 13:13:37 +0000] [12196] [INFO] Starting gunicorn 23.0.0
[2024-11-03 13:13:37 +0000] [12196] [INFO] Listening at: http://127.0.0.1:5000 (12196)
[2024-11-03 13:13:37 +0000] [12196] [INFO] Using worker: sync
[2024-11-03 13:13:37 +0000] [12197] [INFO] Booting worker with pid: 12197
[2024-11-03 13:13:37 +0000] [12198] [INFO] Booting worker with pid: 12198
[2024-11-03 13:13:37 +0000] [12199] [INFO] Booting worker with pid: 12199
[2024-11-03 13:13:37 +0000] [12200] [INFO] Booting worker with pid: 12200


Q5

In [5]:
# Explicit %pip magic: a bare `pip install ...` only works while IPython's
# automagic is enabled and no variable named `pip` exists in the namespace.
%pip install numpy scikit-learn hyperopt mlflow click

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 16.6 MB/s eta 0:00:01
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 80.0 MB/s eta 0:00:01


Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7
Note: you may need to restart the kernel to use updated packages.


In [12]:
# Run hpo.py with no command-line arguments, i.e. with whatever defaults the
# script defines. The captured log below shows it running 15 hyperopt trials and
# reporting runs to an MLflow tracking server at http://localhost:5002.
# NOTE(review): presumably that server must already be running — confirm.
!python hpo.py

INFO:__main__:Python version: 3.9.12 (main, Apr  5 2022, 06:56:58) 
[GCC 7.5.0]
INFO:__main__:MLflow version: 2.17.2
INFO:__main__:MLflow tracking URI: http://localhost:5002
INFO:__main__:Experiment 'random-forest-hyperopt' already exists with ID: 1
INFO:__main__:Data path: ./output
INFO:__main__:Number of trials: 15
  0%|                                    | 0/15 [00:00<?, ?trial/s, best loss=?]INFO:hyperopt.tpe:build_posterior_wrapper took 0.001029 seconds
INFO:hyperopt.tpe:TPE using 0 trials
INFO:__main__:RMSE: 5.370086069268862
2024/11/03 13:40:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run efficient-quail-74 at: http://localhost:5002/#/experiments/1/runs/1eab2c0ba0c04ab0971ad665805d040e.

2024/11/03 13:40:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5002/#/experiments/1.

  7%|▋          | 1/15 [00:07<01:43,  7.38s/trial, best loss: 5.370086069268862]INFO:hyperopt.tpe:build_posterior_wrapper took 0.001280 seconds
INFO:hyper

INFO:__main__:RMSE: 5.355041749098929
2024/11/03 13:41:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-slug-860 at: http://localhost:5002/#/experiments/1/runs/62fce371e9f24e028fdbf745fcd7dbce.

2024/11/03 13:41:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5002/#/experiments/1.

100%|██████████| 15/15 [00:58<00:00,  3.87s/trial, best loss: 5.335419588556921]
2024/11/03 13:41:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run worried-hare-633 at: http://localhost:5002/#/experiments/1/runs/f5c709f53f7d4c99ac1665db148a5e2f.
2024/11/03 13:41:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5002/#/experiments/1.
Best RMSE: 5.335419588556921


Q6

In [2]:
import subprocess

# Launch the MLflow tracking server as a child process and keep its handle in
# `server_process`; Popen returns immediately, so the cell does not block.
server_process = subprocess.Popen(
    [
        "mlflow", "server",
        "--backend-store-uri", "sqlite:///mlflow.db",
        "--default-artifact-root", "./artifacts",
        "--host", "0.0.0.0",
        "--port", "5001",
    ]
)

# Steps that need the tracking server go after this point.
# ...

# When finished, stop the server by uncommenting the call below.
#server_process.terminate()

[2024-11-03 17:37:40 +0000] [11457] [INFO] Starting gunicorn 23.0.0
[2024-11-03 17:37:40 +0000] [11457] [INFO] Listening at: http://0.0.0.0:5001 (11457)
[2024-11-03 17:37:40 +0000] [11457] [INFO] Using worker: sync
[2024-11-03 17:37:40 +0000] [11458] [INFO] Booting worker with pid: 11458
[2024-11-03 17:37:40 +0000] [11459] [INFO] Booting worker with pid: 11459
[2024-11-03 17:37:40 +0000] [11460] [INFO] Booting worker with pid: 11460
[2024-11-03 17:37:40 +0000] [11461] [INFO] Booting worker with pid: 11461


In [5]:
# Run register_model.py, pointing it at the ./output folder produced by the
# preprocessing step and passing --top_n 5.
# NOTE(review): the captured log shows runs recorded at http://127.0.0.1:5001,
# so the MLflow server on port 5001 presumably has to be running first — confirm.
!python register_model.py --data_path ./output --top_n 5

2024/11/03 17:41:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run rumbling-koi-597 at: http://127.0.0.1:5001/#/experiments/2/runs/a85032cafc814ba89803c9f61603a1b9.
2024/11/03 17:41:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/2.
2024/11/03 17:41:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run glamorous-sheep-972 at: http://127.0.0.1:5001/#/experiments/2/runs/72d38c1116af44b8b65a725acaf7775c.
2024/11/03 17:41:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/2.
2024/11/03 17:42:02 INFO mlflow.tracking._tracking_service.client: 🏃 View run unequaled-ox-631 at: http://127.0.0.1:5001/#/experiments/2/runs/ecc05384d7254cf1b4bea7ee3dad29c0.
2024/11/03 17:42:02 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5001/#/experiments/2.
2024/11/03 17:42:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run sneaky-