# Q1. Install MLflow

In [1]:
! pip install mlflow

Collecting mlflow
  Downloading mlflow-2.13.0-py3-none-any.whl.metadata (29 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow)
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting click<9,>=7.0 (from mlflow)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting cloudpickle<4 (from mlflow)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting entrypoints<1 (from mlflow)
  Downloading entrypoints-0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.wh

In [2]:
# Q1: print the version reported by the installed MLflow command-line tool.
! mlflow --version

mlflow, version 2.13.0


# Q2. Download and preprocess the data

In [3]:
# URLs of the three monthly green_tripdata parquet files (2023-01 through 2023-03).
links = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-03.parquet"
]

# Download each file into the TAXI_DATA_FOLDER subdirectory (-P sets the target
# directory). -N turns on wget timestamping: a file that already exists locally is
# only fetched again when the server copy is newer, so re-running this cell is cheap.
for link in links:
    ! wget -N -P TAXI_DATA_FOLDER $link

--2024-05-24 09:11:12--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.81, 3.160.203.53, 3.160.203.173, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.81|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘TAXI_DATA_FOLDER/green_tripdata_2023-01.parquet’ not modified on server. Omitting download.

--2024-05-24 09:11:14--  https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.160.203.53, 3.160.203.173, 3.160.203.184, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.160.203.53|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘TAXI_DATA_FOLDER/green_tripdata_2023-02.parquet’ not modified on server. Omitting download.

--2024-05-24 09:11:

In [4]:
# Download the homework's preprocess_data.py script from the mlops-zoomcamp
# repository into the current directory (-N requests wget timestamping).
! wget -N https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/preprocess_data.py

--2024-05-24 09:11:27--  https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/preprocess_data.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2510 (2.5K) [text/plain]
Saving to: ‘preprocess_data.py’


Last-modified header missing -- time-stamps turned off.
2024-05-24 09:11:27 (44.6 MB/s) - ‘preprocess_data.py’ saved [2510/2510]



In [5]:
# Run the preprocessing script, passing TAXI_DATA_FOLDER (where the parquet files
# were downloaded) as the raw-data path and ./output as the destination path.
! python preprocess_data.py --raw_data_path TAXI_DATA_FOLDER --dest_path ./output

In [6]:
# Q2: count the regular files (sub-directories excluded) directly inside ./output.
import os

output_dir = './output'
num_files = sum(
    1
    for entry in os.listdir(output_dir)
    if os.path.isfile(os.path.join(output_dir, entry))
)
num_files

4

# Q3. Train a model with autolog

In [7]:
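# Download train.py (currently disabled; presumably the file is already on disk).
# Uncomment the line below to fetch it: the last cell runs `python train.py` and needs it.
# ! wget -N https://raw.githubusercontent.com/DataTalksClub/mlops-zoomcamp/main/cohorts/2024/02-experiment-tracking/homework/train.py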

In [20]:
import os
import pickle
import click

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import mlflow
from mlflow.sklearn import autolog

In [21]:
# Data directory handed to run_train() below, which reads train.pkl and val.pkl from it.
default="./output"

In [22]:
def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the object stored in it."""
    # Binary mode is required: pickle streams are bytes, not text.
    with open(filename, mode="rb") as source:
        payload = pickle.load(source)
    return payload

In [27]:
def run_train(data_path: str):
    """Train a random-forest regressor and record the run in MLflow.

    Loads ``train.pkl`` and ``val.pkl`` from ``data_path`` (each is unpacked as an
    ``(X, y)`` pair), fits a ``RandomForestRegressor`` inside a new MLflow run with
    scikit-learn autologging switched on, and logs the validation RMSE under the
    metric name ``rmse``.

    Args:
        data_path: Directory that contains ``train.pkl`` and ``val.pkl``.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():
        # Autologging has to be enabled before fit() is called to capture it.
        autolog()
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Take the square root of the MSE explicitly rather than passing
        # `squared=False`: that argument is deprecated since scikit-learn 1.4 and
        # scheduled for removal in 1.6. For a single-target y the two are equivalent.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

In [28]:
# Train on the pickled data found in `default` and record the run with MLflow.
run_train(default)





In [10]:
# Run the homework's train.py script against the preprocessed data in ./output.
# NOTE(review): this cell's execution count (In [10]) is out of order relative to
# the cells above — confirm it still works on a fresh kernel with train.py on disk.
! python train.py --data_path ./output

