In [1]:
!pip install prefect

Collecting prefect
  Downloading prefect-3.4.13-py3-none-any.whl.metadata (13 kB)
Collecting aiosqlite<1.0.0,>=0.17.0 (from prefect)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting apprise<2.0.0,>=1.1.0 (from prefect)
  Downloading apprise-1.9.4-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.8/55.8 kB[0m [31m222.5 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting asgi-lifespan<3.0,>=1.0 (from prefect)
  Downloading asgi_lifespan-2.1.0-py3-none-any.whl.metadata (10 kB)
Collecting asyncpg<1.0.0,>=0.23 (from prefect)
  Downloading asyncpg-0.30.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Collecting click<8.2,>=8.0 (from prefect)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting coolname<3.0.0,>=1.0.4 (from prefect)
  Downloading coolname-2.2.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting cryptography>=36.0.1 (from prefect)
  Downlo

In [2]:
!prefect version

Version:             3.4.13
API version:         0.8.4
Python version:      3.12.3
Git commit:          2b0ea9be
Built:               Thu, Aug 14, 2025 09:07 PM
OS/Arch:             linux/x86_64
Profile:             ephemeral
Server type:         ephemeral
Pydantic version:    2.11.7
Server:
  Database:          sqlite
  SQLite version:    3.45.1


In [9]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import os

In [10]:
# Load the parquet file
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet"
df = pd.read_parquet(url)

print(f"Loaded {len(df):,} records")

Loaded 3,403,766 records


In [18]:
def prepare_data(df):
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df['duration'].dt.total_seconds() / 60
    
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy() 

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# 👇 Assign df_clean BEFORE printing
df_clean = prepare_data(df).copy()
print(f"Data after preparation: {len(df_clean):,} rows")

Data after preparation: 3,316,216 rows


In [12]:
## Feature Engineering and Split

In [32]:
from sklearn.model_selection import train_test_split

categorical = ['PULocationID', 'DOLocationID']
df_clean['target'] = df_clean['duration']

train_df, val_df = train_test_split(df_clean, test_size=0.2, random_state=42)

def df_to_dict(df):
    return df[categorical].to_dict(orient='records')

X_train_dict = df_to_dict(train_df)
X_val_dict = df_to_dict(val_df)

dv = DictVectorizer()
X_train = dv.fit_transform(X_train_dict)
X_val = dv.transform(X_val_dict)

y_train = train_df['target'].values
y_val = val_df['target'].values

In [14]:
## Train Model and Log Flow with mlflow

In [41]:
!lsof -i :5000

COMMAND  PID   USER   FD   TYPE DEVICE SIZE/OFF NODE NAME
python3 7624 syadav    5u  IPv4 344440      0t0  TCP localhost:5000 (LISTEN)
python3 7625 syadav    5u  IPv4 344440      0t0  TCP localhost:5000 (LISTEN)
python3 7625 syadav    9u  IPv4 344509      0t0  TCP localhost:5000->localhost:48430 (CLOSE_WAIT)
python3 7627 syadav    5u  IPv4 344440      0t0  TCP localhost:5000 (LISTEN)
python3 7628 syadav    5u  IPv4 344440      0t0  TCP localhost:5000 (LISTEN)
python3 7941 syadav    5u  IPv4 344440      0t0  TCP localhost:5000 (LISTEN)


In [40]:
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from mlflow.models.signature import infer_signature

mlflow.set_tracking_uri("sqlite:///mlflow.db.hw3")
mlflow.set_experiment("hw3-orchestration-nyc-taxi")

# End the current active run if exists
if mlflow.active_run() is not None:
    mlflow.end_run()  # End the existing active run

with mlflow.start_run():
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)

    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metric("rmse", rmse)

    signature = infer_signature(X_val, y_pred)
    input_example = X_val[:1]

    mlflow.sklearn.log_model(
        sk_model=lr,
        name="models",
        signature=signature,
        input_example=input_example
    )

    print(f"✅ RMSE: {rmse:.2f}")
    print(f"✅ Intercept: {lr.intercept_:.2f}")

✅ RMSE: 8.15
✅ Intercept: 24.75


In [25]:
## Save DictVectorizer and Register the Model

In [35]:
import pickle
import pathlib

output_dir = pathlib.Path("artifacts-hw3")
output_dir.mkdir(exist_ok=True)

# Save dict vectorizer
with open(output_dir / "dv.pkl", "wb") as f_out:
    pickle.dump(dv, f_out)

mlflow.log_artifact(str(output_dir / "dv.pkl"))

print("✅ DictVectorizer saved and logged.")

✅ DictVectorizer saved and logged.


In [27]:
## Find model size in MLmodel

In [36]:
import os

mlflow_dir = "mlruns"
for root, dirs, files in os.walk(mlflow_dir):
    for file in files:
        if file == "MLmodel":
            full_path = os.path.join(root, file)
            print("✅ Found MLmodel at:", full_path)

✅ Found MLmodel at: mlruns/1/models/m-78f95097bf23458296c6ac7f0cd0f42f/artifacts/MLmodel
✅ Found MLmodel at: mlruns/1/models/m-e7dcf5b6d8d44d94859afb0929399402/artifacts/MLmodel


In [37]:
print(f"✅ Final Answers Summary")
print(f"- Records loaded: {len(df):,}")
print(f"- Records after prep: {len(df_clean):,}")
print(f"- Intercept: {lr.intercept_:.2f}")
print(f"- RMSE: {rmse:.2f}")

✅ Final Answers Summary
- Records loaded: 3,403,766
- Records after prep: 3,316,216
- Intercept: 24.75
- RMSE: 8.15


In [31]:
# Load and read the MLmodel file to inspect the metadata
with open(mlmodel_path, "r") as f:
    mlmodel_content = f.read()
print("MLmodel Content:")
print(mlmodel_content)

# Find model siz
model_file_path = os.path.join(artifact_uri.replace("file://", ""), "model.pkl")  # Adjust depending on sklearn model file
model_size_bytes = os.path.getsize(model_file_path)
print(f"Model size in bytes: {model_size_bytes}")

Artifact URI: /mnt/c/Users/shubham_yadav/Documents/COXA-ENIT/lab/venv/MLOps Zoompcamp/mlruns/1/a94fdf70dc85499e9d2d55f0ec85671c/artifacts/models
MLmodel file path: mlruns


IsADirectoryError: [Errno 21] Is a directory: 'mlruns'

In [3]:
!pip install pathlib

Collecting pathlib
  Downloading pathlib-1.0.1-py3-none-any.whl.metadata (5.1 kB)
Downloading pathlib-1.0.1-py3-none-any.whl (14 kB)
Installing collected packages: pathlib
Successfully installed pathlib-1.0.1
