In [None]:
# Make the project root (the parent of the current working directory) importable,
# so that `src` can be imported from this notebook.
import os
import sys

current_folder = os.path.abspath("")
root_folder = os.path.dirname(current_folder)
if root_folder not in sys.path:
    sys.path.append(root_folder)

In [None]:
from datetime import datetime, timedelta
from pathlib import Path

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import (
    ColumnQuantileMetric,
    ColumnDriftMetric,
    DatasetDriftMetric,
    DatasetMissingValuesMetric,
    DatasetCorrelationsMetric,
)
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

from src import read_trips, save_model

In [None]:
DATA_FOLDER = Path("../data/")
MODEL_FOLDER = Path("../models/")

TARGET = "duration"
CATEGORICAL_COLS = ["PULocationID", "DOLocationID"]
NUMERICAL_COLS = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]


def process_trips(trips: pd.DataFrame) -> pd.DataFrame:
    """Derive the trip duration, drop outliers and keep only the model columns.

    The duration is the difference between ``lpep_dropoff_datetime`` and
    ``lpep_pickup_datetime`` expressed in minutes. Rows are kept only when the
    duration lies in [0, 60] minutes and ``passenger_count`` lies in (0, 8].

    Args:
        trips: Raw trips. Must contain the pickup/dropoff datetime columns and
            every column listed in NUMERICAL_COLS and CATEGORICAL_COLS.

    Returns:
        A new DataFrame with columns NUMERICAL_COLS + CATEGORICAL_COLS + [TARGET],
        keeping the original index labels of the retained rows. The input frame
        is left untouched.
    """
    # Vectorised timedelta -> minutes; unlike a row-wise apply this also yields a
    # float Series for an empty frame, so the comparisons below stay valid.
    duration = (
        trips["lpep_dropoff_datetime"] - trips["lpep_pickup_datetime"]
    ).dt.total_seconds() / 60

    valid_duration = duration.between(0, 60)
    valid_passenger_count = (trips["passenger_count"] > 0) & (
        trips["passenger_count"] <= 8
    )
    keep = valid_duration & valid_passenger_count

    # .loc + .assign build a fresh frame, so the caller's data is not mutated and
    # later column assignments on the result do not hit a chained-assignment slice.
    return trips.loc[keep, NUMERICAL_COLS + CATEGORICAL_COLS].assign(
        **{TARGET: duration.loc[keep]}
    )

# Prepare reference data and model

In [None]:
# Number of leading rows used for training; the remaining rows form the
# validation set that later serves as reference data for the report.
n_train_rows = 30000
feature_cols = NUMERICAL_COLS + CATEGORICAL_COLS

trips_data = read_trips(data_folder=DATA_FOLDER, color="green", year="2022", month="1")
trips_data = process_trips(trips_data)

# Copy the positional slices so that adding the "prediction" column below writes
# to independent frames rather than to slices of trips_data (chained assignment).
train_data = trips_data.iloc[:n_train_rows].copy()
val_data = trips_data.iloc[n_train_rows:].copy()

model = LinearRegression()
model.fit(train_data[feature_cols], train_data[TARGET])

train_data["prediction"] = model.predict(train_data[feature_cols])
val_data["prediction"] = model.predict(val_data[feature_cols])

In [None]:
# Write the validation split to parquet as the reference dataset, then pass the
# fitted model to save_model together with the target folder and file name.
reference_data_path = DATA_FOLDER / "reference_data.parquet"
val_data.to_parquet(reference_data_path)
save_model(MODEL_FOLDER, "model.pkl", model)

# Monitoring metrics calculated on the new data

In [None]:
# Selection of the trips file to monitor. The values are kept as strings because
# they are passed unchanged to read_trips.
color, year, month = "green", "2023", "3"

In [None]:
# New data
trips_data = read_trips(data_folder=DATA_FOLDER, color=color, year=year, month=month)
print(f"Number of rows: {trips_data.shape[0]}")

# Keep only pickups inside the requested month, using the half-open interval
# [first day of the month 00:00, first day of the next month 00:00).
# Both bounds are derived from `year` and `month`; the offset also rolls
# December over into January of the following year.
month_start = pd.Timestamp(year=int(year), month=int(month), day=1)
next_month_start = month_start + pd.DateOffset(months=1)

pickup_time = trips_data["lpep_pickup_datetime"]
trips_data = trips_data[
    (pickup_time >= month_start) & (pickup_time < next_month_start)
]

In [None]:
# Prepare Report
# The order of this list matters: the daily loop reads the fare_amount quantile
# from position 1 of the report's "metrics" output.
monitoring_metrics = [
    ColumnDriftMetric(column_name="prediction"),
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    DatasetCorrelationsMetric(),
]
report = Report(metrics=monitoring_metrics)

# No target column is mapped; only the model output and the feature columns are.
column_mapping = ColumnMapping(
    target=None,
    prediction="prediction",
    numerical_features=NUMERICAL_COLS,
    categorical_features=CATEGORICAL_COLS,
)

In [None]:
# Run the report once per pickup date and collect the daily fare_amount quantile.
quantile_metrics = []

# Compute the pickup dates once instead of re-deriving them on every iteration.
pickup_dates = trips_data["lpep_pickup_datetime"].dt.date
dates = sorted(pickup_dates.unique())

for date in dates:
    # Copy the daily subset so the "prediction" column is added to an independent
    # frame rather than to a slice of trips_data (chained assignment).
    trips_data_day = trips_data[pickup_dates == date].copy()

    # Missing feature values are replaced with 0 for prediction only; the stored
    # feature columns keep their missing values for the report.
    trips_data_day["prediction"] = model.predict(
        trips_data_day[NUMERICAL_COLS + CATEGORICAL_COLS].fillna(0)
    )
    report.run(
        reference_data=val_data,
        current_data=trips_data_day,
        column_mapping=column_mapping,
    )
    result = report.as_dict()
    # Index 1 is relied on to be the ColumnQuantileMetric entry (second item in
    # the report's metrics list).
    quantile_metrics.append(result["metrics"][1]["result"]["current"]["value"])

In [None]:
# Highest value among the collected per-day fare_amount quantiles.
largest_median_fare = np.max(quantile_metrics)
print(f"The largest median of fare amount: {largest_median_fare}")