In [1]:
import requests
import datetime
import pandas as pd
import os

from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnQuantileMetric
from evidently.report import Report
from evidently.test_suite import TestSuite
from evidently.tests import TestColumnDrift

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
files = [('green_tripdata_2024-03.parquet', './data'),]

print("Download files:")
for file, path in files:
    # Create the directory if it doesn't exist
    os.makedirs(path, exist_ok=True)
    
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp = requests.get(url, stream=True)
    save_path = f"{path}/{file}"
    
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                         desc=f"{file}",
                         postfix=f"save to {save_path}",
                         total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2024-03.parquet: 100%|███████████████████████████████| 1372372/1372372 [00:07<00:00, 191253.88it/s, save to ./data/green_tripdata_2024-03.parquet]


In [4]:
march_data = pd.read_parquet('data/green_tripdata_2022-03.parquet')

In [5]:
march_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,78537.0,78537,78537,70472.0,78537.0,78537.0,70472.0,78537.0,78537.0,78537.0,78537.0,78537.0,78537.0,78537.0,78537.0,70472.0,70471.0,70472.0
mean,1.853037,2022-03-16 16:32:03.594598,2022-03-16 16:52:15.559965,1.192814,99.884882,137.069254,1.297267,66.594598,14.423273,0.363895,0.426348,1.892951,0.236332,0.297383,18.279519,1.387203,1.040556,0.710459
min,1.0,2022-02-07 14:40:24,2022-02-07 15:04:58,1.0,1.0,1.0,0.0,0.0,-115.0,-4.5,-0.5,-0.86,0.0,-0.3,-115.3,1.0,1.0,-2.75
25%,2.0,2022-03-08 19:23:11,2022-03-08 19:46:24,1.0,65.0,74.0,1.0,1.12,7.5,0.0,0.5,0.0,0.0,0.3,9.8,1.0,1.0,0.0
50%,2.0,2022-03-16 18:23:05,2022-03-16 18:41:20,1.0,75.0,138.0,1.0,1.96,11.0,0.0,0.5,1.22,0.0,0.3,14.3,1.0,1.0,0.0
75%,2.0,2022-03-24 14:54:54,2022-03-24 15:13:00,1.0,133.0,215.0,1.0,3.69,17.39,0.5,0.5,2.81,0.0,0.3,21.86,2.0,1.0,2.75
max,5.0,2022-04-01 00:01:54,2022-04-01 16:17:27,99.0,265.0,265.0,9.0,210531.77,500.0,4.5,3.55,450.0,36.6,0.3,500.0,5.0,2.0,2.75
std,0.354287,,,0.920412,63.901169,76.648979,0.948157,2656.980656,12.900992,0.658673,0.181418,3.802783,1.293668,0.035424,14.94798,0.508214,0.19726,1.203958


In [6]:
march_data.shape

(78537, 20)

Q2

In [7]:
data = pd.read_parquet('./data/green_tripdata_2024-03.parquet')

data = data.dropna(axis=1, how='all')


for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = data[column].fillna('Unknown')
    else:
        data[column] = data[column].fillna(data[column].mean())


split_point = int(len(data) * 0.7)
ref_data = data.iloc[:split_point].copy()
cur_data = data.iloc[split_point:].copy()

report = Report(metrics=[
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    ColumnDriftMetric(column_name="fare_amount"),
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
    ColumnDistributionMetric(column_name="fare_amount")
])

report.run(reference_data=ref_data, current_data=cur_data)

report.save_html("report.html")

NameError: name 'ColumnDistributionMetric' is not defined

Q3

In [3]:
import pandas as pd

# Load the data
data = pd.read_parquet('./data/green_tripdata_2024-03.parquet')

# Convert the datetime column to datetime type if it's not already
data['datetime'] = pd.to_datetime(data['lpep_pickup_datetime'])

# Filter for March 2024 data only
march_2024_data = data[(data['datetime'].dt.year == 2024) & (data['datetime'].dt.month == 3)]

# Group the data by day and calculate the daily median fare
daily_medians = march_2024_data.groupby(march_2024_data['datetime'].dt.date)['fare_amount'].median()

# Find the maximum daily median
max_daily_median = daily_medians.max()

print(f"The maximum daily median fare amount in March 2024 is: {max_daily_median:.1f}")

# Print all daily medians for March 2024
print("\nDaily medians for March 2024:")
print(daily_medians)

The maximum daily median fare amount in March 2024 is: 14.2

Daily medians for March 2024:
datetime
2024-03-01    13.5
2024-03-02    13.5
2024-03-03    14.2
2024-03-04    12.8
2024-03-05    13.5
2024-03-06    12.8
2024-03-07    13.5
2024-03-08    13.5
2024-03-09    13.5
2024-03-10    14.2
2024-03-11    12.8
2024-03-12    13.5
2024-03-13    13.5
2024-03-14    14.2
2024-03-15    13.5
2024-03-16    14.2
2024-03-17    13.5
2024-03-18    13.5
2024-03-19    13.5
2024-03-20    12.8
2024-03-21    13.5
2024-03-22    13.5
2024-03-23    12.8
2024-03-24    14.2
2024-03-25    13.5
2024-03-26    13.5
2024-03-27    13.5
2024-03-28    13.5
2024-03-29    13.5
2024-03-30    14.2
2024-03-31    13.5
Name: fare_amount, dtype: float64


Q4