In [2]:
import requests
import datetime
import pandas as pd
import os

from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnQuantileMetric
from evidently.report import Report
from evidently.test_suite import TestSuite
from evidently.tests import TestColumnDrift

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [3]:
files = [('green_tripdata_2024-03.parquet', './data'),]

print("Download files:")
for file, path in files:
    # Create the directory if it doesn't exist
    os.makedirs(path, exist_ok=True)
    
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp = requests.get(url, stream=True)
    save_path = f"{path}/{file}"
    
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                         desc=f"{file}",
                         postfix=f"save to {save_path}",
                         total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2024-03.parquet: 100%|███████████████████████████████████████████████████████| 1372372/1372372 [00:10<00:00, 136178.06it/s, save to ./data/green_tripdata_2024-03.parquet]


In [4]:
march_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')

In [5]:
march_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,57457.0,57457,57457,55360.0,57457.0,57457.0,55360.0,57457.0,57457.0,57457.0,57457.0,57457.0,57457.0,0.0,57457.0,57457.0,55360.0,55353.0,55360.0
mean,1.877334,2024-03-16 04:02:52.405399,2024-03-16 04:21:00.076039,1.179986,95.524688,138.629149,1.309538,13.522828,17.313474,0.904472,0.57741,2.386255,0.192537,,0.979378,22.904832,1.321062,1.038047,0.73773
min,1.0,2008-12-31 23:02:24,2008-12-31 23:02:30,1.0,1.0,1.0,0.0,0.0,-295.08,-2.5,-0.5,-1.56,0.0,,-1.0,-296.08,1.0,1.0,-2.75
25%,2.0,2024-03-08 13:53:56,2024-03-08 14:13:49,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,13.44,1.0,1.0,0.0
50%,2.0,2024-03-15 22:49:01,2024-03-15 23:09:52,1.0,75.0,138.0,1.0,1.79,13.5,0.0,0.5,2.0,0.0,,1.0,18.5,1.0,1.0,0.0
75%,2.0,2024-03-23 20:11:25,2024-03-23 20:34:48,1.0,97.0,220.0,1.0,3.1,19.8,1.0,0.5,3.61,0.0,,1.0,27.05,2.0,1.0,2.75
max,2.0,2024-04-01 00:01:45,2024-04-01 16:11:00,99.0,265.0,265.0,9.0,125112.2,841.6,10.0,4.25,150.0,26.76,,1.0,856.98,5.0,2.0,2.75
std,0.328056,,,1.356719,57.285088,76.295346,0.967749,770.416255,14.958249,1.382446,0.366916,3.159273,1.184551,,0.154253,17.013735,0.497858,0.191311,1.218039


In [6]:
march_data.shape

(57457, 20)

Q2

In [7]:
data = pd.read_parquet('./data/green_tripdata_2024-03.parquet')

data = data.dropna(axis=1, how='all')


for column in data.columns:
    if data[column].dtype == 'object':
        data[column] = data[column].fillna('Unknown')
    else:
        data[column] = data[column].fillna(data[column].mean())


split_point = int(len(data) * 0.7)
ref_data = data.iloc[:split_point].copy()
cur_data = data.iloc[split_point:].copy()

report = Report(metrics=[
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    ColumnDriftMetric(column_name="fare_amount"),
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
    ColumnDistributionMetric(column_name="fare_amount")
])

report.run(reference_data=ref_data, current_data=cur_data)

report.save_html("report.html")

NameError: name 'ColumnDistributionMetric' is not defined

Q3

In [8]:
import pandas as pd

# Load the data
data = pd.read_parquet('./data/green_tripdata_2024-03.parquet')

# Convert the datetime column to datetime type if it's not already
data['datetime'] = pd.to_datetime(data['lpep_pickup_datetime'])

# Filter for March 2024 data only
march_2024_data = data[(data['datetime'].dt.year == 2024) & (data['datetime'].dt.month == 3)]

# Group the data by day and calculate the daily median fare
daily_medians = march_2024_data.groupby(march_2024_data['datetime'].dt.date)['fare_amount'].median()

# Find the maximum daily median
max_daily_median = daily_medians.max()

print(f"The maximum daily median fare amount in March 2024 is: {max_daily_median:.1f}")

# Print all daily medians for March 2024
print("\nDaily medians for March 2024:")
print(daily_medians)

The maximum daily median fare amount in March 2024 is: 14.2

Daily medians for March 2024:
datetime
2024-03-01    13.5
2024-03-02    13.5
2024-03-03    14.2
2024-03-04    12.8
2024-03-05    13.5
2024-03-06    12.8
2024-03-07    13.5
2024-03-08    13.5
2024-03-09    13.5
2024-03-10    14.2
2024-03-11    12.8
2024-03-12    13.5
2024-03-13    13.5
2024-03-14    14.2
2024-03-15    13.5
2024-03-16    14.2
2024-03-17    13.5
2024-03-18    13.5
2024-03-19    13.5
2024-03-20    12.8
2024-03-21    13.5
2024-03-22    13.5
2024-03-23    12.8
2024-03-24    14.2
2024-03-25    13.5
2024-03-26    13.5
2024-03-27    13.5
2024-03-28    13.5
2024-03-29    13.5
2024-03-30    14.2
2024-03-31    13.5
Name: fare_amount, dtype: float64


Q4