In [1]:
import requests
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnQuantileMetric, ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
files = [("green_tripdata_2024-03.parquet", "./data")]
print("Download files")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp=requests.get(url, stream=True)
    save_path=f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=f"{file}",
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

In [2]:
mar_data = pd.read_parquet('./data/green_tripdata_2024-03.parquet')

### Question 1

In [3]:
mar_data.shape

(57457, 20)

In [4]:
mar_data

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2024-03-01 00:10:52,2024-03-01 00:26:12,N,1.0,129,226,1.0,1.72,12.80,1.0,0.5,3.06,0.00,,1.0,18.36,1.0,1.0,0.00
1,2,2024-03-01 00:22:21,2024-03-01 00:35:15,N,1.0,130,218,1.0,3.25,17.70,1.0,0.5,0.00,0.00,,1.0,20.20,2.0,1.0,0.00
2,2,2024-03-01 00:45:27,2024-03-01 01:04:32,N,1.0,255,107,2.0,4.58,23.30,1.0,0.5,3.50,0.00,,1.0,32.05,1.0,1.0,2.75
3,1,2024-03-01 00:02:00,2024-03-01 00:23:45,N,1.0,181,71,1.0,0.00,22.50,0.0,1.5,0.00,0.00,,1.0,24.00,1.0,1.0,0.00
4,2,2024-03-01 00:16:45,2024-03-01 00:23:25,N,1.0,95,135,1.0,1.15,8.60,1.0,0.5,1.00,0.00,,1.0,12.10,1.0,1.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57452,2,2024-03-31 21:19:00,2024-03-31 21:30:00,,,25,61,,1.45,12.08,0.0,0.0,2.52,0.00,,1.0,15.60,,,
57453,2,2024-03-31 22:30:00,2024-03-31 22:35:00,,,41,42,,1.13,12.24,0.0,0.0,0.00,0.00,,1.0,13.24,,,
57454,2,2024-03-31 22:43:00,2024-03-31 22:48:00,,,223,7,,13062.08,12.08,0.0,0.0,3.77,0.00,,1.0,16.85,,,
57455,2,2024-03-31 22:48:00,2024-03-31 23:12:00,,,42,249,,7.96,40.52,0.0,0.0,8.75,0.00,,1.0,53.02,,,


In [5]:
# Create target
mar_data["duration_min"] = mar_data.lpep_dropoff_datetime - mar_data.lpep_pickup_datetime
mar_data.duration_min = mar_data.duration_min.apply(lambda td : float(td.total_seconds())/60)

In [6]:
# Filter out the outliers
mar_data = mar_data[(mar_data.duration_min >=0) & (mar_data.duration_min <=60)]
jan_data = mar_data[(mar_data.passenger_count >0) & (mar_data.passenger_count <=8)]

In [7]:
# Data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

In [8]:
train_data = mar_data[:30000]
val_data = mar_data[30000:]

In [9]:
model = LinearRegression()

In [10]:
model.fit(train_data[num_features + cat_features], train_data[target])

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
train_preds = model.predict(train_data[num_features + cat_features].fillna(0))
train_data['prediction'] = train_preds

In [12]:
val_preds = model.predict(val_data[num_features + cat_features].fillna(0))
val_data['prediction'] = val_preds

In [13]:
print(mean_absolute_error(train_data.duration_min, train_data.prediction))
print(mean_absolute_error(val_data.duration_min, val_data.prediction))

3.7814743539289593
28.48404707338729


### Question 2

In [14]:
column_mapping = ColumnMapping(
    target=None,
    prediction='prediction',
    numerical_features=num_features,
    categorical_features=cat_features
)

In [15]:
report = Report(metrics=[
    ColumnQuantileMetric(column_name='fare_amount', quantile=0.5)
]
)

In [16]:
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [17]:
result = report.as_dict()

In [32]:
result['metrics'][0]['result']['current']['value']

np.float64(13.5)

In [None]:
report.show(mode='inline')