In [2]:
import requests
import datetime
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [4]:
files = [('green_tripdata_2023-03.parquet', './data'), ('green_tripdata_2023-01.parquet', './data')]

print("Download files:")
for file, path in files:
    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp=requests.get(url, stream=True)
    save_path=f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                        desc=f"{file}",
                        postfix=f"save to {save_path}",
                        total=int(resp.headers["Content-Length"])):
            handle.write(data)

Download files:


green_tripdata_2023-03.parquet:   1%|          | 10117/1730999 [00:00<00:17, 101157.07it/s, save to ./data/green_tripdata_2023-03.parquet]

green_tripdata_2023-03.parquet: 100%|██████████| 1730999/1730999 [00:16<00:00, 103905.42it/s, save to ./data/green_tripdata_2023-03.parquet]
green_tripdata_2023-01.parquet: 100%|██████████| 1427002/1427002 [00:13<00:00, 104464.84it/s, save to ./data/green_tripdata_2023-01.parquet]


In [6]:
march_data = pd.read_parquet('data/green_tripdata_2023-03.parquet')

In [7]:
march_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,72044.0,72044,72044,67507.0,72044.0,72044.0,67507.0,72044.0,72044.0,72044.0,72044.0,72044.0,72044.0,0.0,72044.0,72044.0,67507.0,67500.0,67507.0
mean,1.865443,2023-03-16 08:02:42.689842432,2023-03-16 08:20:56.525747968,1.145703,98.437413,137.613556,1.286844,12.449511,17.018203,0.878109,0.576929,2.157651,0.183268,,0.961604,22.29231,1.37368,1.028681,0.714837
min,1.0,2023-02-23 16:46:29,2023-02-23 16:55:25,1.0,1.0,1.0,0.0,0.0,-115.0,-5.0,-0.5,-1.1,0.0,,-1.0,-116.0,1.0,1.0,-2.75
25%,2.0,2023-03-08 13:55:49.500000,2023-03-08 14:15:59.750000128,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,12.98,1.0,1.0,0.0
50%,2.0,2023-03-16 08:05:27.500000,2023-03-16 08:22:27,1.0,75.0,138.0,1.0,1.84,13.5,0.0,0.5,1.6,0.0,,1.0,18.1,1.0,1.0,0.0
75%,2.0,2023-03-23 21:51:25.500000,2023-03-23 22:12:31.500000,1.0,129.0,216.0,1.0,3.3,20.0,1.0,0.5,3.36,0.0,,1.0,26.73,2.0,1.0,2.75
max,2.0,2023-04-01 00:09:27,2023-04-01 17:32:21,99.0,265.0,265.0,9.0,92064.68,477.0,12.5,4.25,270.27,36.05,,1.0,478.0,5.0,2.0,2.75
std,0.341252,,,1.225735,60.48187,76.169705,0.923652,641.094653,13.832399,1.297642,0.384129,3.136215,1.139159,,0.185185,15.852047,0.512307,0.166911,1.206435
