In [3]:


import requests
import datetime
import pandas as pd
import os


from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error



In [4]:
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, DatasetCorrelationsMetric, ColumnQuantileMetric


ImportError: cannot import name 'ColumnMapping' from 'evidently' (/home/sergio/.local/lib/python3.10/site-packages/evidently/__init__.py)

In [5]:
# (file name, destination directory) pairs to fetch from the NYC TLC trip-data CDN.
files = [('green_tripdata_2024-03.parquet', './data')]

print("Download files:")
for file, path in files:
    # Create the destination directory for this entry (no-op when it already exists).
    os.makedirs(path, exist_ok=True)

    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"

    # Stream the body so the file never has to fit in memory; the timeout keeps a
    # stalled connection from hanging the notebook, and the context manager
    # releases the connection when the download finishes or fails.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on HTTP errors instead of saving an error page as a .parquet file.
        resp.raise_for_status()

        # Content-Length may be absent (e.g. chunked transfer); tqdm accepts total=None.
        total_bytes = int(resp.headers.get("Content-Length", 0)) or None

        with open(save_path, "wb") as handle, tqdm(desc=f"{file}",
                                                   postfix=f"save to {save_path}",
                                                   total=total_bytes,
                                                   unit="B",
                                                   unit_scale=True) as progress:
            # iter_content() defaults to 1-byte chunks; 64 KiB chunks avoid
            # millions of Python-level loop iterations and tiny writes.
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

Download files:


green_tripdata_2024-03.parquet: 100%|██████████| 1372372/1372372 [00:05<00:00, 237445.01it/s, save to ./data/green_tripdata_2024-03.parquet]


In [6]:
# Load the downloaded green-taxi trip file (2024-03) into a DataFrame.
march_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')

In [7]:
# Summary statistics of the loaded frame; the bare expression is the cell's displayed output.
march_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,57457.0,57457,57457,55360.0,57457.0,57457.0,55360.0,57457.0,57457.0,57457.0,57457.0,57457.0,57457.0,0.0,57457.0,57457.0,55360.0,55353.0,55360.0
mean,1.877334,2024-03-16 04:02:52.405399,2024-03-16 04:21:00.076039,1.179986,95.524688,138.629149,1.309538,13.522828,17.313474,0.904472,0.57741,2.386255,0.192537,,0.979378,22.904832,1.321062,1.038047,0.73773
min,1.0,2008-12-31 23:02:24,2008-12-31 23:02:30,1.0,1.0,1.0,0.0,0.0,-295.08,-2.5,-0.5,-1.56,0.0,,-1.0,-296.08,1.0,1.0,-2.75
25%,2.0,2024-03-08 13:53:56,2024-03-08 14:13:49,1.0,74.0,74.0,1.0,1.1,9.3,0.0,0.5,0.0,0.0,,1.0,13.44,1.0,1.0,0.0
50%,2.0,2024-03-15 22:49:01,2024-03-15 23:09:52,1.0,75.0,138.0,1.0,1.79,13.5,0.0,0.5,2.0,0.0,,1.0,18.5,1.0,1.0,0.0
75%,2.0,2024-03-23 20:11:25,2024-03-23 20:34:48,1.0,97.0,220.0,1.0,3.1,19.8,1.0,0.5,3.61,0.0,,1.0,27.05,2.0,1.0,2.75
max,2.0,2024-04-01 00:01:45,2024-04-01 16:11:00,99.0,265.0,265.0,9.0,125112.2,841.6,10.0,4.25,150.0,26.76,,1.0,856.98,5.0,2.0,2.75
std,0.328056,,,1.356719,57.285088,76.295346,0.967749,770.416255,14.958249,1.382446,0.366916,3.159273,1.184551,,0.154253,17.013735,0.497858,0.191311,1.218039


In [8]:


# create target: trip duration in minutes (dropoff - pickup).
# The vectorized .dt accessor replaces a per-row Python lambda and yields the same floats.
march_data["duration_min"] = (
    march_data.lpep_dropoff_datetime - march_data.lpep_pickup_datetime
).dt.total_seconds() / 60



In [9]:


# filter out outliers: keep trips lasting 0-60 minutes (inclusive)
# that carried between 1 and 8 passengers
march_data = march_data[
    (march_data.duration_min >= 0)
    & (march_data.duration_min <= 60)
    & (march_data.passenger_count > 0)
    & (march_data.passenger_count <= 8)
]



In [10]:


# data labeling: the regression target and the columns fed to the model,
# split into numerical and categorical groups
target = "duration_min"
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
cat_features = [
    "PULocationID",
    "DOLocationID",
]



In [11]:
# Number of leading rows used for training; every remaining row goes to validation.
TRAIN_ROWS = 30000

# Positional split. `.copy()` makes both splits independent frames, so adding a
# column to them later does not hit pandas' chained-assignment
# (SettingWithCopyWarning) path on a slice of `march_data`.
train_data = march_data.iloc[:TRAIN_ROWS].copy()
val_data = march_data.iloc[TRAIN_ROWS:].copy()

model = LinearRegression()

# Fit on numerical + categorical columns; the fitted estimator is the cell's output.
model.fit(train_data[num_features + cat_features], train_data[target])

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:


# Score both splits with the fitted model, then store the predictions
# next to the features in a 'prediction' column.
train_preds, val_preds = (
    model.predict(split[num_features + cat_features])
    for split in (train_data, val_data)
)

train_data['prediction'] = train_preds
val_data['prediction'] = val_preds



In [13]:


# Persist the fitted model with joblib. The directory is created first because
# open(..., 'wb') raises FileNotFoundError when 'models/' does not exist.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(model, f_out)



In [14]:
# Write the validation split (including its 'prediction' column) to data/reference.parquet.
val_data.to_parquet('data/reference.parquet')


In [None]:
from evidently import Report, Dataset, DataDefinition
from evidently.presets import DataDriftPreset, DataSummaryPreset
from evidently.metrics import DatasetMissingValueCount

# Declare the column types: model inputs are split into numerical and
# categorical columns; the model output is tracked as a numerical column.
definition = DataDefinition(
    numerical_columns=num_features + ["prediction"],
    categorical_columns=cat_features,
)

# Build the datasets: the training split is the reference, the validation split is current.
ref = Dataset.from_pandas(train_data, data_definition=definition)
cur = Dataset.from_pandas(val_data, data_definition=definition)

# Define the report from presets plus an explicit missing-values metric.
# NOTE(review): the original cell also requested `ColumnCorrelationsMetric()`,
# which appears to belong to the legacy Evidently API — re-add a correlations
# metric once its equivalent for the installed version is confirmed.
report = Report([
    DataDriftPreset(),
    DataSummaryPreset(),          # basic descriptive statistics
    DatasetMissingValueCount(),   # dataset-level missing-value detail
])

# In the current API run() takes (current, reference) and returns the computed
# result object; rendering and export live on that object, not on the Report.
snapshot = report.run(cur, ref)
result = snapshot.dict()

# Bare last expression renders the report inline in the notebook.
snapshot


hola
