In [1]:
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

In [2]:
configs = {
    "jan_path": "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
    "feb_path": "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet",
    "categorical_features": ['PULocationID', 'DOLocationID'],
    "target": "duration"
}

In [3]:
def prepare_data(path:str, clean_duration_outliers: bool = True) -> pd.DataFrame:

    """
    Runs a basic training/validation pipeline
    
    Parameters:
    -----------
    
    path: indicates the path of the data
    clean_duration_outliers: keeps only durations between 1 and 60 minutes

    """
    
    df = pd.read_parquet(path)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.seconds/60
    df[configs["categorical_features"]] = df[configs["categorical_features"]].astype(str)

    if mode == "train":
        l_before = df.shape[0]
        mask = df['duration'].between(1,60)
        df = df.loc[mask]
        l_after = df.shape[0]
        lost_records = l_before - l_after
        frac = (1-lost_records/l_before)*100
        
        print(f"removing outliers in duration resulted in {lost_records} lost records. {round(frac,2)}% of original data are available now")            
    return df
        

In [4]:
train = prepare_data(path=configs["jan_path"], clean_duration_outliers=True)

removing outliers in duration resulted in 57590 lost records. 98.12% of original data are available now


In [5]:
train_dicts = train[configs["categorical_features"]].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = train[configs["target"]].values

In [6]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
root_mean_squared_error(y_train, y_pred)

7.649261254600018

In [15]:
val = prepare_data(path=configs["feb_path"], clean_duration_outliers=True)
val_dicts = val[configs["categorical_features"]].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = val[configs["target"]].values

In [16]:
y_pred = lr.predict(X_val)

In [17]:
root_mean_squared_error(y_val, y_pred)

7.811813468409544