In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Yellow-taxi trip data for 2023-01 (per the file name); the model below is fitted on it.
TRAIN_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
df = pd.read_parquet(TRAIN_DATA_URL)

In [3]:
# Report how many columns the loaded frame has (DataFrame.shape is (rows, columns)).
print(f"Number of columns: {df.shape[1]}")

Number of columns: 19


In [None]:
# Trip duration in minutes: drop-off time minus pick-up time, converted from
# seconds. Built in a single chain so the column never holds an intermediate
# timedelta value.
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)
print(f"Standard deviation: {df['duration'].std():.2f}")

Standard deviation: 42.59
Standard deviation: count    3066766.00
mean          15.67
std           42.59
min          -29.20
25%            7.12
50%           11.52
75%           18.30
max        10029.18
Name: duration, dtype: object


In [5]:

original_count = df.shape[0]
# Series.between is inclusive on both ends by default, i.e. 1 <= duration <= 60.
df = df.loc[df['duration'].between(1, 60)]
filtered_count = df.shape[0]
print(f"Original count: {original_count}, Filtered count: {filtered_count}")
print(f"Fraction of records left: {filtered_count / original_count:.2%}")

Original count: 3066766, Filtered count: 3009173
Fraction of records left: 98.12%


In [6]:
# Columns that are one-hot encoded as model features.
categorical = ['PULocationID', 'DOLocationID']

# df is a filtered selection of the loaded frame at this point. DataFrame.astype
# returns a new frame, so the IDs become strings without assigning into that
# selection (the pattern that raises SettingWithCopyWarning).
df = df.astype({column: str for column in categorical})

# DictVectorizer one-hot encodes string values, producing one column per
# distinct "<feature>=<value>" pair seen while fitting.
dicts = df[categorical].to_dict(orient='records')
dv = DictVectorizer()
X = dv.fit_transform(dicts)
print(f"Feature matrix shape: {X.shape}")

Feature matrix shape: (3009173, 515)


In [7]:
# Target: trip duration in minutes, taken from the same frame that produced X.
y = df['duration'].to_numpy()

# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(X, y)
y_pred = lr.predict(X)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))
print(f"RMSE on training data: {rmse:.2f}")

RMSE on training data: 7.65


In [8]:
def load_data(path):
    """Load a yellow-taxi trip file and prepare it for duration modelling.

    Parameters
    ----------
    path : str
        Location of the parquet file; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The trips whose ``duration`` (drop-off minus pick-up, in minutes) lies
        between 1 and 60 minutes inclusive, with ``PULocationID`` and
        ``DOLocationID`` converted to strings.
    """
    location_columns = ['PULocationID', 'DOLocationID']

    df = pd.read_parquet(path)
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    # Inclusive bounds, matching the filter applied to the training frame.
    # Exclusive bounds would drop trips lasting exactly 1 or 60 minutes that
    # the training data keeps.
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    # astype returns a new frame, so nothing is assigned into the filtered
    # selection (the pattern that raises SettingWithCopyWarning).
    return df.astype({column: str for column in location_columns})

In [9]:
# Yellow-taxi trip data for 2023-02 (per the file name), prepared by load_data()
# and used below as the validation set.
VAL_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
df_val = load_data(VAL_DATA_URL)

In [10]:
# load_data() has already added `duration` and filtered the trips, so that work
# is not repeated here with a second, separately hard-coded set of thresholds.
# This cell only checks the invariant the later cells rely on.
assert df_val['duration'].between(1, 60).all(), "validation durations outside 1-60 minutes"

In [11]:
# Encode the validation trips with the vectorizer fitted on the training data.
# `categorical` is reused so both feature matrices are built from the same
# columns. The string cast is applied to a fresh selection instead of being
# assigned back into the filtered df_val, which can raise SettingWithCopyWarning.
# load_data() already stores these columns as strings, so the cast is only a
# safeguard.
dicts_val = df_val[categorical].astype(str).to_dict(orient='records')
X_val = dv.transform(dicts_val)

In [12]:
# Score the model fitted above on the validation trips.
y_val = df_val['duration'].to_numpy()
y_pred_val = lr.predict(X_val)

# RMSE is the square root of the mean squared error.
rmse_val = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred_val))
print(f"RMSE on validation data: {rmse_val:.2f}")

RMSE on validation data: 7.81
