Download data

In [None]:
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2025-05-07 20:59:05--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.171, 65.8.245.50, 65.8.245.178, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-01.parquet.1’


2025-05-07 20:59:05 (67.0 MB/s) - ‘yellow_tripdata_2023-01.parquet.1’ saved [47673370/47673370]

--2025-05-07 20:59:05--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 65.8.245.171, 65.8.245.50, 65.8.245.178, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|65.8.245.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [application

Imports

In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, mean_squared_error

Utility functions

In [None]:
def preprocess_trip_data(df):
  """Return a copy of ``df`` with parsed timestamps and string location IDs.

  The input frame is left untouched, so the raw data stays available and
  re-running the calling cell always starts from the same state.

  Args:
    df: DataFrame containing the ``tpep_pickup_datetime``,
      ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID`` columns.

  Returns:
    A new DataFrame in which both timestamp columns have been passed through
    ``pd.to_datetime`` and both location ID columns have been cast to ``str``.

  Raises:
    KeyError: if any of the four required columns is missing.
  """
  datetime_columns = ['tpep_dropoff_datetime', 'tpep_pickup_datetime']
  categorical = ['PULocationID', 'DOLocationID']

  # Work on a copy: assigning columns on the caller's frame would silently
  # change it behind the caller's back.
  df = df.copy()

  for column in datetime_columns:
    df[column] = pd.to_datetime(df[column])

  # Location IDs are identifiers, not quantities; store them as strings.
  df[categorical] = df[categorical].astype(str)

  return df

In [None]:
def add_trip_duration(df):
  """Return ``df`` with a ``Duration`` column: trip length in minutes.

  The duration is drop-off time minus pick-up time. The input frame is not
  modified; a new frame carrying the extra (or replaced) column is returned.

  Args:
    df: DataFrame whose ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime``
      columns are datetime-typed.

  Returns:
    A new DataFrame with a float ``Duration`` column expressed in minutes.

  Raises:
    KeyError: if either timestamp column is missing.
  """
  elapsed = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']

  # .dt.total_seconds() is vectorized and always yields floats, whereas a
  # per-row .apply is slow on millions of rows and hands back the timedelta
  # dtype unchanged when the frame is empty.
  return df.assign(Duration=elapsed.dt.total_seconds() / 60)

In [None]:
def filter(df, min_duration=1, max_duration=60):
  """Keep only the rows whose ``Duration`` lies in ``[min_duration, max_duration]``.

  NOTE: this name shadows Python's built-in ``filter``; it is kept because
  other cells call the function by this name.

  Args:
    df: DataFrame with a numeric ``Duration`` column.
    min_duration: Smallest duration to keep (inclusive). Defaults to 1.
    max_duration: Largest duration to keep (inclusive). Defaults to 60.

  Returns:
    The rows of ``df`` whose duration is within both bounds. Rows with a
    missing duration are dropped, because comparisons against NaN are false.

  Raises:
    KeyError: if ``df`` has no ``Duration`` column.
  """
  durations = df["Duration"]
  in_range = (durations >= min_duration) & (durations <= max_duration)
  return df[in_range]

Read data into DataFrames

In [None]:
# Read from the working directory -- the same place the wget calls in the
# download cell save the files -- instead of a hard-coded "/content" path
# that only exists on Colab.
df_train = pd.read_parquet('yellow_tripdata_2023-01.parquet')
df_val = pd.read_parquet('yellow_tripdata_2023-02.parquet')

In [None]:
# Normalize column types on both splits with the same helper.
df_train = df_train.pipe(preprocess_trip_data)
df_val = df_val.pipe(preprocess_trip_data)

Review data

In [None]:
# (rows, columns) of the training and validation frames, in that order.
tuple(frame.shape for frame in (df_train, df_val))

((3066766, 19), (2913955, 19))

Feature engineering

In [None]:
# Derive the regression target (trip duration) for both splits.
df_train = df_train.pipe(add_trip_duration)
df_val = df_val.pipe(add_trip_duration)

In [None]:
# Sample standard deviation (ddof=1 is the pandas default) of the trip
# duration, computed before any rows are filtered out.
df_train['Duration'].std(ddof=1)

42.594351241920904

Filtering data

In [None]:
# Drop rows whose duration falls outside the range accepted by filter().
df_train = df_train.pipe(filter)
df_val = df_val.pipe(filter)

Encoding categorical features

In [None]:
# Features: pick-up and drop-off location IDs, one record (dict) per trip.
categorical = ['PULocationID', 'DOLocationID']
dv = DictVectorizer()

# Training split: the vectorizer is fitted here, and only here.
train_dict = df_train.loc[:, categorical].to_dict("records")
X_train = dv.fit_transform(train_dict)
y_train = df_train['Duration'].to_numpy()
print(len(dv.feature_names_))

# Validation split: reuse the fitted vectorizer so the feature columns line
# up with the training matrix.
val_dict = df_val.loc[:, categorical].to_dict("records")
X_val = dv.transform(val_dict)
y_val = df_val['Duration'].to_numpy()

515


Train the model

In [None]:
# Ordinary least squares on the encoded location features.
reg = LinearRegression()
reg = reg.fit(X_train, y_train)  # fit() returns the estimator itself

Predict on validation data

In [None]:
# Predicted durations for the validation split.
y_pred = reg.predict(X=X_val)

Calculate RMSE

In [None]:
# Root mean squared error of the validation predictions, in the same unit as
# the Duration target.
rmse = root_mean_squared_error(y_true=y_val, y_pred=y_pred)
rmse

7.811818743246608