In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse, root_mean_squared_error as rmse

In [2]:
# Source files: NYC yellow-taxi trip records for January and February 2023.
jan_2023 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
feb_2023 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

# Download both months in order (January first, then February).
df_jan, df_feb = (pd.read_parquet(url) for url in (jan_2023, feb_2023))

# Report (rows, columns) for each month.
print(f'Dataset dimensions for January -> {df_jan.shape=}')
print(f'Dataset dimensions for February -> {df_feb.shape=}')

Dataset dimensions for January -> df_jan.shape=(3066766, 19)
Dataset dimensions for February -> df_feb.shape=(2913955, 19)


In [3]:
# Trip duration in minutes. The vectorized `.dt.total_seconds()` accessor
# replaces a per-row Python lambda, which is needlessly slow on ~3M rows.
df_jan['duration'] = (
    df_jan.tpep_dropoff_datetime - df_jan.tpep_pickup_datetime
).dt.total_seconds() / 60

# Spread of the raw (unfiltered) durations.
print(f'Standard deviation for January -> {(df_jan["duration"].std()):.2f}')

Standard deviation for January -> 42.59


In [4]:
# Row count before outlier removal, used for the retained-fraction report.
before = df_jan.shape[0]

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# The explicit .copy() makes the filtered frame independent of the original,
# so later column assignments on df_jan do not trigger SettingWithCopyWarning.
df_jan = df_jan[df_jan.duration.between(1, 60)].copy()
after = df_jan.shape[0]

print(f'New fraction records -> {(after / before)*100:.0f}%')

New fraction records -> 98%


In [5]:
# Pickup / drop-off zone IDs are the only model features.
categorical = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings so DictVectorizer one-hot encodes them
# instead of treating them as numeric values.
df_jan[categorical] = df_jan[categorical].astype(str)

# One {column: value} dict per trip.
records = df_jan[categorical].to_dict('records')

# Fit the vectorizer on the January data and encode it in one pass.
dv = DictVectorizer()
X_train = dv.fit_transform(records)

print(f'Dimensionality -> {X_train.shape=}')

Dimensionality -> X_train.shape=(3009173, 515)


In [6]:
# Regression target: trip duration (the column computed earlier).
target = 'duration'
y_train = df_jan[target].to_numpy()

# Fit an ordinary least-squares model on the one-hot encoded features.
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the same data it was trained on.
y_train_pred = lr.predict(X_train)
mse_calculated = mse(y_true=y_train, y_pred=y_train_pred)
rmse_calculated = rmse(y_true=y_train, y_pred=y_train_pred)

print(f'(MSE) Error calculated with Linear Regression --> {mse_calculated:.4f}')
print(f'(RMSE) Error calculated with Linear Regression --> {rmse_calculated:.4f}')

(MSE) Error calculated with Linear Regression --> 58.5112
(RMSE) Error calculated with Linear Regression --> 7.6493


**Final Question: evaluate the January-trained model on the February 2023 data ->**

In [7]:
# Trip duration in minutes for February, computed with the vectorized
# `.dt.total_seconds()` accessor rather than a per-row Python lambda.
df_feb['duration'] = (
    df_feb.tpep_dropoff_datetime - df_feb.tpep_pickup_datetime
).dt.total_seconds() / 60

# Same 1-60 minute window (inclusive) as applied to the January data.
# .copy() detaches the result so later column assignments on df_feb do not
# trigger SettingWithCopyWarning.
df_feb = df_feb[df_feb.duration.between(1, 60)].copy()

In [8]:
# Cast the zone IDs to strings, matching the preprocessing applied to the
# January frame before the vectorizer was fitted.
df_feb[categorical] = df_feb[categorical].astype(str)
val_records = df_feb[categorical].to_dict('records')

# Encode with the already-fitted vectorizer (transform only, no refit).
X_val = dv.transform(val_records)
y_val = df_feb[target].to_numpy()

In [9]:
# Predict February durations with the already-fitted model and score them
# against the February targets.
y_val_pred = lr.predict(X_val)
val_mse_calculated = mse(y_true=y_val, y_pred=y_val_pred)
val_rmse_calculated = rmse(y_true=y_val, y_pred=y_val_pred)

print(f'(MSE) Error calculated with Linear Regression --> {val_mse_calculated:.4f}')
print(f'(RMSE) Error calculated with Linear Regression --> {val_rmse_calculated:.4f}')

(MSE) Error calculated with Linear Regression --> 61.0245
(RMSE) Error calculated with Linear Regression --> 7.8118
