In [60]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Read the January and February 2023 yellow-taxi trip files; January is used
# for fitting and February for validation further down.
jan, feb = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "../data/raw/yellow_tripdata_2023-01.parquet",
        "../data/raw/yellow_tripdata_2023-02.parquet",
    )
)

## Q1

In [3]:
# Show (rows, columns) of the raw January frame; the column count answers Q1.
jan.shape

(3066766, 19)

In [4]:
# Peek at the first two trips to see the available columns.
jan.iloc[:2]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0


## Q2

In [28]:
# Trip duration in minutes: drop-off minus pick-up, divided by a 60-second
# Timedelta so the result is a plain float column.
jan = jan.assign(
    duration=lambda trips: (
        trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    ) / pd.Timedelta('60s')
)

In [29]:
# Standard deviation of trip duration (minutes) across the January trips.
jan['duration'].std()

42.59435124195458

## Q3

In [30]:
# Remember the row count before filtering so the retained share can be computed.
size_bfr_drop = len(jan)

In [32]:
# Keep only trips lasting between 1 and 60 minutes, both bounds inclusive.
jan = jan.loc[jan['duration'].between(1, 60)]

In [38]:
# Fraction of January rows that survived the duration filter.
len(jan) / size_bfr_drop

0.9812202822125979

## Q4

In [39]:
# Inspect column dtypes; the pickup/drop-off location IDs are integers here and
# are cast to strings in the next step before vectorizing.
jan.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [69]:
# Cast both location ID columns to strings so the vectorizer treats them as
# categories instead of numeric values.
jan = jan.astype(dict.fromkeys(['PULocationID', 'DOLocationID'], 'str'))

In [73]:
# One dict per trip holding just the two location IDs, ready for DictVectorizer.
feature_matrix = jan.loc[:, ["PULocationID", "DOLocationID"]].to_dict(orient='records')

In [74]:
# Fit the vectorizer on the January records and encode them in one pass.
# `feature_matrix` is rebound from the list of dicts to the encoded matrix;
# `dv` is kept so the validation data can reuse the same mapping.
dv = DictVectorizer()
feature_matrix = dv.fit_transform(feature_matrix)

In [75]:
# (rows, encoded feature columns) of the training matrix; the column count answers Q4.
feature_matrix.shape

(3009173, 515)

## Q5

In [76]:
# Regression target: trip duration in minutes for the filtered January trips.
y_train = jan['duration'].values

In [77]:
# Ordinary least-squares model with default settings, fitted on the encoded
# January features against the duration target.
lr = LinearRegression()
lr.fit(X=feature_matrix, y=y_train)

In [78]:
# Predict on the training features and score them with RMSE (in minutes).
# The `squared=False` keyword of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here;
# this form runs on both older and current releases.
y_train_pred = lr.predict(feature_matrix)
rmse = mean_squared_error(y_train, y_train_pred) ** 0.5

In [79]:
# Training RMSE in minutes.
rmse

7.649261929201487

## Q6

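Pre-process the validation dataset (February 2023) with the same steps used for the training data, reusing the vectorizer that was already fitted on January.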

In [80]:
# Repeat the training-time preparation on February: derive duration in minutes,
# keep trips of 1 to 60 minutes inclusive, and cast the location IDs to strings.
feb = (
    feb
    .assign(
        duration=lambda trips: (
            trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
        ) / pd.Timedelta('60s')
    )
    .loc[lambda trips: (trips['duration'] >= 1) & (trips['duration'] <= 60)]
    .astype({'PULocationID': 'str', 'DOLocationID': 'str'})
)
# Encode with the vectorizer fitted on January (transform only, no refit) so the
# validation matrix has the same columns as the training matrix.
feature_matrix_val = dv.transform(
    feb[["PULocationID", "DOLocationID"]].to_dict(orient='records')
)
y_val = feb['duration'].values

In [81]:
# Predict on the February features and score them with RMSE (in minutes).
# The `squared=False` keyword of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here;
# this form runs on both older and current releases.
y_val_pred = lr.predict(feature_matrix_val)
rmse_val = mean_squared_error(y_val, y_val_pred) ** 0.5

In [82]:
# Validation RMSE in minutes, for comparison with the training RMSE.
rmse_val

7.811819793542861