In [22]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
import pickle

In [54]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

In [55]:
# Q1: Read the data for January. How many columns are there?
len(df.columns) 

19

In [56]:
df[:2]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0


In [57]:
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration_minutes'] = df['duration'].dt.total_seconds() / 60

In [58]:
df[:2]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration,duration_minutes
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,...,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,0 days 00:08:26,8.433333
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,...,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,0 days 00:06:19,6.316667


In [59]:
# Q2: What's the standard deviation of the trips duration in January?
df['duration_minutes'].std()

42.59435124195458

In [60]:
len(df)

3066766

In [61]:
df = df[(df['duration_minutes'] >= 1) & (df['duration_minutes'] <= 60)]

In [62]:
len(df)

3009173

In [63]:
# Q3: What fraction of the records left after you dropped the outliers?
3009173 / 3066766 * 100

98.1220282212598

In [64]:
df['PULocationID'] = df['PULocationID'].astype(str)
df['DOLocationID'] = df['DOLocationID'].astype(str)

In [65]:
categorical = df[['PULocationID', 'DOLocationID']]

In [66]:
categorical[:2]

Unnamed: 0,PULocationID,DOLocationID
0,161,141
1,43,237


In [67]:
records = categorical.to_dict(orient="records")
records[0]

{'PULocationID': '161', 'DOLocationID': '141'}

In [68]:
# Fit a dictionary vectorizer
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(records)
X.shape

(3009173, 515)

In [69]:
lr = LinearRegression()

In [None]:
lr.fit(X, df['duration_minutes'])

In [19]:
df['y_pred'] = lr.predict(X)
root_mean_squared_error(df['duration_minutes'], df['y_pred'])

7.501280443041593

In [23]:
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [24]:
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')
df_val['duration'] = df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']
df_val['duration_minutes'] = df_val['duration'].dt.total_seconds() / 60

In [25]:
df_val['PULocationID'] = df_val['PULocationID'].astype(str)
df_val['DOLocationID'] = df_val['DOLocationID'].astype(str)
df_val_x = df_val[['PULocationID', 'DOLocationID']]

In [27]:
records = df_val_x.to_dict(orient="records")
# Fit a dictionary vectorizer
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(records)
df_val['y_pred'] = lr.predict(X)

ValueError: X has 519 features, but LinearRegression is expecting 507 features as input.

In [None]:
df_val['y_true'] = df_val['duration_minutes']
root_mean_squared_error(df_val['y_true'], df_val['y_pred'])