In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# January 2021 FHV trip records; this frame is later used to fit the model.
df_jan = pd.read_parquet(path='data/fhv_tripdata_2021-01.parquet')

### Q1. Downloading the data

In [3]:
# Number of records in the raw January frame.
len(df_jan)

1154112

### Q2. Computing duration

In [4]:
# Preview the first rows to see which columns are available.
df_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
# Make sure both timestamp columns are datetime64 before subtracting them.
# Bracket assignment is used because attribute assignment only works for
# columns that already exist and silently creates a plain attribute otherwise.
df_jan['dropOff_datetime'] = pd.to_datetime(df_jan['dropOff_datetime'])
df_jan['pickup_datetime'] = pd.to_datetime(df_jan['pickup_datetime'])

# Trip duration in minutes. The vectorized `.dt.total_seconds()` accessor
# replaces a row-by-row `.apply(lambda ...)`, which calls back into Python
# for every record.
df_jan['duration'] = (
    df_jan['dropOff_datetime'] - df_jan['pickup_datetime']
).dt.total_seconds() / 60

In [6]:
# Average trip duration (minutes) before any outlier filtering.
df_jan.loc[:, 'duration'].mean()

19.1672240937939

### Data preparation

In [7]:
# Keep only trips lasting from 1 to 60 minutes (both bounds inclusive).
# `.copy()` detaches the result from the original frame so later column
# assignments do not act on a view.
df_jan = df_jan.loc[df_jan['duration'].between(1, 60)].copy()

In [8]:
# (rows, columns) remaining after the duration filter.
df_jan.shape

(1109826, 8)

### Q3. Missing values

In [9]:
# Preview the filtered frame, which now carries the computed `duration` column.
df_jan.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,,B00037,9.05


In [10]:
# Mark missing pickup / drop-off location IDs with the sentinel value -1.
df_jan.loc[:, ['PUlocationID', 'DOlocationID']] = (
    df_jan[['PUlocationID', 'DOlocationID']].fillna(value=-1)
)

In [11]:
# Share of trips whose pickup location carries the -1 "missing" sentinel.
df_jan['PUlocationID'].eq(-1).mean()

0.8352732770722617

### Q4. One-hot encoding

In [12]:
# One-hot encode the two location columns. Casting the IDs to strings makes
# DictVectorizer treat each distinct ID as its own categorical feature rather
# than as a single numeric feature.
dv = DictVectorizer()
df_jan_dict = (
    df_jan[['PUlocationID', 'DOlocationID']]
    .astype('str')
    .to_dict(orient='records')
)
# Fit the vocabulary on January and build the training feature matrix.
df_jan_val = dv.fit_transform(df_jan_dict)

In [13]:
# (rows, one-hot feature columns) of the encoded January matrix.
df_jan_val.shape

(1109826, 525)

### Q5. Training a model

In [14]:
# Plain linear regression of trip duration (minutes) on the one-hot features.
lin_reg = LinearRegression()
lin_reg.fit(X=df_jan_val, y=df_jan['duration'])

LinearRegression()

In [15]:
# In-sample predictions on the same January matrix the model was fitted on.
y_pred = lin_reg.predict(X=df_jan_val)

In [16]:
# Training RMSE: square root of the mean squared error, in minutes.
mean_squared_error(y_true=df_jan['duration'], y_pred=y_pred) ** 0.5

10.528519107205959

### Q6. Evaluating the model

In [17]:
def _prepare_fhv_trips(path, min_minutes=1, max_minutes=60):
    """Load one month of FHV trips and apply the notebook's preprocessing.

    Steps: parse the pickup / drop-off timestamps, compute the trip
    duration in minutes, keep trips whose duration lies in
    [min_minutes, max_minutes] (inclusive), and replace missing
    pickup / drop-off location IDs with -1.

    Parameters
    ----------
    path : str
        Location of the parquet file to read.
    min_minutes, max_minutes : float
        Inclusive duration bounds, in minutes.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with an added ``duration`` column.
    """
    trips = pd.read_parquet(path)
    trips['dropOff_datetime'] = pd.to_datetime(trips['dropOff_datetime'])
    trips['pickup_datetime'] = pd.to_datetime(trips['pickup_datetime'])
    # Vectorized minutes computation instead of a per-row `.apply`.
    trips['duration'] = (
        trips['dropOff_datetime'] - trips['pickup_datetime']
    ).dt.total_seconds() / 60
    keep = (trips['duration'] >= min_minutes) & (trips['duration'] <= max_minutes)
    trips = trips.loc[keep].copy()
    trips.loc[:, ['PUlocationID', 'DOlocationID']] = (
        trips.loc[:, ['PUlocationID', 'DOlocationID']].fillna(-1)
    )
    return trips


# February 2021 is the validation month.
df_feb = _prepare_fhv_trips('data/fhv_tripdata_2021-02.parquet')
df_feb_dict = df_feb[['PUlocationID', 'DOlocationID']].astype('str').to_dict(orient='records')
# `transform` (not `fit_transform`): reuse the feature vocabulary already
# fitted on `dv`, so the columns line up with what `lin_reg` was trained on.
df_feb_val = dv.transform(df_feb_dict)
y_valid_pred = lin_reg.predict(df_feb_val)
# Validation RMSE, in minutes.
mean_squared_error(df_feb['duration'], y_valid_pred)**0.5

11.01428314240328