In [1]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.8.12


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [3]:
# "For-Hire Vehicle Trip Records". January and February 2021.

# Q1. Downloading the data
# Read the January 2021 trip records. How many records are there?
# `count()` reports the number of non-null values per column, so the record
# count is the value shown for the columns that have no missing entries.
df_jan = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')
df_jan.count()

dispatching_base_num      1154112
pickup_datetime           1154112
dropOff_datetime          1154112
PUlocationID               195845
DOlocationID               991892
SR_Flag                         0
Affiliated_base_number    1153227
dtype: int64

In [4]:
# Q2. Computing duration
# Add a `duration` column holding the length of each ride in minutes, then
# report the average trip duration in January.
# Dividing the timedelta column by a one-minute Timedelta converts it to float
# minutes in one vectorised step, instead of calling a Python lambda per row.
df_jan['duration'] = (df_jan.dropOff_datetime - df_jan.pickup_datetime) / pd.Timedelta(minutes=1)
df_jan['duration'].mean()

19.1672240937939

In [5]:
# Data preparation
# Summary statistics of the trip durations (in minutes) to spot outliers.
df_jan['duration'].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [6]:
# Restrict January to rides lasting from 1 to 60 minutes (both ends included)
# and re-check the duration statistics on the rows that remain.
df_jan = df_jan[df_jan.duration.between(1, 60)]
df_jan['duration'].describe()

count    1.109826e+06
mean     1.624725e+01
std      1.155150e+01
min      1.000000e+00
25%      7.850000e+00
50%      1.323333e+01
75%      2.146667e+01
max      6.000000e+01
Name: duration, dtype: float64

In [7]:
# Q3. Missing values
# The pickup and dropoff location ID columns contain a lot of missing values;
# replace them with the sentinel value -1.
categorical = ['PUlocationID', 'DOlocationID']
df_trainX = df_jan[categorical].fillna(-1)

In [8]:
# Question 3: Fraction of missing values
# Rows whose pickup location ID equals the -1 sentinel are the ones that were
# missing. Dividing their count by the total row count gives the fraction.
# df_trainX is derived from the duration-filtered January frame, so the
# fraction is taken over the filtered rows.
print(df_trainX.count())
df_na = df_trainX[(df_trainX['PUlocationID'] == -1)]
print(df_trainX.shape[0])
print(df_na.shape[0])
print(df_na.shape[0] / df_trainX.shape[0] )

PUlocationID    1109826
DOlocationID    1109826
dtype: int64
1109826
927008
0.8352732770722617


In [34]:
def read_dataframe(filename):
    """Load one month of trip records and prepare it for modelling.

    Steps:
      1. Read the parquet file at ``filename``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Replace missing pickup/dropoff location IDs with -1 and cast both
         columns to ``str``.

    The shape of the prepared frame is printed before and after the string
    cast.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the original columns plus ``duration``.
    """
    df = pd.read_parquet(filename)

    # Dividing a timedelta column by a one-minute Timedelta converts it to
    # float minutes in a single vectorised step (no per-row Python lambda).
    df['duration'] = (df.dropOff_datetime - df.pickup_datetime) / pd.Timedelta(minutes=1)

    # Take an explicit copy of the filtered rows: the column assignments below
    # would otherwise be made on a slice of the raw frame and raise
    # SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].fillna(-1)
    print(df.shape)
    df[categorical] = df[categorical].astype(str)
    print(df.shape)

    return df

In [35]:
# January 2021 is used for training and February 2021 for validation.
df_train = read_dataframe('./data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')

(1109826, 8)
(1109826, 8)
(990113, 8)
(990113, 8)


In [37]:
# List the columns of the prepared training frame.
df_train.columns

Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number',
       'duration'],
      dtype='object')

In [43]:
# Q4. One-hot encoding
# Apply one-hot encoding to the pickup and dropoff location IDs:
#   1. Turn the dataframe into a list of dictionaries
#   2. Fit a dictionary vectorizer
#   3. Get a feature matrix from it

# What's the dimensionality of this matrix?
categorical = ['PUlocationID', 'DOlocationID']
# NOTE(review): `numerical` is not used in the visible cells — confirm whether it is still needed.
numerical = ['duration']

dv = DictVectorizer()

# The vectorizer is fitted on the training rows only ...
train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# ... and the validation rows are encoded with that same fitted vectorizer.
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

print(X_train.shape)


(1109826, 525)


In [46]:
# Regression targets: the `duration` column of the training and validation frames.
y_train = df_train['duration'].values
y_val = df_val['duration'].values

In [45]:
# Q5. Training a model
# Fit a plain linear regression model with default parameters and calculate its RMSE on the training data.

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# squared=False makes mean_squared_error return the root of the MSE (RMSE).
# NOTE(review): newer scikit-learn releases deprecate the `squared` argument —
# confirm against the installed version before upgrading.
mean_squared_error(y_train, y_pred, squared=False)

10.528519107211691

In [47]:
# Q6. Evaluating the model
# Apply the model fitted above to the validation dataset (Feb 2021) and report its RMSE there.
y_val_pred = lr.predict(X_val)

mean_squared_error(y_val, y_val_pred, squared=False)

11.014283200530928