In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
df_jan = pd.read_parquet("fhv_tripdata_2021-01.parquet")
print(f"Number of records in Jan 2021 FHV data: {len(df_jan)}")
df_jan.head()

Number of records in Jan 2021 FHV data: 1154112


Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [3]:
df_jan['duration'] = df_jan["dropOff_datetime"] - df_jan["pickup_datetime"]
df_jan['duration'] = df_jan["duration"].apply(lambda td: td.total_seconds() / 60)

print(f"Average duration in Jan 2021 FHV: {df_jan['duration'].mean()}")

Average duration in Jan 2021 FHV: 19.167224093791006


In [4]:
train_features = ["PUlocationID", "DOlocationID"]

In [5]:
df_jan = df_jan[(df_jan.duration >= 1) & (df_jan.duration <= 60)]

In [6]:
# I computed fraction of rows with nan from all number of records
print(f"Fraction of missing values: { df_jan[train_features].isnull().sum().max() / df_jan.shape[0] }")

Fraction of missing values: 0.8352732770722617


In [7]:
df_jan = df_jan.fillna(-1)

In [8]:
train_dicts = df_jan[train_features].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [9]:
print(f"Dimensionality after OHE: {X_train.shape[-1]}")

Dimensionality after OHE: 2


In [10]:
target = 'duration'
y_train = df_jan[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

print(f"What's the RMSE on train? {mean_squared_error(y_train, y_pred, squared=False)}")

What's the RMSE on train? 11.415432830521663


In [11]:
df_feb = pd.read_parquet("fhv_tripdata_2021-02.parquet")
df_feb['duration'] = df_feb["dropOff_datetime"] - df_feb["pickup_datetime"]
df_feb['duration'] = df_feb["duration"].apply(lambda td: td.total_seconds() / 60)

df_feb = df_feb[(df_feb.duration >= 1) & (df_feb.duration <= 60)]
df_feb = df_feb.fillna(-1)
val_dicts = df_feb[train_features].to_dict(orient='records')


In [12]:
X_val = dv.transform(val_dicts)
y_val = df_feb[target].values
y_pred = lr.predict(X_val)
print(f"RMSE on validation: {mean_squared_error(y_val, y_pred, squared=False)}")

RMSE on validation: 11.85822362355935


## END