In [1]:
!python -V

Python 3.9.6


In [79]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

In [50]:
def read_data_frame(filename, data_dir="../data"):
    """Load a parquet file of trip records and prepare it for duration modelling.

    Reads ``{data_dir}/{filename}``, derives a ``duration`` column (drop-off
    time minus pick-up time, in minutes), keeps only trips whose duration is
    between 1 and 60 minutes inclusive, and casts the pickup/drop-off location
    IDs to strings.

    Parameters
    ----------
    filename : str
        Name of the parquet file inside ``data_dir``. The file must provide the
        columns ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.
    data_dir : str, default "../data"
        Directory that contains ``filename``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips as an independent copy (original row index kept),
        with the extra ``duration`` column and string-typed location IDs.
    """
    df = pd.read_parquet(f"{data_dir}/{filename}")

    # Vectorised timedelta -> minutes; avoids a Python-level call per row.
    trip_length = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_length.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns on a
    # boolean-indexed slice triggers pandas' SettingWithCopyWarning, both for
    # the cast below and for any column the caller adds later.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are categories, not quantities; store them as strings so a
    # downstream DictVectorizer treats them as categorical features.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [51]:
# The first file is the training set; the second is held out for validation.
df_train, df_val = map(
    read_data_frame,
    ('green_tripdata_2021-01.parquet', 'green_tripdata_2021-02.parquet'),
)

In [52]:
# Row counts after the duration filter: (training, validation).
tuple(len(frame) for frame in (df_train, df_val))

(73908, 61921)

In [5]:
# df_dispatch = df[df.trip_type==2]

In [53]:
# sns.displot(df.duration)

In [54]:
# df.duration.describe()

In [55]:
# df.duration.describe(percentiles=[0.95, 0.98, 0.99])

In [56]:
## Categorical columns must have string dtype (check below)
# df[categorical].dtypes

In [57]:
# Feature set 1: pickup and drop-off location as two separate categorical
# features, plus the numeric trip distance.
categorical, numerical = ['PULocationID', 'DOLocationID'], ['trip_distance']

# One dict per trip, which is the input format DictVectorizer expects.
train_dicts = df_train.loc[:, categorical + numerical].to_dict(orient='records')
val_dicts = df_val.loc[:, categorical + numerical].to_dict(orient='records')

# Fit the feature vocabulary on the training trips only, then reuse that same
# mapping for validation so both matrices share one column layout.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [58]:
# Regression target: trip duration in minutes, as plain NumPy arrays.
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [59]:
# Baseline: ordinary least squares on the current feature matrix.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE, in minutes (the unit of `duration`).
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

10.499110709994794

In [60]:
# dv.feature_names_

In [61]:
# sns.distplot(y_pred, label='prediction')
# sns.distplot(y_train, label='actual')

# plt.legend()


In [62]:
# L1-regularised linear model with scikit-learn's default settings.
lasso = Lasso()
lasso.fit(X=X_train, y=y_train)
y_pred = lasso.predict(X=X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_val, y_pred)

12.212583224318818

In [64]:
# L2-regularised linear model with scikit-learn's default settings.
ridge = Ridge().fit(X_train, y_train)
y_pred = ridge.predict(X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

10.478171095742175

In [66]:
# Interaction feature: treat each (pickup, drop-off) pair as a single category,
# encoded as "<PULocationID>_<DOLocationID>". Both ID columns are already strings.
for frame in (df_train, df_val):
    frame["PU_DO"] = frame["PULocationID"] + "_" + frame["DOLocationID"]

In [67]:
# Feature set 2: the combined pickup/drop-off pair plus trip distance.
categorical, numerical = ["PU_DO"], ['trip_distance']

# A fresh vectorizer, because the feature vocabulary differs from feature set 1.
dv = DictVectorizer()

# One dict per trip for each split.
train_dicts, val_dicts = (
    frame[[*categorical, *numerical]].to_dict(orient='records')
    for frame in (df_train, df_val)
)

# The vocabulary is learned from the training split and reused for validation.
X_train, X_val = dv.fit_transform(train_dicts), dv.transform(val_dicts)

In [68]:
# Same regression target as before: trip duration in minutes.
target = 'duration'
y_train, y_val = (frame.loc[:, target].values for frame in (df_train, df_val))

In [69]:
# Ordinary least squares, refit on the PU_DO + trip_distance features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)
y_pred = lr.predict(X=X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_val, y_pred)

7.7587152124560435

In [70]:
# Default-strength Lasso on the PU_DO + trip_distance features.
# NOTE(review): the recorded RMSE below is identical, digit for digit, to the
# default Lasso run on the earlier feature set. Presumably the default penalty
# shrinks the one-hot coefficients to zero in both cases; confirm by inspecting
# `lasso.coef_`.
lasso = Lasso().fit(X_train, y_train)
y_pred = lasso.predict(X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

12.212583224318818

In [80]:
# Lasso again, this time with an explicitly chosen, much weaker L1 penalty.
lasso_alpha = Lasso(alpha=1e-4)
lasso_alpha.fit(X=X_train, y=y_train)
y_pred = lasso_alpha.predict(X=X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_val, y_pred)

7.616617761096093

In [71]:
# Default-strength Ridge on the PU_DO + trip_distance features.
ridge = Ridge()
ridge.fit(X=X_train, y=y_train)
y_pred = ridge.predict(X=X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_val, y_pred)

7.703735132744359

In [81]:
# Ridge with an explicitly chosen L2 penalty of 0.01.
ridge_alpha = Ridge(alpha=1e-2).fit(X_train, y_train)
y_pred = ridge_alpha.predict(X_val)

# Validation RMSE in minutes.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

7.51811006652168

In [82]:
# Persist the fitted DictVectorizer together with the tuned Lasso model as one
# pickled (vectorizer, model) tuple, so the same feature mapping can be applied
# before predicting.
# open(..., 'wb') does not create missing directories, so make sure `models/`
# exists first; otherwise this cell raises FileNotFoundError on a fresh checkout.
Path('models').mkdir(parents=True, exist_ok=True)
with open('models/lasso.bin', 'wb') as f_output:
    pickle.dump((dv, lasso_alpha), f_output)

In [83]:
# Persist the fitted DictVectorizer together with the tuned Ridge model as one
# pickled (vectorizer, model) tuple, so the same feature mapping can be applied
# before predicting.
# open(..., 'wb') does not create missing directories, so make sure `models/`
# exists first; otherwise this cell raises FileNotFoundError on a fresh checkout.
Path('models').mkdir(parents=True, exist_ok=True)
with open('models/ridge.bin', 'wb') as f_output:
    pickle.dump((dv, ridge_alpha), f_output)