In [1]:
# Record the version of the interpreter that is actually running this kernel.
# A shell call such as `!python -V` reports whichever `python` is first on the
# PATH, which is not necessarily the kernel's interpreter.
import platform

print("Python", platform.python_version())

Python 3.9.13


In [2]:
# filesystem helpers (creating the models/ output folder)
import os

# saving model to file
import pickle

# data manipulation and storage
import pandas as pd

# plotting and graphs
import matplotlib.pyplot as plt
import seaborn as sns

# data preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

# model performance metrics
from sklearn.metrics import mean_squared_error


In [3]:
# Load the training and validation splits from the parquet files under ./data.
# The generator is consumed in order, so the training file is read first.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/train_data.parquet',
        './data/validation_data.parquet',
    )
)

In [4]:
# Feature configuration
categorical = ['PU_DO_pair']
# alternative categorical features: ['PULocationID','DOLocationID']
numerical = ['trip_distance','fare_amount']
target = 'duration'

# Work on copies of the feature columns so that df_train / df_val keep their raw
# values. Scaling the source frames in place would make this cell non-idempotent:
# a second run would re-fit the scaler on already-standardised data, and that
# (near-identity) scaler is the one that later gets pickled with the model.
train_features = df_train[categorical + numerical].copy()
val_features = df_val[categorical + numerical].copy()

# Pre Processing - Categorical
# Cast to str BEFORE building the record dicts: DictVectorizer one-hot encodes
# string values but treats any non-string value as a plain numeric feature.
train_features[categorical] = train_features[categorical].astype(str)
val_features[categorical] = val_features[categorical].astype(str)

# Pre Processing - Numerical
# Fit the scaler on the training split only and reuse it for validation.
scaler = StandardScaler()
train_features[numerical] = scaler.fit_transform(train_features[numerical])
val_features[numerical] = scaler.transform(val_features[numerical])

# One dict per row, as expected by DictVectorizer.
train_dicts = train_features.to_dict(orient='records')
val_dicts = val_features.to_dict(orient='records')

# Vectorise: fit the vocabulary on the training split, then apply it to validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

In [5]:
# Fit Linear Regression Model and save fitted model under /models folders
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_val)

# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which is deprecated since scikit-learn 1.4 and removed in 1.6.
rmse_linear = mean_squared_error(y_val, y_pred_linear) ** 0.5

# Persist the preprocessing objects together with the model so that inference
# can apply exactly the same scaling and vectorisation.
os.makedirs('models', exist_ok=True)  # open() does not create missing folders
with open('models/linear_model.bin', 'wb') as f_out:
    pickle.dump((scaler, dv, linear_model), f_out)

# Last expression of the cell, so the notebook displays the score.
rmse_linear

In [6]:
# Fit Ridge Regression Model and save fitted model under /models folders
ridge_model = Ridge(alpha=0.0001)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_val)

# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which is deprecated since scikit-learn 1.4 and removed in 1.6.
rmse_ridge = mean_squared_error(y_val, y_pred_ridge) ** 0.5

# Persist the preprocessing objects together with the model so that inference
# can apply exactly the same scaling and vectorisation.
os.makedirs('models', exist_ok=True)  # open() does not create missing folders
with open('models/ridge_model.bin', 'wb') as f_out:
    pickle.dump((scaler, dv, ridge_model), f_out)

# Last expression of the cell, so the notebook displays the score.
rmse_ridge

In [7]:
# Fit Lasso Regression Model and save fitted model under /models folders
lasso_model = Lasso(alpha=0.0001)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_val)

# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which is deprecated since scikit-learn 1.4 and removed in 1.6.
rmse_lasso = mean_squared_error(y_val, y_pred_lasso) ** 0.5

# Persist the preprocessing objects together with the model so that inference
# can apply exactly the same scaling and vectorisation.
os.makedirs('models', exist_ok=True)  # open() does not create missing folders
with open('models/lasso_model.bin', 'wb') as f_out:
    pickle.dump((scaler, dv, lasso_model), f_out)

# Last expression of the cell, so the notebook displays the score.
rmse_lasso