In [1]:
import os, sys
# Put the parent directory (the project root) on the import path so that the
# `src` package imported further down resolves when running from this folder.
project_root = os.path.abspath(os.pardir)
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)
    
import pandas as pd
import numpy as np
import time

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (root_mean_squared_error, 
                             r2_score, 
                             root_mean_squared_log_error, 
                             max_error, 
                             explained_variance_score,
                             median_absolute_error, 
                             d2_absolute_error_score)

from src.python_scratch.linear_model import timeit, ScratchSGDRegressor

In [6]:
# Load the feature table and split it into predictors and target.
# The path is assembled with os.path.join from the already-resolved
# project_root so it works on POSIX as well as Windows; a hard-coded
# backslash-separated string only resolves on Windows.
ames_csv_path = os.path.join(project_root, 'data', 'regression', 'AMES_Final_DF.csv')
df = pd.read_csv(ames_csv_path)
X = df.drop(columns='SalePrice')   # every column except the target
y = df['SalePrice']                # regression target

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
# Reference model: scikit-learn's SGDRegressor with squared-error loss, a
# constant step size and no regularisation penalty.
# NOTE(review): `tol` is left at its default, so sklearn may stop before
# max_iter epochs — confirm that is intended when comparing fit_time below.
sk_model = SGDRegressor(
    loss="squared_error",
    penalty=None,
    fit_intercept=True,
    learning_rate="constant",
    eta0=1e-4,
    max_iter=300,
    shuffle=True,
    random_state=42,
)

# Wall-clock cost of training.
start = time.perf_counter()
sk_model.fit(X_train_scaled, y_train)
fit_time_sk = time.perf_counter() - start

# Wall-clock cost of predicting on the held-out rows.
start = time.perf_counter()
sk_y_pred = sk_model.predict(X_test_scaled)
pred_time_sk = time.perf_counter() - start

# Score the predictions; every metric is called as metric(y_true, y_pred).
sk_RMSE, sk_R2, sk_D2, sk_MAXE, sk_MedAE, sk_EVS = (
    metric(y_test, sk_y_pred)
    for metric in (
        root_mean_squared_error,
        r2_score,
        d2_absolute_error_score,
        max_error,
        median_absolute_error,
        explained_variance_score,
    )
)

# One summary row for the comparison table; key order sets the column order.
sklearn_stack_SGD = dict(
    RMSE=sk_RMSE,
    R2=sk_R2,
    D2=sk_D2,
    MAXE=sk_MAXE,
    MedAE=sk_MedAE,
    EVS=sk_EVS,
    fit_time=fit_time_sk,
    pred_time=pred_time_sk,
)

In [21]:
# From-scratch regressor run with mode='sgd', using the same step size,
# iteration budget and seed as the sklearn reference model.
# NOTE(review): the exact update rule behind mode='sgd' lives in
# ScratchSGDRegressor — presumably per-sample updates; confirm there.
scr_model = ScratchSGDRegressor(
    mode='sgd',
    lr=1e-4,
    max_iter=300,
    random_state=42,
)

# Wall-clock cost of training.
start = time.perf_counter()
scr_model.fit(X_train_scaled, y_train)
fit_time_scr = time.perf_counter() - start

# Wall-clock cost of predicting on the held-out rows.
start = time.perf_counter()
scr_y_pred = scr_model.predict(X_test_scaled)
pred_time_scr = time.perf_counter() - start

# Score the predictions; every metric is called as metric(y_true, y_pred).
scr_RMSE, scr_R2, scr_D2, scr_MAXE, scr_MedAE, scr_EVS = (
    metric(y_test, scr_y_pred)
    for metric in (
        root_mean_squared_error,
        r2_score,
        d2_absolute_error_score,
        max_error,
        median_absolute_error,
        explained_variance_score,
    )
)

# Snapshot the results now: the scr_* names are reused by the later cells.
scratch_stack_sgd = dict(
    RMSE=scr_RMSE,
    R2=scr_R2,
    D2=scr_D2,
    MAXE=scr_MAXE,
    MedAE=scr_MedAE,
    EVS=scr_EVS,
    fit_time=fit_time_scr,
    pred_time=pred_time_scr,
)

100%|██████████| 300/300 [01:39<00:00,  3.03it/s]


In [20]:
# From-scratch regressor run with mode='batch', a larger step size (0.1)
# and a 50-iteration budget.
# NOTE(review): the exact update rule behind mode='batch' lives in
# ScratchSGDRegressor — presumably full-dataset gradients; confirm there.
scr_model = ScratchSGDRegressor(
    mode='batch',
    lr=0.1,
    max_iter=50,
    random_state=42,
)

# Wall-clock cost of training.
start = time.perf_counter()
scr_model.fit(X_train_scaled, y_train)
fit_time_scr = time.perf_counter() - start

# Wall-clock cost of predicting on the held-out rows.
start = time.perf_counter()
scr_y_pred = scr_model.predict(X_test_scaled)
pred_time_scr = time.perf_counter() - start

# Score the predictions; every metric is called as metric(y_true, y_pred).
scr_RMSE, scr_R2, scr_D2, scr_MAXE, scr_MedAE, scr_EVS = (
    metric(y_test, scr_y_pred)
    for metric in (
        root_mean_squared_error,
        r2_score,
        d2_absolute_error_score,
        max_error,
        median_absolute_error,
        explained_variance_score,
    )
)

# Snapshot the results now: the scr_* names are shared with the other
# scratch-model cells and get overwritten when those run.
scratch_stack_batch = dict(
    RMSE=scr_RMSE,
    R2=scr_R2,
    D2=scr_D2,
    MAXE=scr_MAXE,
    MedAE=scr_MedAE,
    EVS=scr_EVS,
    fit_time=fit_time_scr,
    pred_time=pred_time_scr,
)

100%|██████████| 50/50 [00:13<00:00,  3.66it/s]


In [18]:
# From-scratch regressor run with mode='minibatch' (batch_size=100), step
# size 0.01 and a 50-iteration budget.
# NOTE(review): how batches are drawn is defined in ScratchSGDRegressor —
# confirm there before interpreting the timing.
scr_model = ScratchSGDRegressor(
    mode='minibatch',
    batch_size=100,
    lr=0.01,
    max_iter=50,
    random_state=42,
)

# Wall-clock cost of training.
start = time.perf_counter()
scr_model.fit(X_train_scaled, y_train)
fit_time_scr = time.perf_counter() - start

# Wall-clock cost of predicting on the held-out rows.
start = time.perf_counter()
scr_y_pred = scr_model.predict(X_test_scaled)
pred_time_scr = time.perf_counter() - start

# Score the predictions; every metric is called as metric(y_true, y_pred).
scr_RMSE, scr_R2, scr_D2, scr_MAXE, scr_MedAE, scr_EVS = (
    metric(y_test, scr_y_pred)
    for metric in (
        root_mean_squared_error,
        r2_score,
        d2_absolute_error_score,
        max_error,
        median_absolute_error,
        explained_variance_score,
    )
)

# Snapshot the results now: the scr_* names are shared with the other
# scratch-model cells and get overwritten when those run.
scratch_stack_minibatch = dict(
    RMSE=scr_RMSE,
    R2=scr_R2,
    D2=scr_D2,
    MAXE=scr_MAXE,
    MedAE=scr_MedAE,
    EVS=scr_EVS,
    fit_time=fit_time_scr,
    pred_time=pred_time_scr,
)

100%|██████████| 50/50 [00:13<00:00,  3.59it/s]


In [24]:
# Assemble one row per model (row labels listed in the same order as the
# result dicts) and show the table as the cell's output.
stack = pd.DataFrame(
    [
        sklearn_stack_SGD,
        scratch_stack_sgd,
        scratch_stack_batch,
        scratch_stack_minibatch,
    ],
    index=['sklearn', 'scratch_SGD', 'scratch_batch', 'scratch_minibatch'],
)
stack

Unnamed: 0,RMSE,R2,D2,MAXE,MedAE,EVS,fit_time,pred_time
sklearn,21324.471909,0.929315,0.739998,156304.745809,10803.112127,0.92934,0.163103,0.001189
scratch_SGD,21364.122665,0.929052,0.739326,155701.105966,11040.891428,0.929084,99.108268,0.058748
scratch_batch,21537.689173,0.927895,0.738676,158379.134882,10885.598529,0.927922,13.66462,0.049173
scratch_minibatch,21334.075047,0.929251,0.73891,154059.110644,10773.805977,0.92928,13.922287,0.048415
