In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('advertising.csv')

In [3]:
df.shape

(200, 5)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [5]:
df = df.iloc[:,1:]

In [6]:
df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [7]:
X, y = df.iloc[:,:-1] , df.iloc[:,-1]

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=234)

## Early stopping using custom code

**A very different way to regularize iterative learning algorithms such as gradient descent is to stop the training as soon as the validation error reaches a minimum. This is called an early stopping or beautiful free lunch.**

**As the epochs go by the algorithm learns, and its predictions(MSE) on the training set goes down, along with its prediction error on the validation error on the validation set. After a while though, the validation error stops decreasing and starts to go back up. This indicates that the model has started to overfit the training data. With early stopping you just stop training as soon as the validation error reaches the minimum.**

**NOTE: Epoch is a machine learning term that refers to the number of passes the training data goes through the algorithm** 

In [9]:
from sklearn.pipeline import Pipeline 
from sklearn.base import clone
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [10]:
poly_scalar = Pipeline([
    ('poly_feat',PolynomialFeatures(degree=3,include_bias=False)),
    ('std_scalar',StandardScaler())
])

X_train_transformed = poly_scalar.fit_transform(X_train)
X_test_transformed = poly_scalar.transform(X_test)

In [11]:
reg = SGDRegressor(max_iter=1,tol=-np.infty,warm_start=True,penalty=None,learning_rate='constant',eta0=0.0005)

In [12]:
minimum_value_error = float('inf')
best_epoch = None
best_model = None
for epoch in range(1000):
    reg.fit(X_train_transformed,y_train)
    y_pred = reg.predict(X_test_transformed)
    val_error = mean_squared_error(y_test,y_pred)
    if val_error < minimum_value_error:
        minimum_value_error = val_error
        best_epoch = epoch
        best_model = clone(reg)

In [13]:
## creating a function to find the r2 adjusted score

def r2_adj(r):
    
    n = df.shape[0]
    m = df.shape[1]
    
    result =  1 - ((1-r**2)*((n-1)/(n-m-1)))
    return result

In [14]:
print(f"The r2 score of the stocastic gradient descent model is {np.round(r2_score(y_test,y_pred),5)}")
print(f"The r2 score of the stocastic gradient descent model is {np.round(r2_adj(r2_score(y_test,y_pred)),5)}")

The r2 score of the stocastic gradient descent model is 0.99181
The r2 score of the stocastic gradient descent model is 0.98335


## Early stopping using the scikit-learn

In [15]:
reg2 = SGDRegressor(random_state=2374,learning_rate='constant',early_stopping=True,penalty=None)

In [16]:
poly_scalar2 = Pipeline([
    ('std_scalar',StandardScaler())
])

In [17]:
X_train_scaled = poly_scalar2.fit_transform(X_train)

In [18]:
param = {}
param['max_iter'] = np.arange(100,2000,100)
param['tol'] = [1e-4,1e-3, 1e-2,1e-1]
param['eta0'] = np.arange(0.1,1.1,0.1)

In [19]:
from sklearn.model_selection import RandomizedSearchCV

r = RandomizedSearchCV(reg2,param,random_state=23409)
r.fit(X_train_scaled,y_train)

RandomizedSearchCV(estimator=SGDRegressor(early_stopping=True,
                                          learning_rate='constant',
                                          penalty=None, random_state=2374),
                   param_distributions={'eta0': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                                        'max_iter': array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
       1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900]),
                                        'tol': [0.0001, 0.001, 0.01, 0.1]},
                   random_state=23409)

In [20]:
r.best_params_

{'tol': 0.001, 'max_iter': 1300, 'eta0': 0.2}

In [26]:
reg2.set_params(tol=0.001,max_iter=1300,eta0=0.2)

SGDRegressor(early_stopping=True, eta0=0.2, learning_rate='constant',
             max_iter=1300, penalty=None, random_state=2374)

In [27]:
reg2.fit(X_train_scaled,y_train)

SGDRegressor(early_stopping=True, eta0=0.2, learning_rate='constant',
             max_iter=1300, penalty=None, random_state=2374)

In [28]:
X_test_scaled = poly_scalar2.transform(X_test)

In [29]:
y_pred2 = reg2.predict(X_test_scaled)

In [30]:
print(f"The r2 score of the stocastic gradient descent model is {np.round(r2_score(y_test,y_pred2),5)}")
print(f"The r2 score of the stocastic gradient descent model is {np.round(r2_adj(r2_score(y_test,y_pred2)),5)}")

The r2 score of the stocastic gradient descent model is 0.91237
The r2 score of the stocastic gradient descent model is 0.82898
