## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from sklearn.preprocessing import PolynomialFeatures
# Load the insurance table from a file in the working directory. The Models class
# below drops its "children" column and reads "age", "bmi" and "charges".
data = pd.read_csv("insurance.txt")

## Class for Models

In [2]:
class Models:
    """Polynomial regression of ``charges`` on ``age`` and ``bmi``.

    For every polynomial degree requested from ``buildModels`` the class fits
    an unregularised, a Lasso (L1) and a Ridge (L2) model, once with batch
    gradient descent and once with stochastic gradient descent, and records
    one row per fitted model in ``GDModels`` / ``SGDModels``.

    Randomness (row shuffling, weight initialisation, penalty sampling) comes
    from pandas/NumPy's global random state; call ``np.random.seed`` before
    constructing the object for reproducible results.
    """

    # Column layout shared by the two result tables.
    _COLUMNS = ["training_error", "testing_error", "validation_error", "degree", "l1", "l2", "iterations"]

    def __init__(self, data):
        """Store a shuffled copy of ``data`` and create the empty result tables.

        Args:
            data: DataFrame with "age", "bmi", "charges" and "children"
                columns. "children" is dropped; the rows are shuffled so that
                the positional split in ``split`` is a random split.
        """
        self.data = data.drop(columns = ["children"]).sample(frac=1)
        self.GDModels = pd.DataFrame(columns = self._COLUMNS)
        self.SGDModels = pd.DataFrame(columns = self._COLUMNS)

    def extractFeatures(self):
        """Return ``(X, Y)``: X is (n, 2) with columns age and bmi, Y is (n, 1) charges."""
        age = np.array(self.data["age"])
        bmi = np.array(self.data["bmi"])
        charges = np.array(self.data["charges"]).reshape(-1, 1)
        return np.column_stack((age, bmi)), charges

    def polyFeatures(self, degree, X):
        """Expand ``X`` with scikit-learn's ``PolynomialFeatures(degree)``."""
        return PolynomialFeatures(degree).fit_transform(X)

    def normalize(self, X):
        """Min-max scale ``X`` into [0, 1].

        For a NumPy array the minimum and maximum are taken over the whole
        array (not per column), so relative magnitudes between columns are
        kept. A constant array would give 0/0; zeros are returned instead.
        """
        lowest = X.min()
        span = X.max() - lowest
        if np.ndim(span) == 0 and span == 0:
            return np.zeros_like(X, dtype=float)
        return (X - lowest) / span

    def split(self, X):
        """Split ``X`` by position into 70% train, 20% validation, 10% test."""
        train_end = int(0.7 * len(X))
        val_end = int(0.9 * len(X))
        return X[:train_end], X[train_end:val_end], X[val_end:]

    @staticmethod
    def _penalisedLoss(residual, W, l1, l2):
        """Half of (sum of squared residuals + l1 * sum|W| + l2 * sum W^2)."""
        return 0.5 * (np.sum(np.square(residual)) + l1 * np.sum(np.abs(W)) + l2 * np.sum(np.square(W)))

    def _descend(self, X, X_val, Y, Y_val, l1, l2, learning_rate, tol, max_iter, stochastic, label):
        """Shared optimisation loop behind ``GD`` and ``SGD``.

        The stopping test always uses the full-batch penalised loss; only the
        data term of the gradient differs (all rows vs. one row, visited
        cyclically in order).
        """
        n_samples, n_features = X.shape
        W = np.random.randn(n_features).reshape((n_features, 1))
        prev_error, error = 0.0, float("inf")
        count = 0
        while abs(prev_error - error) >= tol and count < max_iter - 1:
            residual = X @ W - Y
            prev_error = error
            error = self._penalisedLoss(residual, W, l1, l2)
            if stochastic:
                row = count % n_samples
                x = X[row].reshape(1, n_features)
                y = Y[row].reshape(1, 1)
                data_grad = x.T @ (x @ W - y)
            else:
                data_grad = X.T @ residual
            # np.sign(0) == 0 is a valid L1 subgradient; the previous x/abs(x)
            # form evaluated 0/0 and turned every weight into NaN.
            W -= learning_rate * (data_grad + 0.5 * l1 * np.sign(W) + l2 * W)
            count += 1
            if count % 5000 == 0:
                print(f"{label} = {count} || Error = {error} || L1 = {l1} || L2 = {l2}")
        val_loss = self._penalisedLoss(X_val @ W - Y_val, W, l1, l2)
        return error, val_loss, W, count + 1

    # Gradient Descent Algorithm
    def GD(self, X, X_val, Y, Y_val, l1 = 0, l2 = 0, learning_rate = 1e-5, tol = 1e-5, max_iter = 25000):
        """Fit weights by batch gradient descent.

        Args:
            X, Y: training design matrix (n, d) and targets (n, 1).
            X_val, Y_val: validation design matrix and targets.
            l1, l2: Lasso and Ridge penalty strengths.
            learning_rate: step size.
            tol: stop once the loss changes by less than this between steps.
            max_iter: at most ``max_iter - 1`` updates are performed.

        Returns:
            ``(error, val_loss, W, iterations)`` where ``error`` is the
            training loss measured just before the final update, ``val_loss``
            uses the final weights, ``W`` has shape (d, 1), and ``iterations``
            is the number of updates performed plus one.
        """
        return self._descend(X, X_val, Y, Y_val, l1, l2, learning_rate, tol, max_iter,
                             stochastic=False, label="Epochs")

    def SGD(self, X, X_val, Y, Y_val, l1 = 0, l2 = 0, learning_rate = 1e-7, tol = 1e-6, max_iter = 25000):
        """Fit weights by stochastic gradient descent, one training row per update.

        Rows are visited cyclically in their stored order. Arguments and the
        returned tuple have the same meaning as in ``GD``; the convergence
        test still evaluates the loss on the full training set.
        """
        return self._descend(X, X_val, Y, Y_val, l1, l2, learning_rate, tol, max_iter,
                             stochastic=True, label="Iterations")

    def plot(self, X_feat, W, degree, method):
        """Save a 3-D surface of the model's predictions to ``"<degree><method>.jpg"``.

        Args:
            X_feat: raw (n, 2) feature matrix, as returned by ``extractFeatures``.
            W: weights fitted on the normalised degree-``degree`` expansion.
            degree: polynomial degree used for the expansion.
            method: label appended to the file name (e.g. "GD").
        """
        x1 = X_feat[:, 0]
        x2 = X_feat[:, 1]
        # The weights were trained on normalised polynomial features, so the
        # same normalisation has to be applied before predicting. This matches
        # buildModels exactly when X_feat is the full feature matrix.
        X = self.normalize(self.polyFeatures(degree, X_feat))
        Y_pred = X @ W
        fig = plt.figure(figsize=(8, 8))
        # Figure.gca(projection=...) was removed from Matplotlib; add_subplot
        # is the supported way to create 3-D axes.
        axes = fig.add_subplot(projection='3d')
        axes.plot_trisurf(x1.flatten(), x2.flatten(), Y_pred.flatten(), cmap='viridis')
        axes.set_xlabel("X1")
        axes.set_ylabel("X2")
        axes.set_zlabel("Y pred")
        name = str(degree) + method
        fig.savefig(f"{name}.jpg")
        plt.close(fig)

    @staticmethod
    def _appendRow(table, row):
        """Return ``table`` with ``row`` added as a new float-valued last row.

        Replaces ``DataFrame.append``, which no longer exists in pandas 2.x.
        """
        new_row = pd.DataFrame([row], columns=table.columns, dtype=float)
        if table.empty:
            # Concatenating with an empty frame is deprecated in recent pandas.
            return new_row
        return pd.concat([table, new_row], ignore_index=True)

    def _fitPlain(self, fit, splits, degree, max_iter):
        """Fit one unregularised model; return ``(result_row, W)``."""
        X_train, X_val, X_test, Y_train, Y_val, Y_test = splits
        training_error, val_loss, W, iterations = fit(X_train, X_val, Y_train, Y_val, max_iter=max_iter)
        testing_error = 0.5 * np.sum(np.square(X_test @ W - Y_test))
        return [training_error, testing_error, val_loss, degree, 0, 0, iterations], W

    def _searchPenalty(self, fit, splits, degree, penalty, max_iter):
        """Try five random penalty strengths in [0, 1); return the row with the lowest validation loss.

        ``penalty`` is "l1" (Lasso) or "l2" (Ridge). If no candidate yields a
        validation loss below infinity (e.g. every run diverged to NaN) the
        row records infinite errors instead of scoring unrelated weights.
        """
        X_train, X_val, X_test, Y_train, Y_val, Y_test = splits
        best_val, best_fit = float("inf"), None
        for strength in np.random.rand(5):
            l1, l2 = (strength, 0) if penalty == "l1" else (0, strength)
            training_error, val_loss, W, iterations = fit(
                X_train, X_val, Y_train, Y_val, l1=l1, l2=l2, max_iter=max_iter)
            if best_val > val_loss:
                best_val = val_loss
                best_fit = (training_error, W, l1, l2, iterations)
        if best_fit is None:
            return [float("inf"), float("inf"), best_val, degree, 0, 0, 0]
        training_error, W, l1, l2, iterations = best_fit
        testing_error = self._penalisedLoss(X_test @ W - Y_test, W, l1, l2)
        return [training_error, testing_error, best_val, degree, l1, l2, iterations]

    def buildModels(self, low = 1, high = 10, max_iter = 25000):
        """Fit and record GD and SGD models for every degree in ``[low, high]``.

        Per degree, three rows (unregularised, best Lasso, best Ridge) are
        added to ``GDModels`` and three to ``SGDModels``. A prediction surface
        is saved for the unregularised GD model only.

        Args:
            low, high: inclusive range of polynomial degrees.
            max_iter: iteration cap forwarded to ``GD`` and ``SGD``.
        """
        optimisers = (
            (self.GD, "Gradient Descent", "GDModels", "GD"),
            (self.SGD, "Stochastic Gradient Descent", "SGDModels", None),
        )
        for degree in range(low, high + 1):
            X_feat, Y_feat = self.extractFeatures()
            X = self.normalize(self.polyFeatures(degree, X_feat))
            Y = self.normalize(Y_feat)
            X_train, X_val, X_test = self.split(X)
            Y_train, Y_val, Y_test = self.split(Y)
            splits = (X_train, X_val, X_test, Y_train, Y_val, Y_test)

            for fit, title, table_name, plot_tag in optimisers:
                print(f"{title} for Regression without Regularisation || model degree {degree}")
                row, W = self._fitPlain(fit, splits, degree, max_iter)
                setattr(self, table_name, self._appendRow(getattr(self, table_name), row))
                if plot_tag is not None:
                    self.plot(X_feat, W, degree, plot_tag)

                print(f"{title} for Lasso Regression || model degree {degree}")
                row = self._searchPenalty(fit, splits, degree, "l1", max_iter)
                setattr(self, table_name, self._appendRow(getattr(self, table_name), row))

                print(f"{title} for Ridge Regression || model degree {degree}")
                row = self._searchPenalty(fit, splits, degree, "l2", max_iter)
                setattr(self, table_name, self._appendRow(getattr(self, table_name), row))

## Building Polynomial Models

In [3]:
# The constructor drops "children" and shuffles the rows; buildModels then fits
# GD and SGD models for every polynomial degree from 1 through 10 (inclusive).
models = Models(data)
models.buildModels(1, 10)

Gradient Descent for Regression without Regularisation || model degree 1
Epochs = 5000 || Error = 16.661326407327348 || L1 = 0 || L2 = 0
Epochs = 10000 || Error = 15.22383188739969 || L1 = 0 || L2 = 0
Epochs = 15000 || Error = 15.033578765525245 || L1 = 0 || L2 = 0
Gradient Descent for Lasso Regression || model degree 1
Epochs = 5000 || Error = 15.517149583168495 || L1 = 0.10089824574957695 || L2 = 0
Epochs = 10000 || Error = 15.15465654209816 || L1 = 0.10089824574957695 || L2 = 0
Epochs = 5000 || Error = 15.180666909972222 || L1 = 0.22907959663840138 || L2 = 0
Epochs = 5000 || Error = 20.00919478723806 || L1 = 0.40231361702723634 || L2 = 0
Epochs = 10000 || Error = 15.888562417933713 || L1 = 0.40231361702723634 || L2 = 0
Epochs = 15000 || Error = 15.38760689572249 || L1 = 0.40231361702723634 || L2 = 0
Epochs = 5000 || Error = 16.146812293996515 || L1 = 0.015396446757239834 || L2 = 0
Epochs = 10000 || Error = 15.165560295685335 || L1 = 0.015396446757239834 || L2 = 0
Epochs = 5000 || Er

Iterations = 5000 || Error = 643.0699525475534 || L1 = 0 || L2 = 0.6918925977025664
Iterations = 10000 || Error = 640.0146895820403 || L1 = 0 || L2 = 0.6918925977025664
Iterations = 15000 || Error = 636.9770090165671 || L1 = 0 || L2 = 0.6918925977025664
Iterations = 20000 || Error = 633.9577552384294 || L1 = 0 || L2 = 0.6918925977025664
Iterations = 5000 || Error = 55.22451510552095 || L1 = 0 || L2 = 0.7899179812019103
Iterations = 10000 || Error = 55.090727939772954 || L1 = 0 || L2 = 0.7899179812019103
Iterations = 15000 || Error = 54.95816711299813 || L1 = 0 || L2 = 0.7899179812019103
Iterations = 20000 || Error = 54.8262599964836 || L1 = 0 || L2 = 0.7899179812019103
Iterations = 5000 || Error = 323.1019857601998 || L1 = 0 || L2 = 0.3412549046847546
Iterations = 10000 || Error = 322.0911930887434 || L1 = 0 || L2 = 0.3412549046847546
Iterations = 15000 || Error = 321.08538770803665 || L1 = 0 || L2 = 0.3412549046847546
Iterations = 20000 || Error = 320.0843784318341 || L1 = 0 || L2 = 0

Epochs = 20000 || Error = 20.676602632049317 || L1 = 0.7487727879432509 || L2 = 0
Epochs = 5000 || Error = 18.312549557097935 || L1 = 0.021235333182590743 || L2 = 0
Epochs = 10000 || Error = 17.56749598942693 || L1 = 0.021235333182590743 || L2 = 0
Epochs = 15000 || Error = 17.26124809068805 || L1 = 0.021235333182590743 || L2 = 0
Epochs = 20000 || Error = 17.129304775544867 || L1 = 0.021235333182590743 || L2 = 0
Epochs = 5000 || Error = 29.026095340166513 || L1 = 0.9225122792623942 || L2 = 0
Epochs = 10000 || Error = 24.616984212291232 || L1 = 0.9225122792623942 || L2 = 0
Epochs = 15000 || Error = 22.8025051056388 || L1 = 0.9225122792623942 || L2 = 0
Epochs = 20000 || Error = 22.010729218543396 || L1 = 0.9225122792623942 || L2 = 0
Epochs = 5000 || Error = 22.19963329511876 || L1 = 0.8638910401134705 || L2 = 0
Epochs = 10000 || Error = 21.489117624381418 || L1 = 0.8638910401134705 || L2 = 0
Epochs = 15000 || Error = 21.185079627824777 || L1 = 0.8638910401134705 || L2 = 0
Epochs = 20000 |

Epochs = 15000 || Error = 24.12616040268721 || L1 = 0 || L2 = 0.2983016347447037
Epochs = 20000 || Error = 23.67250104077385 || L1 = 0 || L2 = 0.2983016347447037
Epochs = 5000 || Error = 23.186044057125102 || L1 = 0 || L2 = 0.3578484704574658
Epochs = 10000 || Error = 22.466561661123734 || L1 = 0 || L2 = 0.3578484704574658
Epochs = 15000 || Error = 22.012372121906214 || L1 = 0 || L2 = 0.3578484704574658
Epochs = 20000 || Error = 21.70216640528968 || L1 = 0 || L2 = 0.3578484704574658
Stochastic Gradient Descent for Regression without Regularisation || model degree 5
Stochastic Gradient Descent for Lasso Regression || model degree 5
Iterations = 5000 || Error = 432.1802981620844 || L1 = 0.7987926433761312 || L2 = 0
Iterations = 10000 || Error = 431.2758161258892 || L1 = 0.7987926433761312 || L2 = 0
Iterations = 15000 || Error = 430.37254075532286 || L1 = 0.7987926433761312 || L2 = 0
Iterations = 20000 || Error = 429.47239264159526 || L1 = 0.7987926433761312 || L2 = 0
Iterations = 5000 ||

Iterations = 20000 || Error = 60.20618983688773 || L1 = 0.701945323399332 || L2 = 0
Stochastic Gradient Descent for Ridge Regression || model degree 6
Iterations = 5000 || Error = 113.38996256804946 || L1 = 0 || L2 = 0.8029561807561756
Iterations = 10000 || Error = 113.08106630151292 || L1 = 0 || L2 = 0.8029561807561756
Iterations = 15000 || Error = 112.77368172526346 || L1 = 0 || L2 = 0.8029561807561756
Iterations = 20000 || Error = 112.4677978878756 || L1 = 0 || L2 = 0.8029561807561756
Iterations = 5000 || Error = 116.24689494812823 || L1 = 0 || L2 = 0.20522800434644284
Iterations = 10000 || Error = 116.06393631339925 || L1 = 0 || L2 = 0.20522800434644284
Iterations = 15000 || Error = 115.88095516888784 || L1 = 0 || L2 = 0.20522800434644284
Iterations = 20000 || Error = 115.6986895553932 || L1 = 0 || L2 = 0.20522800434644284
Iterations = 5000 || Error = 320.48330882830305 || L1 = 0 || L2 = 0.11720627045733956
Iterations = 10000 || Error = 320.1372682119094 || L1 = 0 || L2 = 0.1172062

Epochs = 10000 || Error = 41.86666629177276 || L1 = 0.9286970859763951 || L2 = 0
Epochs = 15000 || Error = 40.89753367719598 || L1 = 0.9286970859763951 || L2 = 0
Epochs = 20000 || Error = 40.10125626466257 || L1 = 0.9286970859763951 || L2 = 0
Epochs = 5000 || Error = 39.528689148203014 || L1 = 0.6372878864317686 || L2 = 0
Epochs = 10000 || Error = 38.28528681476612 || L1 = 0.6372878864317686 || L2 = 0
Epochs = 15000 || Error = 37.244091544226386 || L1 = 0.6372878864317686 || L2 = 0
Epochs = 20000 || Error = 36.36749078579892 || L1 = 0.6372878864317686 || L2 = 0
Epochs = 5000 || Error = 43.29311412268251 || L1 = 0.740153029636006 || L2 = 0
Epochs = 10000 || Error = 41.94936753205364 || L1 = 0.740153029636006 || L2 = 0
Epochs = 15000 || Error = 40.82860681698663 || L1 = 0.740153029636006 || L2 = 0
Epochs = 20000 || Error = 39.88366920326484 || L1 = 0.740153029636006 || L2 = 0
Gradient Descent for Ridge Regression || model degree 8
Epochs = 5000 || Error = 41.05962658838405 || L1 = 0 || L

Epochs = 5000 || Error = 45.34136428455445 || L1 = 0 || L2 = 0.6834238536984264
Epochs = 10000 || Error = 43.579568171221105 || L1 = 0 || L2 = 0.6834238536984264
Epochs = 15000 || Error = 42.02404767694773 || L1 = 0 || L2 = 0.6834238536984264
Epochs = 20000 || Error = 40.63521914687309 || L1 = 0 || L2 = 0.6834238536984264
Stochastic Gradient Descent for Regression without Regularisation || model degree 9
Stochastic Gradient Descent for Lasso Regression || model degree 9
Iterations = 5000 || Error = 47.459074572260576 || L1 = 0.6708547179262093 || L2 = 0
Iterations = 10000 || Error = 47.419395187848465 || L1 = 0.6708547179262093 || L2 = 0
Iterations = 15000 || Error = 47.38002881780507 || L1 = 0.6708547179262093 || L2 = 0
Iterations = 20000 || Error = 47.34072901408559 || L1 = 0.6708547179262093 || L2 = 0
Iterations = 5000 || Error = 53.54893156657791 || L1 = 0.5687077473522449 || L2 = 0
Iterations = 10000 || Error = 53.49830158621623 || L1 = 0.5687077473522449 || L2 = 0
Iterations = 15

## Models

In [4]:
# Gradient-descent results: buildModels adds three rows per degree
# (unregularised, best Lasso, best Ridge).
models.GDModels

Unnamed: 0,training_error,testing_error,validation_error,degree,l1,l2,iterations
0,15.029274,2.676656,4.692667,1.0,0.0,0.0,15398.0
1,15.041453,2.688825,4.704846,1.0,0.015396,0.0,14440.0
2,15.059731,2.691367,4.71624,1.0,0.0,0.985359,5579.0
3,14.96707,2.554342,4.721181,2.0,0.0,0.0,12753.0
4,15.583766,2.987783,5.244362,2.0,0.305724,0.0,18979.0
5,15.088705,2.680408,4.838137,2.0,0.0,0.222962,3466.0
6,15.637402,2.568264,4.967355,3.0,0.0,0.0,19638.0
7,16.900286,3.592588,6.075878,3.0,0.300785,0.0,25000.0
8,17.214147,3.341626,6.014291,3.0,0.0,0.209735,25000.0
9,17.239426,2.797967,5.514245,4.0,0.0,0.0,25000.0


In [5]:
# Stochastic-gradient-descent results: buildModels adds three rows per degree
# (unregularised, best Lasso, best Ridge).
models.SGDModels

Unnamed: 0,training_error,testing_error,validation_error,degree,l1,l2,iterations
0,817.47985,108.84223,229.261763,1.0,0.0,0.0,308.0
1,67.267875,9.041979,18.533597,1.0,0.287887,0.0,108.0
2,22.710326,3.527573,6.805737,1.0,0.0,0.006887,3.0
3,54.213145,9.430062,17.588146,2.0,0.0,0.0,25.0
4,19.249783,3.614329,6.332257,2.0,0.229559,0.0,7.0
5,54.694241,11.220103,18.992485,2.0,0.0,0.789918,25000.0
6,142.365855,17.248347,40.70005,3.0,0.0,0.0,12.0
7,20.014335,4.014434,6.748319,3.0,0.197496,0.0,3.0
8,21.271077,4.643823,8.017882,3.0,0.0,0.291099,3.0
9,198.729311,23.614013,57.597105,4.0,0.0,0.0,12.0
