Prepared by: Ömer Coşkun <br>
*Quick note: This notebook is prepared to show what an acceptable solution should look like. Better solutions are always possible.*

In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt

# true_w = np.array([0.3, 1.55, 2.3, -0.4, -3.1, 3.3, 1.1])
# d = len(true_w)
# 
# points = []
# for i in range(100000):
#     x1, x2, x3, x4, x5 = np.random.randn(5)
#     y = (0.3 * x1) + (1.55 * x1**2) + (2.3 * x2) - (0.4 * x3) - (3.1 * x4) + (3.3 * x5) + 1.1 + np.random.randn()
#     points.append(((x1, x2, x3, x4, x5), y))
# 
# with open("dataset.pkl", "wb") as file:
#     pickle.dump(points, file)

## Loading and reshaping the dataset

In [2]:
# Read the pickled samples; each entry is a ((x1, ..., x5), y) pair.
# NOTE: unpickling can execute arbitrary code, so only load a trusted dataset.pkl.
with open("dataset.pkl", "rb") as file:
    points = pickle.load(file)

# Targets stay a plain Python list; each raw feature becomes its own 1-D array.
y_true = [target for _, target in points]
x1, x2, x3, x4, x5 = (
    np.array([features[k] for features, _ in points]) for k in range(5)
)

# Squared version of every raw feature.
x1_sq, x2_sq, x3_sq, x4_sq, x5_sq = (
    column ** 2 for column in (x1, x2, x3, x4, x5)
)

# Design matrix: 5 raw features, 5 squared features, and a trailing bias column of ones.
X = np.column_stack([
    x1, x2, x3, x4, x5,
    x1_sq, x2_sq, x3_sq, x4_sq, x5_sq,
    np.ones(len(x1)),
])
print(X.shape)

(100000, 11)


## Custom SGD implementation

In [3]:
def loss(w, features=None, targets=None):
    """Mean squared error of the linear model ``w`` over a dataset.

    Computes ``mean((features @ w - targets) ** 2)`` in a single vectorised
    pass instead of looping over the rows in Python. The result can differ
    from a row-by-row sum in the last few floating-point digits because the
    summation order is different.

    Parameters
    ----------
    w : array-like, shape (d,)
        Weight vector of the linear model.
    features : array-like, shape (n, d), optional
        Design matrix to evaluate on. Defaults to the notebook-level ``X``.
    targets : array-like, shape (n,), optional
        Target values matching ``features`` row by row. Defaults to the
        notebook-level ``y_true``.

    Returns
    -------
    numpy.float64
        The average squared residual over the n rows.
    """
    # Fall back to the notebook's global dataset so existing `loss(w)` calls keep working.
    if features is None:
        features = X
    if targets is None:
        targets = y_true

    residuals = np.asarray(features) @ np.asarray(w) - np.asarray(targets)
    return np.mean(residuals ** 2)

def stochastic_dF(w, i):
    """Single-sample gradient direction of the squared error at weights ``w``.

    Looks up row ``i`` of the notebook-level ``X`` and entry ``i`` of
    ``y_true``, and returns the residual ``w . x - y`` multiplied by the
    feature vector ``x``. The constant factor 2 of the exact derivative of
    ``(w . x - y) ** 2`` is not included here.
    """
    sample = X[i]
    target = y_true[i]
    residual = np.dot(w, sample.T) - target
    return residual * sample

def stochastic_gradient_descent(loss, stochastic_dF, n, epochs=25, eta=0.00001, dim=11):
    """Minimise ``loss`` with plain stochastic gradient descent.

    Starts from the zero vector and, for each epoch, performs ``n`` updates
    ``w -= eta * stochastic_dF(w, j)`` where ``j`` is drawn uniformly from
    ``0 .. n-1``. After every epoch the rounded weights and the current loss
    are printed.

    Parameters
    ----------
    loss : callable
        ``loss(w)`` returns the objective value for weights ``w``; it is only
        used for the per-epoch progress line.
    stochastic_dF : callable
        ``stochastic_dF(w, j)`` returns the gradient estimate for sample ``j``.
    n : int
        Number of samples; also the number of updates per epoch.
    epochs : int, optional
        Number of passes (default 25).
    eta : float, optional
        Constant step size (default 0.00001).
    dim : int, optional
        Length of the weight vector (default 11).

    Returns
    -------
    numpy.ndarray
        The weight vector after the final epoch.
    """
    w = np.zeros(dim)
    for t in range(epochs):
        for _ in range(n):
            # randint's upper bound is exclusive: passing n (not n-1) makes the
            # last sample reachable and keeps n == 1 from raising ValueError.
            j = np.random.randint(0, n)
            w -= eta * stochastic_dF(w, j)
        print("iteration {}: w = {}, F(w) = {}".format(t+1, np.round(w, 2), loss(w)))
    return w

# Train on the whole dataset: n is the number of loaded points.
stochastic_gradient_descent(loss, stochastic_dF, len(points))

iteration 1: w = [ 0.17  1.45 -0.25 -1.95  2.09  1.45  0.12  0.14  0.15  0.13  0.51], F(w) = 4.713347217024904
iteration 2: w = [ 0.25  1.98 -0.34 -2.68  2.84  1.58  0.07  0.08  0.08  0.08  0.64], F(w) = 1.5584604579315826
iteration 3: w = [ 0.28  2.18 -0.38 -2.94  3.13  1.59  0.05  0.05  0.06  0.05  0.75], F(w) = 1.1088771644071682
iteration 4: w = [ 0.3   2.25 -0.4  -3.04  3.23  1.58  0.04  0.05  0.04  0.04  0.83], F(w) = 1.0361429414198755
iteration 5: w = [ 0.3   2.28 -0.4  -3.07  3.28  1.58  0.03  0.03  0.04  0.03  0.89], F(w) = 1.0193378452377073
iteration 6: w = [ 0.31  2.29 -0.41 -3.09  3.29  1.57  0.03  0.03  0.03  0.02  0.94], F(w) = 1.0129879702686486
iteration 7: w = [ 0.31  2.29 -0.41 -3.1   3.3   1.56  0.02  0.02  0.02  0.02  0.98], F(w) = 1.0100658697879412
iteration 8: w = [ 0.31  2.29 -0.4  -3.1   3.3   1.56  0.01  0.02  0.02  0.01  1.  ], F(w) = 1.008377665487368
iteration 9: w = [ 0.3   2.29 -0.41 -3.09  3.3   1.56  0.01  0.01  0.02  0.01  1.03], F(w) = 1.00726860494

## You can refer to sklearn for the solution to the Logistic Regression part; it is quite straightforward.