In [3]:
import numpy as np
import matplotlib.pyplot as plt
import csv

In [302]:
# Path of the training CSV, relative to the notebook's working directory.
file_loc = "coding_interview_material/data/regression_train.csv"

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to csv.reader.
with open(file_loc, 'r', newline = '') as train_file:
    data = list(csv.reader(train_file, delimiter = ","))

# Skip the header row and the first column, then parse everything else as float.
# NOTE(review): the first column is presumably a row index - confirm against the CSV.
data = np.array(data[1: ], dtype = float)[:, 1: ]

# Every column but the last is a feature; the last column is the target,
# kept as an (n, 1) column vector.
n = data.shape[0]
x = data[:, : -1]
y = data[:, -1].reshape((n, 1))

# Standardise each feature column. means/stds are kept as globals so that the
# same scaling can be applied to other data later in the notebook.
means = np.mean(x, axis = 0)
stds = np.std(x, axis = 0)
# A constant column has zero spread; dividing by 1 instead leaves it at 0
# after centring rather than filling it with NaN.
stds[stds == 0] = 1.0
x = (x - means)/stds

In [220]:
def mse(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    The squared residuals are summed over every element and the total is
    scaled by the reciprocal of the number of rows in ``y_true``.
    """
    residuals = y_true - y_pred
    sample_count = y_true.shape[0]
    return np.sum(np.square(residuals)) * (1 / sample_count)

In [221]:
def predict(beta, bias, x):
    """Evaluate the linear model: features times coefficients, plus the intercept."""
    linear_term = x @ beta
    return linear_term + bias

In [222]:
def fit(x, y, l_rate = 1e-2, epochs = 1e4, rel_stop = 1e-4):
    """Fit a linear model by batch gradient descent on the mean squared error.

    Parameters
    ----------
    x : array with one row per sample and one column per feature.
    y : targets; subtracted element-wise from ``predict(beta, bias, x)``,
        which has one column, so a column vector of shape (n, 1) is expected.
    l_rate : step size. It is divided by 10 each time the relative change in
        error between two consecutive epochs exceeds 1.
    epochs : maximum number of iterations (converted with ``int``).
    rel_stop : training stops once the relative change in error between two
        consecutive epochs falls below this value.

    Returns
    -------
    beta : coefficient column vector, one row per feature.
    bias : intercept.
    errors : the MSE measured at the start of every epoch, followed by the MSE
        of the returned parameters. After an early stop that last entry
        repeats the one before it, because no update happens in between.
    """
    n = x.shape[0]
    beta = np.zeros((x.shape[1], 1))
    bias = 0
    errors = []

    for i in range(int(epochs)):
        y_pred = predict(beta, bias, x)
        errors.append(mse(y, y_pred))
        # The stopping rule is first evaluated on the third epoch.
        if i > 1:
            prev_error, curr_error = errors[-2], errors[-1]
            if prev_error == 0 and curr_error == 0:
                # 0/0 would be NaN, which fails both comparisons below and
                # keeps the loop spinning until `epochs` runs out. Two exact
                # zeros mean nothing changed, so report no change.
                rel_error = 0.0
            else:
                rel_error = abs(prev_error - curr_error)/prev_error
            if rel_error < rel_stop:
                print(f"Convergence after {i+1} epochs")
                break
            elif rel_error > 1:
                print("Learning rate too high, error diverging, reducing rate")
                l_rate /= 10
        residual = y_pred - y
        beta -= (l_rate/n * np.matmul(x.T, residual))
        bias -= (l_rate/n * np.sum(residual))

    # Error of the parameters that are actually returned.
    errors.append(mse(y, predict(beta, bias, x)))
    print(f"MSE: {errors[-1]}")
    return beta, bias, errors

In [517]:
def fit_quad_regularized(x, y, lamda = 1, l_rate = 0.1, epochs = 1e5, rel_stop = 1e-4):
    """Fit a penalised model on the features and their squares by gradient descent.

    The input features are extended with their element-wise squares
    (``np.hstack((x, x**2))``), so ``beta`` has twice as many rows as ``x``
    has columns. Anything passed to ``predict`` together with the returned
    parameters has to be extended the same way.

    Parameters
    ----------
    x : array with one row per sample and one column per feature.
    y : targets; subtracted element-wise from the one-column prediction, so a
        column vector of shape (n, 1) is expected.
    lamda : penalty weight. ``lamda * beta`` is added to the coefficient
        gradient; the intercept is not penalised.
    l_rate : step size. It is divided by 10 each time the relative change in
        error between two consecutive epochs exceeds 1.
    epochs : maximum number of iterations (converted with ``int``).
    rel_stop : training stops once the relative change in error between two
        consecutive epochs falls below this value.

    Returns
    -------
    beta : coefficient column vector (original features first, squares after).
    bias : intercept.
    errors : the plain MSE (penalty not included) measured at the start of
        every epoch, followed by the MSE of the returned parameters. After an
        early stop that last entry repeats the one before it.
    """
    n = x.shape[0]
    beta = np.zeros((x.shape[1] * 2, 1))
    bias = 0
    errors = []
    x = np.hstack((x, x**2))

    for i in range(int(epochs)):
        y_pred = predict(beta, bias, x)
        errors.append(mse(y, y_pred))
        # The stopping rule is first evaluated on the third epoch.
        if i > 1:
            prev_error, curr_error = errors[-2], errors[-1]
            if prev_error == 0 and curr_error == 0:
                # 0/0 would be NaN, which fails both comparisons below and
                # keeps the loop spinning until `epochs` runs out. Two exact
                # zeros mean nothing changed, so report no change.
                rel_error = 0.0
            else:
                rel_error = abs(prev_error - curr_error)/prev_error
            if rel_error < rel_stop:
                print(f"Convergence after {i+1} epochs")
                break
            elif rel_error > 1:
                print("Learning rate too high, error diverging, reducing rate")
                l_rate /= 10
        residual = y_pred - y
        beta -= (l_rate * (1/n * np.matmul(x.T, residual) + lamda * beta))
        bias -= (l_rate/n * np.sum(residual))

    # Error of the parameters that are actually returned.
    errors.append(mse(y, predict(beta, bias, x)))
    print(f"MSE: {errors[-1]}")
    return beta, bias, errors

In [518]:
# Alternative: fit the plain linear model instead.
# beta, bias, errors = fit(x, y)
# Fit the penalised model on the features plus their squares.
beta, bias, errors = fit_quad_regularized(x, y)

Convergence after 142 epochs
MSE: 4.978009940609211


In [519]:
# Show the fitted coefficients and intercept.
beta, bias

(array([[ 0.00905922],
        [ 1.04687001],
        [ 0.22540585],
        [-0.06685039],
        [ 0.21733617],
        [-0.05271164]]),
 26.484313305251746)

In [520]:
# Derive the test path from the training path. Only the last "train" is
# swapped, so a directory name that happens to contain it is left alone.
# newline='' lets the csv module do its own newline handling.
with open("test".join(file_loc.rsplit("train", 1)), 'r', newline = '') as f:
    test = list(csv.reader(f, delimiter = ","))

# Same layout as the training file: skip the header row and the first column.
test = np.array(test[1: ], dtype = float)[:, 1: ]

n_test = test.shape[0]
x_test = test[:, : -1]
y_test = test[:, -1].reshape((n_test, 1))

# Scale with the statistics computed on the training features, not with the
# test set's own mean and spread.
x_test = (x_test - means)/stds

In [521]:
# Inspect the test features after scaling.
x_test

array([[ 1.28125466,  0.85508321,  0.19025511],
       [ 0.66498203, -0.07752632,  1.0081822 ],
       [-0.90211123, -1.3169153 , -0.71855277],
       [ 1.08756898,  0.1924396 , -0.08238726],
       [-0.51473986, -0.10206868, -0.71855277],
       [-0.51473986, -0.2002381 , -1.17295671],
       [ 0.10153277, -0.40884813, -0.26414883],
       [-0.09215292, -0.88742407, -0.71855277],
       [-0.9549346 ,  1.78769274,  1.82610929],
       [ 0.01349382, -0.23705163, -1.2638375 ]])

In [522]:
# Test error for the plain linear model (parameters from fit):
# mse(y_test, predict(beta, bias, x_test))
# Test error for the quadratic model. The squared columns are appended here
# because predict does not add them itself.
mse(y_test, predict(beta, bias, np.hstack((x_test, x_test**2))))

3.06820517751827

In [523]:
# Per-sample residuals (prediction minus target) for the plain linear model:
# predict(beta, bias, x_test) - y_test
# Per-sample residuals (prediction minus target) for the quadratic model.
predict(beta, bias, np.hstack((x_test, x_test**2))) - y_test

array([[ 1.58122464],
       [ 1.3545949 ],
       [ 1.23083532],
       [-1.79432573],
       [ 0.56816728],
       [-1.77588486],
       [ 0.02964371],
       [ 0.33586805],
       [ 2.21659567],
       [-3.62059789]])

In [185]:
# NOTE(review): the saved output of this cell shows unscaled feature values and
# its execution count is lower than the cells above, so it looks like it was run
# before x_test was standardised - re-run the notebook top to bottom to confirm.
x_test

array([[165.2,  60.3,  37. ],
       [161.7,  52.7,  46. ],
       [152.8,  42.6,  27. ],
       [164.1,  54.9,  34. ],
       [155. ,  52.5,  27. ],
       [155. ,  51.7,  22. ],
       [158.5,  50. ,  32. ],
       [157.4,  46.1,  27. ],
       [152.5,  67.9,  55. ],
       [158. ,  51.4,  21. ]])