In [92]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
import math

In [93]:
data = np.genfromtxt('boston_train.csv', delimiter=',')
x_test = np.genfromtxt('boston_test.csv', delimiter=',')

In [94]:
x_train = data[:, :-1]
y_train = data[:, -1]
x = x_train
x_train.shape, y_train.shape

((379, 13), (379,))

In [95]:
def addFeatures2Degree(x):  #X1^2 X2^2 ... 2Degree
    df = pd.DataFrame(x)
    i = 0
    l = len(df.columns)

    while i < l:
        j = i
        while j < l:
            string = str(df.columns[i]) + "*" + str(df.columns[j])
            df[string] = df[df.columns[i]] * df[df.columns[j]]
            j += 1
        i += 1
    len(df.columns)

    return np.array(df)

In [99]:
def addFeatures3Degree(x):
    df_train = pd.DataFrame(x)
    i = 0
    n = len(df_train.columns)
    l = df_train.columns
    while i < n:
        j = i
        while j < n:
            df_train[str(l[i]) + '*' + str(l[j])] = df_train[l[i]]*df_train[l[j]]
            k = j
            while k < n:
                df_train[str(l[i]) + '*' + str(l[j]) + '*' + str(l[j])] = df_train[l[i]]*df_train[l[j]]*df_train[l[k]]
                k += 1
            j += 1
        i += 1
    return np.array(df_train)

In [100]:
def addFeaturesPowers(x, k=10):
    df = pd.DataFrame(x)
    
    for i in range(len(df.columns)):
        string = str(df.columns[i])
        x = df[df.columns[i]]
        for j in range(2, k+1):
            string += '*' + str(df.columns[i])
            x = x*df[df.columns[i]]
            df[string] = x
    
    return np.array(df)

In [101]:
# x_train = addFeatures2Degree(x_train)
# x_test = addFeatures2Degree(x_test)
x_train = addFeatures3Degree(x_train)
x_test = addFeatures3Degree(x_test)
x_train.shape, x_test.shape

((379, 195), (127, 195))

In [102]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [103]:
x_train = np.insert(x_train, 0, 1, axis=1)
x_test = np.insert(x_test, 0, 1, axis=1)

In [104]:
def cost(x, y, coef):
    return ((y - x.dot(coef))**2).mean()

In [105]:
def score(x_test, y_test, m):
    y_pred = predict(x_test, m)
    num = np.sum((y_pred - y_test) ** 2, axis=0)
    den = np.sum((y_test - y_test.mean()) ** 2, axis=0)
    return 1 - num/den

In [106]:
def predict(x_test, m):
    return x_test.dot(m)

In [113]:
def step_gradient(x, y, learning_rate, m):
    k = x.shape[0]
    slope_m = np.zeros(x.shape[1])
    for i in range(k):
        slope_m += (2/k)*x[i]*(x[i].dot(m) - y[i])
        if i%10 == 0 or i == k-1:
            m = m - learning_rate*slope_m
            slope_m = np.zeros(x.shape[1])
    return m

In [114]:
def gradient_descent(x, y, learning_rate=0.001, num_iter=1000):
    coef = np.zeros((x.shape[1],))
    
    print("start : ", cost(x, y, coef))
    
    for i in range(num_iter):
        coef = step_gradient(x, y, learning_rate, coef)
        if i % (num_iter//10) == 0:
            print(i, " : ", cost(x, y, coef))
            
    print("end : ", cost(x, y, coef))
    
    return coef

In [115]:
def run(x, y):
    learning_rate = 0.01
    num_iter = 400
    return gradient_descent(x, y, learning_rate, num_iter)

In [116]:
m = run(x_train, y_train)

start :  599.1222691292876
0  :  545.2646864676817
40  :  110.59257582746504
80  :  29.652181639055105
120  :  12.831153539431487
160  :  9.058788506012867
200  :  8.014903609925472
240  :  7.584705879904285
280  :  7.320540636666123
320  :  7.1194205047968975
360  :  6.952657790223098
end :  6.812623545779593


In [117]:
m

array([ 2.25994351e+01, -8.64390635e-02,  2.30965240e-01, -1.63372544e-01,
        4.92888544e-01, -4.84747828e-01,  2.31483780e+00, -2.88042274e-01,
       -9.65065568e-01,  4.77084400e-01, -4.91028567e-01, -6.68288440e-01,
        3.12805012e-01, -1.00756739e+00,  3.39104305e-02,  1.52030811e-01,
       -6.45990745e-02,  6.74368297e-02,  2.21705112e-02, -8.49991139e-02,
        9.97949326e-01, -4.51355903e-01, -7.42802883e-02, -3.24124421e-01,
       -7.57159224e-02,  2.90588461e-01,  1.19738009e-01, -1.99575477e-02,
        9.37278175e-02, -3.10037699e-02, -2.32375531e-01, -6.57642333e-02,
        4.50897998e-02, -9.84756980e-02,  7.28056644e-02, -1.16917406e-01,
        2.07705974e-01, -3.81429221e-01,  5.09623486e-02,  9.10439920e-02,
       -3.14648378e-02, -1.35834717e-01, -1.53699537e-02,  1.58718824e-01,
       -9.51480429e-02, -1.41128416e-02,  1.61940317e-01,  9.50713076e-02,
       -1.83744681e-01,  6.01940222e-02, -4.31812826e-02,  4.35775293e-01,
       -2.59124008e-02, -

In [118]:
y_pred = predict(x_test, m)
y_pred

array([15.82662544, 28.18652236, 19.44646796, 22.06767934, 21.08959975,
       14.54236929, 27.78229772, 23.79240438, 17.7445855 , 21.07021807,
       23.17875008, 17.1813805 , 19.21825674, 20.76525163, 44.74860501,
       21.33264666, 22.84855877, 25.80322458, 18.98130237, 30.87391322,
       21.75173557, 21.82412697, 38.41244628, 35.25941821, 33.80569295,
       19.14238361, 20.55421556, 27.40572523, 20.09297177, 28.69504601,
       17.35770818, 23.05834424, 21.83734162, 23.85567948, 10.28581298,
       26.16199476, 23.97682951, 19.28926464, 21.52193938, 11.96395324,
       11.41841371, 25.68341484, 23.92067015, 19.82440083, 20.04172477,
        7.66880335, 44.9119614 , 23.93165087, 35.48910884, 12.15408956,
       16.69453183, 46.4230918 , 12.66872034, 20.11239203, 15.86170666,
       21.29817315, 19.49314894, 22.45784938, 15.39632344, 14.35364401,
       11.05132063, 26.80621948, 22.77807195, 24.02787866, 16.3463275 ,
       13.1087922 , 34.06929757, 14.01627538, 21.90396895, 22.45

In [119]:
score(x_train, y_train, m)

0.922524694308928

In [120]:
np.savetxt('test_boston_y_test.csv', y_pred, delimiter=',')