### Boston House Predictor

In this notebook we use gradient descent to predict house prices. We train our model on scikit-learn's Boston housing data set.

We will also try feature scaling to see whether it improves our results.

## Accuracy of Model - 0.87817

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [35]:
# Number of raw feature columns in the training CSV; the column right after
# them holds the target value.
N_RAW_FEATURES = 13


def add_degree2_features(X):
    """Expand a 2-D feature matrix with all degree-2 terms and a ones column.

    The returned matrix contains, in this order:
      1. the original columns of ``X``,
      2. every cross product ``X[:, j] * X[:, k]`` for ``j < k``
         (ordered by ``j``, then ``k``),
      3. every square ``X[:, i] * X[:, i]``,
      4. a final column of ones, used as the constant term by gradient descent.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)

    Returns
    -------
    numpy.ndarray of shape
        (n_samples, n_features + n_features*(n_features-1)/2 + n_features + 1)
    """
    n_cols = X.shape[1]
    cross_terms = [
        X[:, j] * X[:, k]
        for j in range(n_cols)
        for k in range(j + 1, n_cols)
    ]
    squares = [X[:, i] * X[:, i] for i in range(n_cols)]
    ones = np.ones(X.shape[0])
    # Build the result in a single allocation instead of re-allocating the
    # whole matrix once per added column.
    return np.column_stack([X, *cross_terms, *squares, ones])


# Loading data
data = np.genfromtxt('boston_x_y_train.csv', delimiter=',')

# Splitting into X_train and Y_train; the training features are expanded to
# degree 2 so the linear model can fit a more complex relationship.
X_train = add_degree2_features(data[:, 0:N_RAW_FEATURES])
Y_train = data[:, N_RAW_FEATURES]

# Testing data goes through exactly the same expansion so that its columns
# line up with the coefficients learned on the training data.
X_test = add_degree2_features(np.genfromtxt('boston_x_test.csv', delimiter=','))

data.shape, X_train.shape, Y_train.shape, X_test.shape

((379, 14), (379, 105), (379,), (127, 105))

In [22]:
# Wrap the expanded training matrix in a DataFrame so it can be inspected
df = pd.DataFrame(data=X_train)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
0,-0.40785,-0.487722,-1.266023,-0.272599,-0.576134,1.239974,0.840122,-0.520264,-0.752922,-1.278354,...,0.33193,1.537535,0.705805,0.270675,0.566892,1.63419,0.091866,0.168569,1.205582,1.0
1,-0.407374,-0.487722,0.247057,-0.272599,-1.016689,0.001946,-0.838337,0.336351,-0.523001,-0.060801,...,1.033656,4e-06,0.702809,0.113132,0.273531,0.003697,0.012776,0.084779,0.270893,1.0
2,0.125179,-0.487722,1.015999,-0.272599,1.36749,-0.439699,0.687212,-0.577309,1.661245,1.530926,...,1.87003,0.193335,0.47226,0.333285,2.759736,2.343736,0.650565,14.408063,0.794016,1.0
3,0.028304,-0.487722,1.015999,-0.272599,1.859875,-0.047918,0.801005,-0.712836,1.661245,1.530926,...,3.459136,0.002296,0.64161,0.508136,2.759736,2.343736,0.650565,0.004363,0.046414,1.0
4,-0.412408,-0.487722,-0.969827,-0.272599,-0.913029,-0.384137,-0.834781,0.300508,-0.752922,-0.957633,...,0.833622,0.147561,0.696859,0.090305,0.566892,0.917061,0.000423,0.185825,0.000841,1.0


In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of df
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
count,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,...,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0
mean,0.019628,0.002455,0.03617,0.028955,0.028775,0.032202,0.038395,-0.001288,0.043307,0.043786,...,0.997504,1.000742,0.969551,1.053594,1.031945,1.039519,0.998321,1.02937,1.02861,1.0
std,1.06749,1.000813,1.017497,1.048995,0.999656,1.001174,0.985209,1.027803,1.016265,1.019974,...,1.396079,1.93539,1.008492,1.658692,1.08795,0.946287,1.318942,3.07705,1.508958,0.0
min,-0.417713,-0.487722,-1.516987,-0.272599,-1.465882,-3.880249,-2.335437,-1.267069,-0.982843,-1.31399,...,0.001645,4e-06,2e-05,4.799164e-07,0.031727,0.000269,0.000423,1.2e-05,3e-06,1.0
25%,-0.408171,-0.487722,-0.867691,-0.272599,-0.878475,-0.57148,-0.768994,-0.829872,-0.637962,-0.755697,...,0.159407,0.069469,0.279809,0.1932673,0.273531,0.264064,0.118483,0.100486,0.134259,1.0
50%,-0.383729,-0.487722,-0.180458,-0.272599,-0.144217,-0.103479,0.338718,-0.329213,-0.523001,-0.440915,...,0.635784,0.319303,0.779327,0.6304826,0.406995,0.616844,0.650565,0.164186,0.572306,1.0
75%,0.055208,0.156071,1.015999,-0.272599,0.628913,0.529069,0.911243,0.674172,1.661245,1.530926,...,1.427365,0.947824,1.246801,1.161516,2.759736,2.343736,1.283215,0.194527,1.246537,1.0
max,9.941735,3.804234,2.422565,3.668398,2.732346,3.555044,1.117494,3.960518,1.661245,1.798194,...,7.465717,15.056335,5.454266,15.6857,2.759736,3.233502,7.329902,15.078246,11.628092,1.0


### Gradient Descent For N Features

In [24]:
def step_gradient(X_train, Y_train, learning_rate, m, j):
    """Take one gradient-descent step on the j-th coefficient.

    Computes the partial derivative of the mean squared error with respect to
    ``m[j]`` using the current values of all coefficients, then moves ``m[j]``
    against that derivative by ``learning_rate``.

    Parameters
    ----------
    X_train : numpy.ndarray of shape (n_samples, n_columns)
        Feature matrix; only the first ``len(m)`` columns are used.
    Y_train : array-like of shape (n_samples,)
        Target values.
    learning_rate : float
        Step size.
    m : list
        Current coefficients. ``m[j]`` is updated IN PLACE.
    j : int
        Index of the coefficient to update.

    Returns
    -------
    The updated value of ``m[j]``.
    """
    n_data_pts = X_train.shape[0]
    if n_data_pts == 0:
        # No samples means a zero gradient: leave the coefficient as it is
        # (also avoids dividing by zero below).
        return m[j]

    N = len(m)
    coeffs = np.asarray(m, dtype=float)
    # Residual y_i - (m1*x_i1 + m2*x_i2 + ...) for every sample at once.
    residuals = np.asarray(Y_train, dtype=float) - X_train[:, :N] @ coeffs
    # d(MSE)/d(m_j) = (-2/n) * sum_i residual_i * x_ij
    grad_j = (-2 / n_data_pts) * np.dot(residuals, X_train[:, j])

    # update m[j] and return
    m[j] = m[j] - (learning_rate * grad_j)
    return m[j]

In [26]:
def gradient_descent(X_train, Y_train, learning_rate, num_iterations):
    """Fit linear-model coefficients by repeated per-coefficient gradient steps.

    There is one coefficient per column of ``X_train``. All of them start at 0
    except the last one (the constant term ``c``), which starts at 1. In every
    iteration each coefficient is updated in turn via ``step_gradient``, and
    the resulting cost is printed.

    Returns the list of fitted coefficients.
    """
    n_coeffs = X_train.shape[1]
    m = [0 for _ in range(n_coeffs)]
    m[-1] = 1  # c
    for iteration in range(num_iterations):
        # Coefficients are updated one after another, so later ones already
        # see the new values of the earlier ones within the same iteration.
        for idx in range(len(m)):
            m[idx] = step_gradient(X_train, Y_train, learning_rate, m, idx)
        print("Cost - : ", iteration, cost(X_train, Y_train, m))
    return m

In [27]:
def cost(X_train, Y_train, m):
    """Return the mean squared error of the linear model ``m`` on the data.

    Parameters
    ----------
    X_train : numpy.ndarray of shape (n_samples, n_columns)
        Feature matrix; only the first ``len(m)`` columns are used.
    Y_train : array-like of shape (n_samples,)
        Target values.
    m : sequence of numbers
        Model coefficients.

    Returns
    -------
    The mean of the squared residuals ``(y_i - x_i . m) ** 2``; 0 when there
    are no samples.
    """
    n_data_pts = len(X_train)
    if n_data_pts == 0:
        # An empty data set contributes no error (and must not divide by zero).
        return 0

    N = len(m)
    coeffs = np.asarray(m, dtype=float)
    residuals = np.asarray(Y_train, dtype=float) - X_train[:, :N] @ coeffs
    return np.dot(residuals, residuals) / n_data_pts

In [28]:
def predict(X_test, m):
    """Return the linear-model predictions for ``X_test``.

    Parameters
    ----------
    X_test : numpy.ndarray of shape (n_samples, n_features)
        Feature matrix, with the same column layout the coefficients were
        fitted on.
    m : sequence of numbers, length n_features
        Model coefficients.

    Returns
    -------
    numpy.ndarray of shape (n_samples, 1) holding ``X_test . m``.
    """
    # Size the column vector from the coefficients themselves rather than from
    # the global X_train, so the function does not rely on notebook state.
    coeffs = np.asarray(m).reshape(-1, 1)
    return np.dot(X_test, coeffs)

In [38]:
def run():
    """Train on the training data, predict for X_test and save the result.

    Prints the fitted coefficients and the predictions, then writes the
    predictions to 'pred.csv' with five decimal places.
    """
    # Hyperparameters for gradient descent
    learning_rate, num_iterations = 0.01, 1200

    weights = gradient_descent(X_train, Y_train, learning_rate, num_iterations)
    print(weights)

    predictions = predict(X_test, weights)
    print(predictions)

    np.savetxt('pred.csv', predictions, fmt='%.5f', delimiter=',')

In [39]:
# Train the model, print the per-iteration cost, and write the predictions to pred.csv
run()

Cost - :  0 291.0192669088228
Cost - :  1 222.42682225085792
Cost - :  2 190.69949804227147
Cost - :  3 169.13343572481884
Cost - :  4 152.15372956578238
Cost - :  5 138.14888146234617
Cost - :  6 126.38062860072276
Cost - :  7 116.38247384336951
Cost - :  8 107.81667182268492
Cost - :  9 100.42531755819817
Cost - :  10 94.00624115664722
Cost - :  11 88.39830830096983
Cost - :  12 83.47152782932774
Cost - :  13 79.12004206279842
Cost - :  14 75.25698885051709
Cost - :  15 71.81063927469063
Cost - :  16 68.72143676113008
Cost - :  17 65.93969190664662
Cost - :  18 63.42376567072958
Cost - :  19 61.13862327248326
Cost - :  20 59.05467382502524
Cost - :  21 57.14683296464836
Cost - :  22 55.39376128513524
Cost - :  23 53.77724254494079
Cost - :  24 52.28167378830379
Cost - :  25 50.89364561507293
Cost - :  26 49.60159544506257
Cost - :  27 48.39552015582985
Cost - :  28 47.266737208905596
Cost - :  29 46.20768551789013
Cost - :  30 45.21175899621133
Cost - :  31 44.273167058145845
Cost - 

Cost - :  255 15.015379761922594
Cost - :  256 14.989647459424502
Cost - :  257 14.964051462100334
Cost - :  258 14.938590565879297
Cost - :  259 14.913263583881847
Cost - :  260 14.888069346065487
Cost - :  261 14.863006698879994
Cost - :  262 14.83807450493112
Cost - :  263 14.813271642653199
Cost - :  264 14.788597005989908
Cost - :  265 14.764049504083381
Cost - :  266 14.739628060970718
Cost - :  267 14.715331615288639
Cost - :  268 14.691159119985286
Cost - :  269 14.667109542039203
Cost - :  270 14.643181862185319
Cost - :  271 14.619375074647948
Cost - :  272 14.595688186879995
Cost - :  273 14.572120219308925
Cost - :  274 14.54867020508871
Cost - :  275 14.525337189857973
Cost - :  276 14.502120231503925
Cost - :  277 14.479018399932073
Cost - :  278 14.45603077684143
Cost - :  279 14.433156455505165
Cost - :  280 14.410394540556446
Cost - :  281 14.3877441477794
Cost - :  282 14.365204403905103
Cost - :  283 14.342774446412276
Cost - :  284 14.320453423332776
Cost - :  285 1

Cost - :  505 11.111373771698595
Cost - :  506 11.101952485121974
Cost - :  507 11.092561446856479
Cost - :  508 11.083200518418602
Cost - :  509 11.07386956215965
Cost - :  510 11.064568441259318
Cost - :  511 11.055297019719351
Cost - :  512 11.046055162357419
Cost - :  513 11.036842734800782
Cost - :  514 11.02765960348033
Cost - :  515 11.018505635624436
Cost - :  516 11.009380699252866
Cost - :  517 11.000284663171009
Cost - :  518 10.99121739696384
Cost - :  519 10.982178770990128
Cost - :  520 10.973168656376616
Cost - :  521 10.96418692501233
Cost - :  522 10.955233449542886
Cost - :  523 10.946308103364862
Cost - :  524 10.937410760620175
Cost - :  525 10.92854129619062
Cost - :  526 10.919699585692364
Cost - :  527 10.910885505470526
Cost - :  528 10.90209893259378
Cost - :  529 10.89333974484907
Cost - :  530 10.884607820736305
Cost - :  531 10.875903039463104
Cost - :  532 10.867225280939655
Cost - :  533 10.85857442577361
Cost - :  534 10.849950355264841
Cost - :  535 10.8

Cost - :  759 9.419053185839825
Cost - :  760 9.414385616513767
Cost - :  761 9.409729212839773
Cost - :  762 9.405083936997737
Cost - :  763 9.400449751332374
Cost - :  764 9.395826618352364
Cost - :  765 9.391214500729397
Cost - :  766 9.38661336129743
Cost - :  767 9.38202316305159
Cost - :  768 9.37744386914756
Cost - :  769 9.37287544290052
Cost - :  770 9.368317847784404
Cost - :  771 9.363771047430983
Cost - :  772 9.359235005629046
Cost - :  773 9.354709686323567
Cost - :  774 9.350195053614865
Cost - :  775 9.345691071757745
Cost - :  776 9.341197705160727
Cost - :  777 9.336714918385168
Cost - :  778 9.332242676144473
Cost - :  779 9.32778094330335
Cost - :  780 9.323329684876866
Cost - :  781 9.318888866029772
Cost - :  782 9.314458452075673
Cost - :  783 9.310038408476213
Cost - :  784 9.305628700840307
Cost - :  785 9.301229294923415
Cost - :  786 9.296840156626661
Cost - :  787 9.292461251996178
Cost - :  788 9.288092547222236
Cost - :  789 9.28373400863858
Cost - :  790 

Cost - :  1016 8.504023483200685
Cost - :  1017 8.50132349744294
Cost - :  1018 8.498628629268126
Cost - :  1019 8.495938864729489
Cost - :  1020 8.493254189928741
Cost - :  1021 8.490574591015847
Cost - :  1022 8.487900054188783
Cost - :  1023 8.485230565693369
Cost - :  1024 8.482566111823086
Cost - :  1025 8.479906678918812
Cost - :  1026 8.477252253368727
Cost - :  1027 8.474602821608016
Cost - :  1028 8.47195837011868
Cost - :  1029 8.469318885429441
Cost - :  1030 8.466684354115424
Cost - :  1031 8.46405476279803
Cost - :  1032 8.461430098144728
Cost - :  1033 8.458810346868873
Cost - :  1034 8.456195495729487
Cost - :  1035 8.453585531531097
Cost - :  1036 8.450980441123534
Cost - :  1037 8.44838021140178
Cost - :  1038 8.445784829305682
Cost - :  1039 8.443194281819903
Cost - :  1040 8.440608555973593
Cost - :  1041 8.438027638840351
Cost - :  1042 8.435451517537926
Cost - :  1043 8.432880179228087
Cost - :  1044 8.430313611116407
Cost - :  1045 8.427751800452137
Cost - :  1046