## Boston House Predictor

In this notebook we will use gradient descent to predict house prices. We will train our model on scikit-learn's Boston housing data set.

We will also try feature scaling to see whether it improves our results.

## Accuracy of Model - 0.87817

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

In [45]:
def add_degree2_features(X):
    """Expand a feature matrix with all degree-2 terms plus a bias column.

    The result keeps the original columns first, then appends the pairwise
    products ``x_j * x_k`` for every ``j < k`` (ordered by j, then k), then
    the squares ``x_i * x_i``, and finally a column of ones that serves as
    the intercept term for the gradient descent implementation.

    Parameters
    ----------
    X : np.ndarray of shape (n_samples, n_features)

    Returns
    -------
    np.ndarray
        Shape (n_samples, 2 * n_features + n_features * (n_features - 1) / 2 + 1).
    """
    n_features = X.shape[1]
    cross_terms = [
        X[:, j] * X[:, k]
        for j in range(n_features)
        for k in range(j + 1, n_features)
    ]
    # Stack everything once; calling np.append per new column would copy the
    # whole matrix every time a column is added.
    return np.column_stack([X, *cross_terms, X * X, np.ones(X.shape[0])])


# Loading data: columns 0-12 are the features, column 13 is the target
data = np.genfromtxt('boston_x_y_train.csv', delimiter=',')

# Splitting into X_train and Y_train; the training features are expanded to
# degree 2 so the model can fit a more complex relationship
X_train = add_degree2_features(data[:, 0:13])
Y_train = data[:, 13]

# The test features get exactly the same expansion so the columns line up
X_test = add_degree2_features(np.genfromtxt('boston_x_test.csv', delimiter=','))

data.shape, X_train.shape, Y_train.shape, X_test.shape

((379, 14), (379, 105), (379,), (127, 105))

In [46]:
# Tabular view of the expanded training matrix; preview the first five rows
df = pd.DataFrame(data=X_train)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
0,-0.40785,-0.487722,-1.266023,-0.272599,-0.576134,1.239974,0.840122,-0.520264,-0.752922,-1.278354,...,0.33193,1.537535,0.705805,0.270675,0.566892,1.63419,0.091866,0.168569,1.205582,1.0
1,-0.407374,-0.487722,0.247057,-0.272599,-1.016689,0.001946,-0.838337,0.336351,-0.523001,-0.060801,...,1.033656,4e-06,0.702809,0.113132,0.273531,0.003697,0.012776,0.084779,0.270893,1.0
2,0.125179,-0.487722,1.015999,-0.272599,1.36749,-0.439699,0.687212,-0.577309,1.661245,1.530926,...,1.87003,0.193335,0.47226,0.333285,2.759736,2.343736,0.650565,14.408063,0.794016,1.0
3,0.028304,-0.487722,1.015999,-0.272599,1.859875,-0.047918,0.801005,-0.712836,1.661245,1.530926,...,3.459136,0.002296,0.64161,0.508136,2.759736,2.343736,0.650565,0.004363,0.046414,1.0
4,-0.412408,-0.487722,-0.969827,-0.272599,-0.913029,-0.384137,-0.834781,0.300508,-0.752922,-0.957633,...,0.833622,0.147561,0.696859,0.090305,0.566892,0.917061,0.000423,0.185825,0.000841,1.0


In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the training DataFrame built from X_train
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
count,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,...,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0
mean,0.019628,0.002455,0.03617,0.028955,0.028775,0.032202,0.038395,-0.001288,0.043307,0.043786,...,0.997504,1.000742,0.969551,1.053594,1.031945,1.039519,0.998321,1.02937,1.02861,1.0
std,1.06749,1.000813,1.017497,1.048995,0.999656,1.001174,0.985209,1.027803,1.016265,1.019974,...,1.396079,1.93539,1.008492,1.658692,1.08795,0.946287,1.318942,3.07705,1.508958,0.0
min,-0.417713,-0.487722,-1.516987,-0.272599,-1.465882,-3.880249,-2.335437,-1.267069,-0.982843,-1.31399,...,0.001645,4e-06,2e-05,4.799164e-07,0.031727,0.000269,0.000423,1.2e-05,3e-06,1.0
25%,-0.408171,-0.487722,-0.867691,-0.272599,-0.878475,-0.57148,-0.768994,-0.829872,-0.637962,-0.755697,...,0.159407,0.069469,0.279809,0.1932673,0.273531,0.264064,0.118483,0.100486,0.134259,1.0
50%,-0.383729,-0.487722,-0.180458,-0.272599,-0.144217,-0.103479,0.338718,-0.329213,-0.523001,-0.440915,...,0.635784,0.319303,0.779327,0.6304826,0.406995,0.616844,0.650565,0.164186,0.572306,1.0
75%,0.055208,0.156071,1.015999,-0.272599,0.628913,0.529069,0.911243,0.674172,1.661245,1.530926,...,1.427365,0.947824,1.246801,1.161516,2.759736,2.343736,1.283215,0.194527,1.246537,1.0
max,9.941735,3.804234,2.422565,3.668398,2.732346,3.555044,1.117494,3.960518,1.661245,1.798194,...,7.465717,15.056335,5.454266,15.6857,2.759736,3.233502,7.329902,15.078246,11.628092,1.0


### Gradient Descent For N Features

In [48]:
def step_gradient(X_train, Y_train, learning_rate, m,j):
    """Take one gradient descent step on the j-th weight, updating ``m`` in place.

    The model is ``y_hat_i = sum_k m[k] * x_i[k]`` and the cost is the mean
    squared error, so the partial derivative with respect to ``m[j]`` is
    ``(-2 / n) * sum_i (y_i - y_hat_i) * x_i[j]``.

    Parameters
    ----------
    X_train : np.ndarray of shape (n_samples, n_columns)
        Feature matrix; only the first ``len(m)`` columns are used.
    Y_train : array-like of shape (n_samples,)
        Target values.
    learning_rate : float
        Step size applied to the partial derivative.
    m : list
        Current weights. ``m[j]`` is overwritten with the new value.
    j : int
        Index of the weight to update.

    Returns
    -------
    The updated value of ``m[j]`` (the same value that is stored into ``m``).
    """
    n_data_pts = X_train.shape[0]
    if n_data_pts == 0:
        # No samples contribute to the gradient, so the weight stays as it is
        return m[j]
    N = len(m)
    # One weight per column: restrict the matrix to the columns that have a weight
    features = X_train[:, :N]
    # y_i - (m1*x_i(1) + m2*x_i(2) + ...) for every sample at once
    residuals = np.asarray(Y_train) - np.dot(features, m)
    # Partial derivative of the mean squared error with respect to m[j]
    m_j = (-2 / n_data_pts) * np.dot(residuals, features[:, j])
    # update m[j] and return
    m[j] = m[j] - (learning_rate * m_j)
    return m[j]

In [49]:
def gradient_descent(X_train, Y_train, learning_rate, num_iterations):
    """Fit the linear-model weights by repeated per-weight gradient steps.

    Every iteration updates each weight in turn with ``step_gradient`` and
    then records and prints the cost. After the last iteration the recorded
    costs are plotted against the iteration number.

    Parameters
    ----------
    X_train : np.ndarray of shape (n_samples, n_columns)
        Feature matrix; one weight is learned per column.
    Y_train : array-like of shape (n_samples,)
        Target values.
    learning_rate : float
        Step size passed to ``step_gradient``.
    num_iterations : int
        Number of full passes over all weights.

    Returns
    -------
    list
        The learned weights, one per column of ``X_train``.
    """
    # Start with all weights at 0, except the last one (c), which starts at 1
    m = [0] * X_train.shape[1]
    m[-1] = 1  # c
    n_weights = len(m)
    cost_history = []
    for i in range(num_iterations):
        # step_gradient stores each new weight into m straight away, so the
        # weights updated later in the same pass already see the new values
        for j in range(n_weights):
            m[j] = step_gradient(X_train, Y_train, learning_rate, m, j)
        a = cost(X_train, Y_train, m)
        cost_history.append(a)
        print("Cost - : ", i, a)

    # Draw the learning curve once at the end. Re-plotting the whole history
    # and sleeping on every iteration adds a new artist per step and minutes
    # of idle time without changing the final picture.
    fig, ax = plt.subplots()
    ax.plot(range(len(cost_history)), cost_history, '*')
    ax.set_xlabel("Iteration")
    ax.set_ylabel("Cost (mean squared error)")
    ax.set_title("Gradient descent: cost per iteration")
    plt.show()
    return m

In [50]:
def cost(X_train, Y_train, m):
    """Return the mean squared error of the linear model with weights ``m``.

    The prediction for sample i is ``sum_k m[k] * x_i[k]``; the cost is the
    average of ``(y_i - prediction_i) ** 2`` over all samples.

    Parameters
    ----------
    X_train : np.ndarray of shape (n_samples, n_columns)
        Feature matrix; only the first ``len(m)`` columns are used.
    Y_train : array-like of shape (n_samples,)
        Target values.
    m : sequence of numbers
        Model weights.

    Returns
    -------
    The mean squared error, or 0 when there are no samples.
    """
    n_data_pts = len(X_train)
    if n_data_pts == 0:
        # Nothing to average over; an empty data set has zero accumulated error
        return 0
    N = len(m)
    # Residual y_i - prediction_i for every sample at once
    residuals = np.asarray(Y_train) - np.dot(X_train[:, :N], m)
    return np.mean(residuals ** 2)

In [51]:
def predict(X_test, m):
    """Return the model's predictions for ``X_test`` as a column vector.

    Parameters
    ----------
    X_test : np.ndarray of shape (n_samples, n_weights)
        Feature matrix with one column per weight.
    m : sequence of numbers
        Model weights.

    Returns
    -------
    np.ndarray of shape (n_samples, 1)
        ``X_test`` multiplied by the weights arranged as a column.
    """
    # Size the weight column from m itself rather than from the global
    # X_train, so the function depends only on its own arguments
    n_m = np.array(m).reshape(-1, 1)
    return np.dot(X_test, n_m)

In [65]:
# Summary statistics for the expanded test matrix, to compare against the
# training summary shown earlier
df_test = pd.DataFrame(data=X_test)
df_test.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
count,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,...,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0,127.0
mean,-0.058575,-0.007327,-0.107939,-0.08641,-0.085871,-0.096098,-0.114581,0.003845,-0.12924,-0.13067,...,1.007449,0.997785,1.090868,0.8400622,0.904669,0.882064,1.005011,0.912351,0.914621,1.0
std,0.769837,1.005445,0.945672,0.839435,1.003998,0.998196,1.042254,0.920171,0.946051,0.933732,...,1.371094,2.065959,1.036346,1.279148,0.986262,0.858238,1.277402,2.88187,1.762049,0.0
min,-0.417173,-0.487722,-1.557842,-0.272599,-1.431329,-3.058221,-2.225199,-1.263551,-0.982843,-1.308051,...,0.001645,0.000275,6.4e-05,4.799164e-07,0.031727,0.000269,0.000423,0.000205,6e-06,1.0
25%,-0.410832,-0.487722,-0.891036,-0.272599,-0.947582,-0.567918,-1.240171,-0.762417,-0.637962,-0.785394,...,0.125565,0.041906,0.281686,0.1238452,0.273531,0.181269,0.105174,0.130853,0.091965,1.0
50%,-0.398269,-0.487722,-0.375976,-0.272599,-0.299707,-0.127698,0.11113,-0.202052,-0.523001,-0.601276,...,0.811838,0.212584,0.88302,0.5135718,0.406995,0.571079,0.650565,0.175848,0.311722,1.0
75%,-0.2429,-0.219475,1.015999,-0.272599,0.434551,0.283316,0.898797,0.604198,-0.350561,0.072833,...,1.430362,0.820423,1.538341,1.016238,0.8596,1.263543,1.384072,0.194527,1.015787,1.0
max,3.966816,3.589637,2.117615,3.668398,2.732346,3.476688,1.117494,3.2873,1.661245,1.530926,...,7.465717,12.08736,4.951512,10.80634,2.759736,2.343736,7.329902,15.266159,12.593774,1.0


In [None]:
def run(num_iterations=1200, learning_rate=0.01, output_path='pred.csv'):
    """Train on the notebook's training data and save the test predictions.

    Runs ``gradient_descent`` on the global ``X_train`` / ``Y_train``, prints
    the learned weights, predicts for the global ``X_test`` and writes the
    predictions to ``output_path`` with five decimal places.

    Parameters
    ----------
    num_iterations : int, default 1200
        Number of gradient descent iterations.
    learning_rate : float, default 0.01
        Step size for gradient descent.
    output_path : str, default 'pred.csv'
        File the predictions are written to.

    Returns
    -------
    list
        The learned weights, so they can be reused after training
        (for example ``m = run()``) instead of being lost with the local scope.
    """
    m = gradient_descent(X_train, Y_train, learning_rate, num_iterations)
    print(m)
    Y_pred = predict(X_test, m)
    np.savetxt(output_path, Y_pred, fmt='%.5f', delimiter=',')
    return m

In [None]:
# Train the model on X_train / Y_train and write the test-set predictions to pred.csv
run()

In [None]:
# NOTE(review): in the cells shown here `m` only ever exists as a local variable
# inside run() / gradient_descent(), so this call raises NameError after
# Restart & Run All unless `m` is defined somewhere else. Also, 25 is a scalar
# rather than a feature matrix shaped like X_test, so the result would not be a
# house-price prediction. Confirm the intended input.
predict(25, m)