In [1]:
import numpy as np
import pandas as pd
from numpy.linalg import pinv
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

In [2]:
word_labels = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "label"]
train_df_pre = pd.read_csv("HousingData/housing_train.txt", delim_whitespace=True, names = word_labels, header=None) 
test_df_pre = pd.read_csv("HousingData/housing_test.txt", delim_whitespace=True, names = word_labels, header=None) 

In [3]:
df = train_df_pre.append(test_df_pre)
df.shape

(507, 14)

In [4]:
df_norm = df.iloc[:, :-1]
df_norm = (df_norm - df_norm.mean()) / df_norm.std()
df = df_norm.join(df.iloc[:, -1])
df.iloc[432]

CRIM       -0.397417
ZN          0.457363
INDUS      -0.766959
CHAS       -0.272041
NOX        -1.067087
RM         -0.976870
AGE         0.068451
DIS         1.979015
RAD        -0.293540
TAX        -0.457557
PTRATIO     0.298240
B           0.352114
LSTAT       0.829434
label      18.500000
Name: 210, dtype: float64

In [5]:
train_df = df.iloc[0:433]
test_df = df.iloc[434:507]

In [6]:
train_df.shape
test_df_pre.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,label
0,0.84054,0.0,8.14,0,0.538,5.599,85.7,4.4546,4,307.0,21.0,303.42,16.51,13.9
1,0.67191,0.0,8.14,0,0.538,5.813,90.3,4.682,4,307.0,21.0,376.88,14.81,16.6
2,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,4,307.0,21.0,306.38,17.28,14.8
3,0.77299,0.0,8.14,0,0.538,6.495,94.4,4.4547,4,307.0,21.0,387.94,12.8,18.4
4,1.00245,0.0,8.14,0,0.538,6.674,87.3,4.239,4,307.0,21.0,380.23,11.98,21.0


In [7]:
test_df.shape

(73, 14)

In [8]:
train_df.shape

(433, 14)

In [9]:
theta_len = len(train_df.columns)
theta_len

14

In [10]:
#Grab the relevant data, scale the predictor variable, and add a column of 1s for the gradient descent
x = train_df.iloc[:, :-1]
y = train_df.iloc[:, -1]

# x = (x - x.mean()) / x.std()
x = np.c_[np.ones(x.shape[0]), x] 

In [11]:
#GRADIENT DESCENT

alpha = 0.001 #Step size
iterations = 5000 #No. of iterations
m = y.size #No. of data points
np.random.seed(123) #Set the seed
theta = np.random.rand(theta_len) #Pick some random values to start with


#GRADIENT DESCENT
def gradient_descent(x, y, theta, iterations, alpha):
    past_costs = []
    past_thetas = [theta]
    for i in range(iterations):
        hypothesis = np.dot(x, theta)
        error = hypothesis - y
        cost = 1/(2*m) * np.dot(error.T, error)
        past_costs.append(cost)
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        past_thetas.append(theta)
        
    return past_thetas, past_costs

#Pass the relevant variables to the function and get the new values back...
past_thetas, past_costs = gradient_descent(x, y, theta, iterations, alpha)
theta = past_thetas[-1]



In [12]:
theta

array([ 2.21020195e+01, -2.65438189e-03, -1.67378961e-02, -3.79560114e-01,
        7.73885231e-01, -7.70704088e-01,  4.18737020e+00, -1.29678509e-01,
       -1.02398678e+00, -8.04743448e-01, -8.35156285e-01, -5.71560495e-01,
       -6.87645656e-01, -9.11336809e-01])

In [13]:
y_pred = theta.T.dot(x.T)

In [14]:
y_pred = y_pred.T 
mse = np.mean((y - y_pred)**2)
mse

32.9325226708364