In [55]:
import numpy as np
import pandas as pd
from numpy.linalg import inv

In [56]:
def get_data(column_names,
             train_path='./data/housing_train.txt',
             test_path='./data/housing_test.txt'):
    """Load the train and test tables from whitespace-delimited text files.

    Parameters
    ----------
    column_names : list of str
        Names assigned to the columns of both tables. The files are read
        without a header row, so the length must match the number of columns
        in each file (pandas raises ValueError otherwise).
    train_path, test_path : str
        Locations of the two files. The defaults are the paths this notebook
        has always used, so existing ``get_data(column_names)`` calls behave
        as before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The training frame followed by the test frame.
    """
    def _read(path):
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True: any run of whitespace separates fields.
        frame = pd.read_csv(path, sep=r'\s+', header=None)
        frame.columns = column_names
        return frame

    return _read(train_path), _read(test_path)

In [57]:
# Column labels for the housing tables; the files themselves carry no header.
# The last entry, 'MEDV', is the column the model is fitted to predict.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    'MEDV',
]
train_data, test_data = get_data(column_names)

In [58]:
# Summary statistics (count, mean, std, quartiles, min/max) for every training column.
train_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0,433.0
mean,3.473223,11.553118,10.762125,0.066975,0.552842,6.301021,67.842032,3.800756,9.422633,398.554273,18.384758,362.30545,12.399607,22.867206
std,8.537611,23.730903,6.658463,0.250267,0.115408,0.71228,27.796861,2.055903,8.686897,167.475074,2.128271,84.56723,7.089464,9.497501
min,0.00632,0.0,0.46,0.0,0.392,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.07151,0.0,5.13,0.0,0.453,5.888,44.4,2.1069,4.0,277.0,17.0,376.75,6.75,17.2
50%,0.22969,0.0,8.56,0.0,0.532,6.209,74.8,3.3317,5.0,311.0,18.7,392.33,10.58,21.5
75%,3.56868,12.5,18.1,0.0,0.624,6.629,93.3,5.2146,24.0,666.0,20.2,396.9,16.42,25.1
max,88.9762,100.0,25.65,1.0,0.871,8.78,100.0,12.1265,24.0,666.0,21.2,396.9,36.98,50.0


In [59]:
# Same summary for the test split, to compare its ranges against the training split.
test_data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0,74.0
mean,4.388865,10.101351,12.562432,0.081081,0.564095,6.212865,70.805405,3.777096,10.310811,451.594595,18.866216,325.987568,13.707162,21.171622
std,8.93874,20.739906,7.012223,0.274823,0.117873,0.617129,30.071777,2.36198,8.761569,155.910311,2.326281,119.305464,6.942792,6.505351
min,0.01096,0.0,1.76,0.0,0.385,4.138,6.8,1.137,1.0,233.0,13.0,2.6,2.87,10.2
25%,0.129418,0.0,6.2,0.0,0.437,5.8755,43.4,2.1101,4.0,330.0,17.9,321.0175,9.4625,16.75
50%,0.548945,0.0,12.83,0.0,0.584,6.2055,84.1,2.75025,5.0,398.0,20.15,386.68,13.1,20.1
75%,3.83102,0.0,18.1,0.0,0.624,6.54075,95.375,4.653175,24.0,666.0,20.2,394.62,17.2475,24.25
max,51.1358,80.0,27.74,1.0,0.871,8.259,100.0,10.5857,24.0,711.0,22.0,396.9,37.97,42.8


In [60]:
def normalize(dataset):
    """Min-max scale every column except the last one into [0, 1].

    The last column is passed through unchanged. Scaling uses the minimum and
    maximum of ``dataset`` itself, column by column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table whose last column is left untouched and whose other columns are
        numeric.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and column order. The input frame is
        not modified, so re-running the calling cell is idempotent.

    Notes
    -----
    A constant column (max == min) is mapped to 0.0 rather than producing
    NaN from a 0/0 division.
    """
    # Work in float so integer-typed columns do not truncate the scaled values.
    features = dataset.iloc[:, :-1].astype(float)
    untouched = dataset.iloc[:, -1:]

    mins = features.min()
    spans = features.max() - mins
    # Avoid 0/0 on constant columns: (x - min) is already 0 there, so dividing
    # by 1 yields a clean 0.0.
    spans = spans.replace(0, 1.0)

    scaled = (features - mins) / spans
    return pd.concat([scaled, untouched], axis=1)

In [61]:
# train_data = normalize(train_data)
# test_data = normalize(test_data)

In [62]:
def get_weights(train_data, target='MEDV'):
    """Fit ordinary least-squares weights, including an intercept term.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training table containing the feature columns plus the ``target``
        column.
    target : str, optional
        Name of the label column. Defaults to ``'MEDV'``.

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``n_features + 1``. Element 0 is the
        intercept; the rest follow the order of the feature columns.

    Notes
    -----
    The weights are obtained with ``np.linalg.lstsq`` rather than by forming
    and inverting ``X^T X``. For a full-rank design matrix both give the same
    solution, but the least-squares solver also copes with collinear features
    (returning the minimum-norm solution) where an explicit inverse would
    raise or be numerically unreliable.
    """
    features = train_data.drop([target], axis=1).values
    labels = train_data[target].values

    # Prepend a column of ones so the first weight acts as the intercept.
    design = np.column_stack([np.ones(len(features)), features])

    w, _residuals, _rank, _singular_values = np.linalg.lstsq(design, labels, rcond=None)
    return w

In [63]:
# Fit the linear-regression weights (intercept first) on the training split.
weights = get_weights(train_data)

In [64]:
def predict(test_data, weights, target='MEDV'):
    """Apply fitted linear-regression weights to every row of a table.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Table of feature columns. If it also contains the ``target`` column,
        that column is dropped before predicting; a table without it is
        accepted as well.
    weights : array-like
        Weight vector with the intercept at position 0, followed by one
        weight per feature column in column order.
    target : str, optional
        Name of the label column to exclude. Defaults to ``'MEDV'``.

    Returns
    -------
    dict
        Maps the positional row number (0 .. n_rows - 1) to the predicted
        value for that row.
    """
    # errors='ignore' lets unlabeled data through instead of raising KeyError.
    features = test_data.drop([target], axis=1, errors='ignore').values

    # Leading column of ones pairs with the intercept weight.
    design = np.column_stack([np.ones(len(features)), features])

    # One matrix-vector product replaces the per-row Python loop.
    scores = np.dot(design, weights)
    return dict(enumerate(scores))

In [65]:
def get_mse(test_data, preds):
    """Return the mean squared error between the 'MEDV' column and ``preds``.

    ``preds`` is looked up by positional row number, so it may be any
    container supporting ``preds[i]`` for ``i`` in ``0 .. len(test_data) - 1``.
    """
    actual = test_data['MEDV'].values

    # Squared residual per row, paired with predictions by position.
    squared_residuals = [
        np.square(actual[row] - preds[row])
        for row in range(len(actual))
    ]

    return pd.Series(squared_residuals).mean()

In [66]:
# Score the fitted model on the held-out test split.
preds = predict(test_data, weights)
# The data evaluated here is the housing set, so label the metric accordingly.
print('MSE for Housing: {}'.format(get_mse(test_data, preds)))

MSE for SpamBase: 22.638256296598747
