In [525]:
import pandas as pd
import numpy as np
import sys

In [564]:
def get_data(column_names):
    train_dataframe = pd.read_csv('./data/housing_train.txt', delim_whitespace=True, header = None)
    test_dataframe = pd.read_csv('./data/housing_test.txt', delim_whitespace=True, header = None)
    train_dataframe.columns = column_names
    test_dataframe.columns = column_names
        
    return train_dataframe, test_dataframe 

In [565]:
column_names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
train_data, test_data = get_data(column_names)

In [566]:
def normalize(dataset, train_len):
    
    # normalize data using shift/scale
    maxs = dataset.max()
    mins = dataset.min()
    
    for feature in dataset.columns[:-1]:        
        for i, entry in dataset.iterrows():
            dataset.at[i, feature] = (entry[feature] - mins[feature]) / (maxs[feature] - mins[feature])
            
    return dataset.iloc[:train_len], dataset.iloc[train_len:]

In [567]:
full_data = pd.concat([train_data, test_data])
train, test = normalize(full_data, len(train_data))

In [568]:
def get_thresholds(dataset, feature):

    ts = []

    for entry in range(len(dataset) - 1):
        ts.append((dataset.iloc[entry][feature] + dataset.iloc[entry+1][feature]) / 2)

    return ts

In [569]:
def get_best_split(dataset):
    
    max_info_gain = 0
    best_feature = None
    best_threshold = None
    mse_before = get_mse(dataset)
    
    for feature in dataset.columns[:-1]:
    
        thresholds = get_thresholds(dataset, feature)
    
        for threshold in thresholds:
            left, right = split_data(dataset, feature, threshold)
            
            if len(left) == 0 or len(right) == 0:
                continue
                
            left_mse = get_mse(left)
            right_mse = get_mse(right)
            
            w = len(left) / len(dataset)
            mse_after = (w * left_mse) + ((1-w) * right_mse)
            
            info_gain = mse_before - mse_after

            if max_info_gain <= info_gain:
                max_info_gain = info_gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, max_info_gain

In [570]:
class Terminal:

    def __init__(self, dataset):
        self.prediction = dataset['MEDV'].mean()

    def predict(self):
        return self.prediction

In [571]:
class Node:

    def __init__(self, feature, threshold, left_node, right_node):
        self.feature = feature
        self.threshold = threshold
        self.left_node = left_node
        self.right_node = right_node

In [572]:
def get_mse(dataset):
    
    prediction = dataset['MEDV'].mean()
    errors = []

    for i,entry in dataset.iterrows():
        errors.append(np.square(entry['MEDV'] - prediction))
    
    mse = pd.Series(errors).mean()
    return mse

In [573]:
def split_data(dataset, feature, threshold):    
    
    left = dataset[dataset[feature] < threshold]
    right = dataset[dataset[feature] >= threshold]
    
    return left, right

In [588]:
def build_tree(dataset, depth):

    best_feature, best_threshold, info_gain = get_best_split(dataset)
    
    if info_gain == 0 or depth >=4:
        return Terminal(dataset)
    
    print(best_feature, best_threshold, info_gain)
    
    left_data, right_data = split_data(dataset, best_feature, best_threshold)

    left_node = build_tree(left_data, depth+1)
    right_node = build_tree(right_data, depth+1)

    return Node(best_feature, best_threshold, left_node, right_node)

In [589]:
## CREATE REGRESSION TREE
model = build_tree(train, 0)

RM 0.7344318835025868 34.32118719415788
LSTAT 0.22723509933774835 16.982560209851442
DIS 0.031213341941819975 19.654581386992465
RM 0.6203295650507761 7.705330887868687
NOX 0.5781893004115226 10.210471282494826
RM 0.5615060356390114 3.1983879586604615
LSTAT 0.4732339955849889 8.978634903733223
PTRATIO 0.574468085106383 32.86988647715917
RM 0.8091588426901706 5.514242178133589
RM 0.7419045794213451 15.992654320987654
PTRATIO 0.276595744680851 3.538641082101286
TAX 0.2213740458015267 53.9566530612245
LSTAT 0.0869205298013245 12.848066666666663
LSTAT 0.07408940397350994 1.6899999999999973


In [590]:
def depth(root, num_nodes):
    if isinstance(root, Terminal):
        return 0
    
    num_nodes[0] += 1
    return 1+max(depth(root.left_node, num_nodes), depth(root.right_node, num_nodes))

In [591]:
num = [0] 
x = depth(model, num)
print(num, x)

[14] 4


In [592]:
def regress(root, entry):
    
    if isinstance(root, Terminal):
        return root.predict()
    
    if entry[root.feature] < root.threshold:
        result = regress(root.left_node, entry)
    else:
        result = regress(root.right_node, entry)
        
    return result

In [593]:
def test_model(model, test_data):
    
    predictions = []
    
    for i, entry in test_data.iterrows():
        predictions.append(regress(model, entry))
    
    errors = []
    
    for i,p in enumerate(predictions):
        errors.append(np.square(test_data.iloc[i]['MEDV'] - p))
    
    mse = pd.Series(errors).mean()
    return mse, predictions

In [594]:
final_mse, preds = test_model(model, test)
print('MSE: {}'.format(final_mse))

MSE: 23.243058603348036
