In [24]:
import pandas as pd
import numpy as np
import sys

In [25]:
def get_data(column_names):
    train_dataframe = pd.read_csv('./data/housing_train.txt', delim_whitespace=True, header = None)
    test_dataframe = pd.read_csv('./data/housing_test.txt', delim_whitespace=True, header = None)
    train_dataframe.columns = column_names
    test_dataframe.columns = column_names
        
    return train_dataframe, test_dataframe 

In [26]:
column_names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
train_data, test_data = get_data(column_names)

In [27]:
def normalize(dataset, train_len):
    # normalize data using shift/scale
    maxs = dataset.max()
    mins = dataset.min()
    
    for feature in dataset.columns[:-1]:        
        for i, entry in dataset.iterrows():
            dataset.at[i, feature] = (entry[feature] - mins[feature]) / (maxs[feature] - mins[feature])
            
    return dataset.iloc[:train_len], dataset.iloc[train_len:]

In [28]:
full_data = pd.concat([train_data, test_data])
train, test = normalize(full_data, len(train_data))

In [117]:
def get_thresholds(dataset):

    ts = []
    for feature in dataset.columns[:-1]:
        t = []
        for entry in range(len(dataset) - 1):
            t.append((dataset.iloc[entry, feature] + dataset.iloc[entry+1, feature]) / 2)
        ts.append(t)

    return ts

In [118]:
def get_best_split(dataset, thresholds):
    
    best_feature = 'CRIM'
    best_threshold = 0
    max_info_gain = 0
    
    mse_before = get_mse(dataset)
    
    for feature in dataset.columns[:-1]:
        for threshold in thresholds[feature]:
            left, right = split_data(dataset, feature, threshold)
            
            if len(left) == 0 or len(right) == 0:
                continue
                
            left_mse = get_mse(left)
            right_mse = get_mse(right)
            
            w = len(left) / len(dataset)
            mse_after = (w * left_mse) + ((1-w) * right_mse)
            
            info_gain = mse_before - (left_mse + right_mse)

            if max_info_gain <= info_gain:
                max_info_gain = mse_after
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, max_info_gain

In [119]:
class Terminal:

    def __init__(self, dataset):
        self.prediction = dataset['MEDV'].mean()

    def predict(self):
        return self.prediction

In [120]:
class Node:

    def __init__(self, feature, threshold, left_node, right_node):
        self.feature = feature
        self.threshold = threshold
        self.left_node = left_node
        self.right_node = right_node

In [121]:
def get_mse(dataset):
    
    prediction = dataset['MEDV'].mean()
    mse = 0

    for i,entry in dataset.iterrows():
        mse += np.square(entry['MEDV'] - prediction)
        
    return mse

In [122]:
def split_data(dataset, feature, threshold):    
    print(feature)
    left = dataset[dataset[feature] < threshold]
    right = dataset[dataset[feature] >= threshold]
    
    return left, right

In [123]:
def build_tree(dataset, depth):

    thresholds = get_thresholds(dataset)    
    best_feature, best_threshold, info_gain = get_best_split(dataset, thresholds)
    
    if info_gain == 0 or depth > 20:
        return Terminal(dataset)
    
    left_data, right_data = split_data(dataset, best_feature, best_threshold)

    left_node = build_tree(left_data, depth+1)
    right_node = build_tree(right_data, depth+1)

    return Node(best_feature, best_threshold, left_node, right_node)

In [124]:
## CREATE REGRESSION TREE

model = build_tree(train, 0)

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

In [None]:
def regress(root, entry):
    
    if isinstance(root, Terminal):
        return root.predict()
    
    if entry[root.feature] < root.threshold:
        result = regress(root.left_node, entry)
    else:
        result = regress(root.right_node, entry)
        
    return result

In [None]:
def test_model(model, test_data):
    
    predictions = []
    
    for entry in test_data:
        predictions.append(regress(model, entry))

    mse = 0
    
    for i,p in enumerate(predictions):
        mse += np.square(test_data[i][-1] - p)
        
    return mse

In [None]:
final_mse = test_model(model, test)
print(final_mse)

In [91]:
test[test.MEDV == 10]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV


In [125]:
t = get_thresholds(test)
t

ValueError: Location based indexing can only have [integer, integer slice (START point is INCLUDED, END point is EXCLUDED), listlike of integers, boolean array] types

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.009376,0.0,0.281525,0,0.314815,0.390496,0.852729,0.302358,0,0.229008,0.893617,0.764285,0.407837,13.9
1,0.007481,0.0,0.281525,0,0.314815,0.4315,0.900103,0.323036,0,0.229008,0.893617,0.949518,0.360927,16.6
2,0.010672,0.0,0.281525,0,0.314815,0.476336,0.884655,0.302249,0,0.229008,0.893617,0.771748,0.429084,14.8
3,0.008617,0.0,0.281525,0,0.314815,0.562177,0.942327,0.302367,0,0.229008,0.893617,0.977407,0.305464,18.4
5,0.002506,0.0,0.236437,0,0.12963,0.473079,0.850669,0.414644,0,0.087786,0.56383,0.98951,0.471026,16.6
6,0.002782,0.0,0.236437,0,0.12963,0.352175,0.951596,0.431067,0,0.087786,0.56383,1.0,0.802428,14.4
7,0.002399,0.0,0.236437,0,0.12963,0.391071,0.608651,0.450863,0,0.087786,0.56383,1.0,0.399283,19.4
15,0.0025,0.0,0.296921,0,0.277778,0.544932,0.84964,0.144141,0,0.375954,0.882979,0.17772,0.245585,18.6
16,0.002307,0.0,0.296921,0,0.277778,0.493581,0.870237,0.144141,0,0.375954,0.882979,0.993873,0.323124,19.3
18,0.003636,0.0,0.785557,0,0.49177,0.433225,0.952626,0.12188,0,0.477099,0.914894,0.979298,0.366998,18.4
