In [16]:
import pandas as pd
import numpy as np

In [17]:
def get_data(column_names):
    train_dataframe = pd.read_csv('./data/housing_train.txt', delim_whitespace=True, header = None)
    test_dataframe = pd.read_csv('./data/housing_test.txt', delim_whitespace=True, header = None)
    train_dataframe.columns = column_names
    test_dataframe.columns = column_names
        
    return train_dataframe, test_dataframe 

In [18]:
column_names = ['CRIM','ZN','INDUS','CHAS','NOX','RM','AGE','DIS','RAD','TAX','PTRATIO','B','LSTAT','MEDV']
train_data, test_data = get_data(column_names)

In [19]:
def normalize(dataset, train_len):
    
    # normalize data using shift/scale
    maxs = dataset.max()
    mins = dataset.min()
    
    for feature in dataset.columns[:-1]:        
        for i, entry in dataset.iterrows():
            dataset.at[i, feature] = (entry[feature] - mins[feature]) / (maxs[feature] - mins[feature])
            
    return dataset.iloc[:train_len], dataset.iloc[train_len:]

In [20]:
full_data = pd.concat([train_data, test_data])
train, test = normalize(full_data, len(train_data))

In [21]:
def get_feature_thresholds(dataset, feature):
    dataset.sort_values(by=[feature])
    ts = []

    for entry in range(len(dataset) - 1):
        ts.append((dataset.iloc[entry][feature] + dataset.iloc[entry+1][feature]) / 2)
        
    return ts

In [22]:
def get_thresholds(dataset):
    
    thresholds = {}
    
    for feature in dataset.columns[:-1]:
        thres = get_feature_thresholds(dataset, feature)
        thresholds[feature] = thres
        
    return thresholds

In [23]:
def get_best_split(dataset, all_thresholds):
    
    max_info_gain = 0
    best_feature = None
    best_threshold = None
    mse_before = get_mse(dataset)
    
    for feature in dataset.columns[:-1]:
    
        thresholds = all_thresholds[feature]
    
        for threshold in thresholds:
            left, right = split_data(dataset, feature, threshold)
            
            if len(left) == 0 or len(right) == 0:
                continue
                
            left_mse = get_mse(left)
            right_mse = get_mse(right)
            
            w = len(left) / len(dataset)
            mse_after = (w * left_mse) + ((1-w) * right_mse)
            
            info_gain = mse_before - mse_after

            if max_info_gain <= info_gain:
                max_info_gain = info_gain
                best_feature = feature
                best_threshold = threshold

    return best_feature, best_threshold, max_info_gain

In [24]:
class Terminal:

    def __init__(self, dataset):
        self.prediction = dataset['MEDV'].mean()

    def predict(self):
        return self.prediction

In [25]:
class Node:

    def __init__(self, feature, threshold, left_node, right_node):
        self.feature = feature
        self.threshold = threshold
        self.left_node = left_node
        self.right_node = right_node

In [26]:
def get_mse(dataset):
    
    prediction = dataset['MEDV'].mean()
    errors = []

    for i,entry in dataset.iterrows():
        errors.append(np.square(entry['MEDV'] - prediction))
    
    mse = pd.Series(errors).mean()
    return mse

In [27]:
def split_data(dataset, feature, threshold):    
    
    left = dataset[dataset[feature] < threshold]
    right = dataset[dataset[feature] >= threshold]
    
    return left, right

In [28]:
def build_tree(dataset, thresholds, depth, max_depth = 4):

    best_feature, best_threshold, info_gain = get_best_split(dataset, thresholds)
    
    if info_gain == 0 or depth >=max_depth:
        return Terminal(dataset)
    
    print('Split Selected: (Feature: {}, Threshold: {}, Info Gain: {})'.format(best_feature, best_threshold, info_gain))
    
    left_data, right_data = split_data(dataset, best_feature, best_threshold)

    left_node = build_tree(left_data, thresholds, depth+1)
    right_node = build_tree(right_data, thresholds, depth+1)

    return Node(best_feature, best_threshold, left_node, right_node)

In [29]:
# get thresholds for each feature
thresholds = get_thresholds(train)

In [30]:
## CREATE REGRESSION TREE    
model = build_tree(train, thresholds, 0, max_depth = 4)

Split Selected: (Feature: RM, Threshold: 0.7344318835025868, Info Gain: 34.32118719415788)
Split Selected: (Feature: LSTAT, Threshold: 0.22723509933774833, Info Gain: 16.982560209851442)
Split Selected: (Feature: DIS, Threshold: 0.03881548436377525, Info Gain: 19.654581386992465)
Split Selected: (Feature: RM, Threshold: 0.6203295650507761, Info Gain: 7.705330887868687)
Split Selected: (Feature: NOX, Threshold: 0.5781893004115226, Info Gain: 10.210471282494826)
Split Selected: (Feature: INDUS, Threshold: 0.2969208211143695, Info Gain: 2.888092449317483)
Split Selected: (Feature: LSTAT, Threshold: 0.4732339955849889, Info Gain: 8.978634903733223)
Split Selected: (Feature: PTRATIO, Threshold: 0.6063829787234043, Info Gain: 34.49256483328581)
Split Selected: (Feature: RM, Threshold: 0.7406591301015522, Info Gain: 12.508558814462491)
Split Selected: (Feature: LSTAT, Threshold: 0.11948123620309051, Info Gain: 0.5625000000000027)
Split Selected: (Feature: PTRATIO, Threshold: 0.234042553191489

In [31]:
def depth(root, num_nodes):
    if isinstance(root, Terminal):
        return 0
    
    num_nodes[0] += 1
    return 1+max(depth(root.left_node, num_nodes), depth(root.right_node, num_nodes))

In [32]:
num = [0] 
x = depth(model, num)
print('Number of Nodes: {} \nDepth of Tree: {}'.format(num, x))

Number of Nodes: [14] 
Depth of Tree: 4


In [33]:
def print_tree(root, spaces):
    
    if isinstance(root, Terminal):
        print(spaces + 'prediction: {}'.format(root.predict()))
        return
    
    print(spaces + 'NODE: Feature: {}, Threshold: {}'.format(root.feature, root.threshold))
    
    print(spaces + 'Left: ')
    print_tree(root.left_node, spaces + '  ')
    print(spaces + 'Right: ')
    print_tree(root.right_node, spaces + '  ')

In [34]:
print_tree(model, '')

NODE: Feature: RM, Threshold: 0.7344318835025868
Left: 
  NODE: Feature: LSTAT, Threshold: 0.22723509933774833
  Left: 
    NODE: Feature: DIS, Threshold: 0.03881548436377525
    Left: 
      prediction: 50.0
    Right: 
      NODE: Feature: RM, Threshold: 0.6203295650507761
      Left: 
        prediction: 24.029166666666665
      Right: 
        prediction: 30.06190476190476
  Right: 
    NODE: Feature: NOX, Threshold: 0.5781893004115226
    Left: 
      NODE: Feature: INDUS, Threshold: 0.2969208211143695
      Left: 
        prediction: 23.82571428571429
      Right: 
        prediction: 19.73333333333333
    Right: 
      NODE: Feature: LSTAT, Threshold: 0.4732339955849889
      Left: 
        prediction: 17.22553191489362
      Right: 
        prediction: 11.224999999999998
Right: 
  NODE: Feature: PTRATIO, Threshold: 0.6063829787234043
  Left: 
    NODE: Feature: RM, Threshold: 0.7406591301015522
    Left: 
      NODE: Feature: LSTAT, Threshold: 0.11948123620309051
      Left: 
 

In [35]:
def regress(root, entry):
    
    if isinstance(root, Terminal):
        return root.predict()
    
    if entry[root.feature] < root.threshold:
        result = regress(root.left_node, entry)
    else:
        result = regress(root.right_node, entry)
        
    return result

In [36]:
def test_model(model, test_data):
    
    predictions = []
    
    for i, entry in test_data.iterrows():
        predictions.append(regress(model, entry))
    
    errors = []
    
    for i,p in enumerate(predictions):
        errors.append(np.square(test_data.iloc[i]['MEDV'] - p))
    
    mse = pd.Series(errors).mean()
    return mse, predictions

In [37]:
train_mse, preds = test_model(model, train)
print('Train MSE: {}'.format(train_mse))

Train MSE: 17.716379042582876


In [38]:
final_mse, preds = test_model(model, test)
print('Test MSE: {}'.format(final_mse))

Test MSE: 25.079657412105785
