# Regression tree - Housing Price database

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

## Dara Preprocessing

- Adding the feature name to the data
- making sure there are no null values in the data

In [2]:
word_labels = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "label"]
train_df = pd.read_csv("HousingData/housing_train.txt", delim_whitespace=True, names = word_labels, header=None) 
test_df = pd.read_csv("HousingData/housing_test.txt", delim_whitespace=True, names = word_labels, header=None) 


In [3]:
# df.head()

In [4]:
# df['label'].replace(0, 'non-spam',inplace=True)
# df['label'].replace(1, 'spam',inplace=True)
# df.head()

# Train-Test-Split

In [5]:
def train_test_split(df, test_size):
    
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    indices = df.index.tolist()
    test_indices = random.sample(population=indices, k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    
    return train_df, test_df

In [6]:
random.seed(0)
d_train_df, d_test_df = train_test_split(train_df, test_size=20)

# Helper Functions

In [7]:
data = train_df.values
data[:5]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00, 2.4000e+01],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00, 2.1600e+01],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00, 3.4700e+01],
       [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9463e+02, 2.9400e+00, 3.3400e+01],
       [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9690e+02, 5.3300e+00, 3.6200e+01]])

### Check data purity 

In [8]:
def check_purity(data):
    
    label_column = data[:, -1]
    variance = np.var(label_column)

    if variance == 0:
        return True
    else:
        return False

### Classify

In [9]:
def classify_data(data):
    
    label_column = data[:, -1]
    mean_data = np.mean(label_column)
    classification = mean_data
    
    return classification

### Potential splits

In [10]:
def get_potential_splits(data):
    
    potential_splits = {}
    _, n_columns = data.shape
    for column_index in range(n_columns - 1):        # excluding the last column which is the label
        potential_splits[column_index] = []
        values = data[:, column_index]
        unique_values = np.unique(values)

        for index in range(len(unique_values)):
            if index != 0:
                current_value = unique_values[index]
                previous_value = unique_values[index - 1]
                potential_split = (current_value + previous_value) / 2
                
                potential_splits[column_index].append(potential_split)
    
    return potential_splits

### Split Data

In [11]:
def split_data(data, split_feature, threshold):
    
    split_column_values = data[:, split_feature]

    data_left = data[split_column_values <= threshold]
    data_right = data[split_column_values >  threshold]
    
    return data_left, data_right

### Lowest total variance

In [12]:
def calculate_variance(data):
    
    label_column = data[:, -1]
    variance = np.var(label_column)
     
    return variance

In [13]:
def calculate_overall_variance(data_left, data_right):
    
    n = len(data_left) + len(data_right)
    p_data_left = len(data_left) / n
    p_data_right = len(data_right) / n

    overall_variance =  (p_data_left * calculate_variance(data_left) 
                      + p_data_right * calculate_variance(data_right))
    
    return overall_variance

In [14]:
def determine_best_split(data, potential_splits):
    
    overall_variance = 9999
    best_split_column = 0
    best_split_value = 0.000
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_left, data_right = split_data(data, split_feature=column_index, threshold=value)
            current_overall_variance = calculate_overall_variance(data_left, data_right)

            if current_overall_variance <= overall_variance:
                overall_variance = current_overall_variance
                best_split_column = column_index
                best_split_value = value
    
    return best_split_column, best_split_value

## Decision Tree Algorithm

### Representation of the Decision Tree

### Algorithm

In [15]:
def decision_tree_algorithm(df, counter=0, min_samples=20, max_depth=9):
    

    if counter == 0:
        global COLUMN_HEADERS
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df           
    
    

    if (check_purity(data)) or (len(data) < min_samples) or (counter == max_depth):
        classification = classify_data(data)
        
        return classification

    

    else:    
        counter += 1


        potential_splits = get_potential_splits(data)
        split_feature, threshold = determine_best_split(data, potential_splits)
        data_left, data_right = split_data(data, split_feature, threshold)
        

        feature_name = COLUMN_HEADERS[split_feature]
        deciding_factor = "{} <= {}".format(feature_name, threshold)
        sub_tree = {deciding_factor: []}
        

        ans_true = decision_tree_algorithm(data_left, counter, min_samples, max_depth)
        ans_false = decision_tree_algorithm(data_right, counter, min_samples, max_depth)
        

        if ans_true == ans_false:
            sub_tree = ans_true
        else:
            sub_tree[deciding_factor].append(ans_true)
            sub_tree[deciding_factor].append(ans_false)
        
        return sub_tree

In [16]:
tree = decision_tree_algorithm(train_df)
pprint(tree)

{'RM <= 6.9715': [{'LSTAT <= 15.0': [{'DIS <= 1.43365': [45.58,
                                                         {'RM <= 6.543': [{'LSTAT <= 7.57': [{'TAX <= 222.5': [28.7,
                                                                                                               {'DIS <= 4.4638': [25.220000000000002,
                                                                                                                                  {'TAX <= 228.5': [21.45,
                                                                                                                                                    {'TAX <= 262.0': [25.0,
                                                                                                                                                                      23.4304347826087]}]}]}]},
                                                                                             {'TAX <= 208.0': [25.983333333333334,
                   

In [17]:
d_tree = decision_tree_algorithm(d_train_df)
pprint(tree)

{'RM <= 6.9715': [{'LSTAT <= 15.0': [{'DIS <= 1.43365': [45.58,
                                                         {'RM <= 6.543': [{'LSTAT <= 7.57': [{'TAX <= 222.5': [28.7,
                                                                                                               {'DIS <= 4.4638': [25.220000000000002,
                                                                                                                                  {'TAX <= 228.5': [21.45,
                                                                                                                                                    {'TAX <= 262.0': [25.0,
                                                                                                                                                                      23.4304347826087]}]}]}]},
                                                                                             {'TAX <= 208.0': [25.983333333333334,
                   

## Classification

In [18]:

example = test_df.iloc[0]
example

CRIM         0.84054
ZN           0.00000
INDUS        8.14000
CHAS         0.00000
NOX          0.53800
RM           5.59900
AGE         85.70000
DIS          4.45460
RAD          4.00000
TAX        307.00000
PTRATIO     21.00000
B          303.42000
LSTAT       16.51000
label       13.90000
Name: 0, dtype: float64

In [19]:
def classify_example(example, tree):
    deciding_factor = list(tree.keys())[0]
    feature_name, comparison_operator, value = deciding_factor.split(" ")

    # deciding_factor condition
    if example[feature_name] <= float(value):
        answer = tree[deciding_factor][0]
    else:
        answer = tree[deciding_factor][1]

    # if answer is not dictionary we have reached final step and return the answer
    if not isinstance(answer, dict):
        return answer
    
    # the subtree needs to be divided further
    else:
        return classify_example(example, answer)

In [20]:
classify_example(example, tree)

16.32941176470588

## Calculate RMSE for test data

In [21]:
def calculate_rmse(df, tree):

    df["prediction"] = df.apply(classify_example, axis=1, args=(tree,))
    
    rmse = np.mean((df["label"] - df["prediction"])**2)
    
    return rmse

In [22]:
rmse = calculate_rmse(test_df, tree)
rmse

44.658415539547526

In [23]:
rmse = calculate_rmse(d_test_df, d_tree)
rmse

12.331074519000666

In [24]:
test_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,label,prediction
0,0.84054,0.0,8.14,0,0.538,5.599,85.7,4.4546,4,307.0,21.0,303.42,16.51,13.9,16.329412
1,0.67191,0.0,8.14,0,0.538,5.813,90.3,4.6820,4,307.0,21.0,376.88,14.81,16.6,18.271429
2,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,4,307.0,21.0,306.38,17.28,14.8,16.329412
3,0.77299,0.0,8.14,0,0.538,6.495,94.4,4.4547,4,307.0,21.0,387.94,12.80,18.4,18.271429
4,1.00245,0.0,8.14,0,0.538,6.674,87.3,4.2390,4,307.0,21.0,380.23,11.98,21.0,26.494737
5,0.22927,0.0,6.91,0,0.448,6.030,85.5,5.6894,3,233.0,17.9,392.74,18.80,16.6,20.575000
6,0.25387,0.0,6.91,0,0.448,5.399,95.3,5.8700,3,233.0,17.9,396.90,30.81,14.4,20.575000
7,0.21977,0.0,6.91,0,0.448,5.602,62.0,6.0877,3,233.0,17.9,396.90,16.20,19.4,20.575000
8,0.10153,0.0,12.83,0,0.437,6.279,74.5,4.0522,5,398.0,18.7,373.66,11.97,20.0,20.528000
9,0.08707,0.0,12.83,0,0.437,6.140,45.8,4.0905,5,398.0,18.7,386.96,10.27,20.8,20.528000
