### Regression tree for Housing Data

In [57]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import random
from pprint import pprint

In [58]:
# Column names for the housing files, which are whitespace-delimited and have no header row.
# 'MEDV' (last column) is the target used by the tree below.
word_labels = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT", "MEDV"]
# sep=r"\s+" replaces the deprecated delim_whitespace=True (same parsing behaviour).
train_df = pd.read_csv("HousingData/housing_train.txt", sep=r"\s+", names=word_labels, header=None)
test_df = pd.read_csv("HousingData/housing_test.txt", sep=r"\s+", names=word_labels, header=None)

In [59]:
# Preview the first rows of the training frame.
train_df.head()


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [74]:
# #column wise range
# df_range = train_df.apply(max) - train_df.apply(min)
# df_mean = train_df()

In [98]:
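# NOTE: the min-max scaling below is commented out, so on a fresh run train_df stays unscaled;
# the scaled table recorded as this cell's output comes from an earlier execution.
# train_df = (train_df - train_df.min()) / (train_df.max() - train_df.min())
# train_df.head()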

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.0,0.18,0.073442,0.0,0.304802,0.577505,0.641607,0.269203,0.0,0.227557,0.313953,1.0,0.092199,0.422222
1,0.000236,0.0,0.262406,0.0,0.160752,0.547998,0.782698,0.348962,0.043478,0.114823,0.604651,1.0,0.210213,0.368889
2,0.000236,0.0,0.262406,0.0,0.160752,0.694386,0.599382,0.348962,0.043478,0.114823,0.604651,0.989737,0.065248,0.66
3,0.000293,0.0,0.068281,0.0,0.137787,0.658555,0.441813,0.448545,0.086957,0.073069,0.709302,0.994276,0.034326,0.631111
4,0.000705,0.0,0.068281,0.0,0.137787,0.687105,0.528321,0.448545,0.086957,0.073069,0.709302,1.0,0.102128,0.693333


In [99]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.84054,0.0,8.14,0,0.538,5.599,85.7,4.4546,4,307.0,21.0,303.42,16.51,13.9
1,0.67191,0.0,8.14,0,0.538,5.813,90.3,4.682,4,307.0,21.0,376.88,14.81,16.6
2,0.95577,0.0,8.14,0,0.538,6.047,88.8,4.4534,4,307.0,21.0,306.38,17.28,14.8
3,0.77299,0.0,8.14,0,0.538,6.495,94.4,4.4547,4,307.0,21.0,387.94,12.8,18.4
4,1.00245,0.0,8.14,0,0.538,6.674,87.3,4.239,4,307.0,21.0,380.23,11.98,21.0


In [100]:
# data = train_df.values
# data[:5]

In [101]:
# Mean of the last column of the training frame; used as the fallback
# prediction when a query value has no matching branch in the tree.
mean_data = train_df.iloc[:, -1].mean()

In [92]:
def var(data, split_attribute_name, target_name="MEDV"):
    """Weighted within-group variance of the target after splitting on a feature.

    The rows of ``data`` are grouped by each distinct value of
    ``split_attribute_name``. Each group's sample variance (``ddof=1``) of
    ``target_name`` is weighted by the fraction of rows that fall in the group,
    and the weighted variances are summed. A lower score means the split
    leaves less spread in the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both named columns.
    split_attribute_name : str
        Column whose distinct values define the groups.
    target_name : str, default "MEDV"
        Column whose variance is measured.

    Returns
    -------
    float
        The weighted sum of group variances (0.0 for an empty frame).
    """
    total_rows = len(data)
    if total_rows == 0:
        return 0.0

    feature_variance = 0.0
    # groupby matches on the actual column values, so it works for any column
    # name and avoids building a query string per value.
    for _, targets in data.groupby(split_attribute_name)[target_name]:
        # A single observation has no spread; with ddof=1 its variance would be
        # NaN and would turn the whole sum into NaN, so count it as zero.
        if len(targets) < 2:
            continue
        feature_variance += (len(targets) / total_rows) * targets.var(ddof=1)
    return feature_variance

In [93]:
def Classification(data,originaldata,features,min_instances,target_attribute_name,parent_node_class = None):
    """Recursively grow a regression tree stored as nested dictionaries.

    Each internal node is ``{feature_name: {feature_value: subtree_or_leaf}}``
    with one branch per distinct value of the chosen feature; each leaf is a
    mean of the target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reached the current node.
    originaldata : pandas.DataFrame
        The full training frame, used for the fallback mean of an empty node.
    features : sequence of str
        Candidate column names still available for splitting.
    min_instances : int
        Stopping criterion: a node with this many rows or fewer becomes a leaf.
    target_attribute_name : str
        Name of the target column.
    parent_node_class : float, optional
        Mean target value of the calling (parent) node.

    Returns
    -------
    dict or float
        A nested dictionary for an internal node, or a mean target value for a leaf.

    Notes
    -----
    Splitting creates one branch per *distinct* feature value, so a feature
    with mostly unique values yields one tiny leaf per value.
    """
    # An empty partition has no rows to average, so fall back to the mean of
    # the full training data. This must be tested before the min_instances
    # check, which would otherwise catch len(data) == 0 and return NaN.
    if len(data) == 0:
        return np.mean(originaldata[target_attribute_name])

    # Stopping criterion: too few rows to split further -> leaf with the node mean.
    if len(data) <= int(min_instances):
        return np.mean(data[target_attribute_name])

    # No features left to split on: use the parent's mean, or this node's own
    # mean when there is no parent (e.g. a root call with no features).
    if len(features) == 0:
        if parent_node_class is None:
            return np.mean(data[target_attribute_name])
        return parent_node_class

    # Mean of the current node, handed down to the children.
    parent_node_class = np.mean(data[target_attribute_name])

    # Score every candidate feature and keep the one with the lowest weighted variance.
    feature_scores = [var(data, feature, target_attribute_name) for feature in features]
    best_feature = features[int(np.argmin(feature_scores))]

    # The chosen feature is not reused further down this branch.
    remaining_features = [name for name in features if name != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        # Boolean indexing keeps dtypes intact and does not discard rows that
        # merely contain NaN in some unrelated column.
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = Classification(
            sub_data,
            originaldata,
            remaining_features,
            min_instances,
            target_attribute_name,
            parent_node_class=parent_node_class,
        )

    return tree
    
    

In [94]:
"""
Predict query instances
"""
    
def predict(query,tree,default = mean_data):
    """Predict the target for one query by walking the nested-dict tree.

    Parameters
    ----------
    query : dict
        Mapping of feature name -> feature value for a single instance.
    tree : dict
        Node of the form ``{feature_name: {feature_value: subtree_or_leaf}}``.
    default : float
        Value returned when the query cannot be routed through the tree
        (no branch for the query's value, or no query feature at this node).

    Returns
    -------
    float
        The leaf value reached, or ``default``.
    """
    for key in query:
        if key in tree:
            try:
                result = tree[key][query[key]]
            except (KeyError, TypeError):
                # No branch for this value (or an unhashable value): fall back.
                return default
            if isinstance(result, dict):
                # Pass the caller's default on so nested misses use the same fallback.
                return predict(query, result, default)
            return result
    # None of the query's features is the split feature of this node.
    return default

In [95]:
def test(data,tree):
    """Return the root mean square error of the tree's predictions on ``data``.

    Every column except the last is treated as a feature and the last column
    as the true target. Each row is turned into a feature dictionary and
    passed to ``predict`` with ``mean_data`` as the fallback value.
    """
    # One {column: value} dictionary per row, without the target column.
    feature_rows = data.iloc[:, :-1].to_dict(orient = "records")

    # Tree prediction for every row, in row order.
    predicted = [predict(row, tree, mean_data) for row in feature_rows]

    # Squared residuals, averaged over the rows, then square-rooted.
    squared_error = (data.iloc[:, -1] - predicted) ** 2
    return np.sqrt(np.sum(squared_error / len(data)))

In [96]:
# Grow the tree on the training frame: every column except the last is a candidate
# feature, 5 is passed as min_instances, and 'MEDV' is the target column.
tree = Classification(train_df,train_df,train_df.columns[:-1],5,'MEDV')
pprint(tree)


{'CRIM': {-0.038967158266586026: 0.025173210161662685,
          -0.03893636132721733: 0.20739543238388503,
          -0.03889196427958728: 0.2185065434949961,
          -0.03889084030369791: 0.27850654349499604,
          -0.03888533282184001: -0.08816012317167066,
          -0.038882972472472334: 0.6029509879394405,
          -0.038877240195436556: 0.1940620990505516,
          -0.038876453412313994: 0.13850654349499603,
          -0.03886948476179991: 0.31961765460610714,
          -0.03886532605100925: 0.46961765460610716,
          -0.03884610606330105: 0.16072876571721828,
          -0.038838350629664405: 0.22295098793944046,
          -0.03882801005148222: 0.005173210161662722,
          -0.038818905846778336: 0.2251732101616627,
          -0.038812386786619996: 0.6029509879394405,
          -0.038807216497528906: 0.04072876571721823,
          -0.03879350399167861: 0.43183987682832925,
          -0.03879238001578924: 0.18295098793944053,
          -0.03875742436562988: -0.14149

In [97]:
# Evaluate the tree on the test frame and report its RMSE.
# NOTE(review): the recorded tree output shows negative CRIM keys while test_df.head()
# shows raw positive values, so the recorded run presumably used a tree fitted on a
# transformed train_df; most lookups would then miss and fall back to the default.
# Re-run from a fresh kernel to confirm the reported number.
print('Root mean square error (RMSE): ',test(test_df,tree))

Root mean square error (RMSE):  22.13561090799778
