In [141]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Import House Data

In [2]:
houses=pd.read_csv("..//data//house_data.csv")

In [130]:
houses.head()
# in_sf column describes if the house is in san francisco or not
# rest of the columns are self explanatory

Unnamed: 0,in_sf,beds,bath,price,year_built,sqft,price_per_sqft,elevation,predicted_label
0,0,2.0,1.0,999000,1960,1000,999,10,0
1,0,2.0,2.0,2750000,2006,1418,1939,0,1
2,0,2.0,2.0,1350000,1900,2150,628,9,0
3,0,1.0,1.0,629000,1903,500,1258,9,0
4,0,0.0,1.0,439000,1930,500,878,10,0


In [142]:
X_train, X_test, y_train, y_test =train_test_split(
    houses.drop(['in_sf'],axis=1),houses['in_sf'],test_size=0.3,random_state=42)

In [145]:
houses_train=pd.concat([y_train,X_train],axis=1)

# Build Decision Tree Algorithm

In [12]:
#function to count the frequency of labels in dataset
def label_counts(data):
    label_count = {} 
    for row in data:
        label = row[0]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    return label_count

In [13]:
# function to calculate ginni index of a given data
def gini(data):
    label_count = label_counts(data)
    impurity = 1
    for label in label_count:
        label_prob = label_count[label] / float(len(data))
        impurity -= label_prob**2
    return impurity

In [85]:
# function to split dataset based on a column label and column value
def create_split(data,col,val):
    right = data[data[col]>val]
    left = data[data[col]<=val]
    return left,right

In [6]:
#function to calculate information gain acheived if the the data is split into two
def information_gain(left, right, impurity):
    p = float(len(left)) / (len(left) + len(right))
    gain = impurity - p * gini(left.values) - (1 - p) * gini(right.values)
    return gain

In [86]:
# builds a tree in for of python dictionary
def build_tree(data, label_col, tree_dict,depth,max_depth ):
    if depth==max_depth:
        end_dict={}
        end_dict["label_count"]=label_counts(data.values)
        end_dict['left']={}
        end_dict['right']={}
        return end_dict

    max_info_gain=-9999
    sel_col=None
    sel_val=None
    initial_impurity = gini(data.values)
    for col in data:
        if col != label_col:
            for val in data[col].unique().tolist():
                info_gain=0
                left , right = create_split(data,col,val)
                info_gain = information_gain(left,right,initial_impurity)

                if info_gain >max_info_gain:
                    max_info_gain=info_gain
                    sel_col = col
                    sel_val = val

    if sel_col==None:
        end_dict={}
        end_dict["label_count"]=label_counts(data.values)
        end_dict['left']={}
        end_dict['right']={}
        return end_dict

    tree_dict["col"]=sel_col
    tree_dict["val"]=sel_val
    tree_dict["label_count"]=label_counts(data.values)
    tree_dict["info_gain"]=max_info_gain
    tree_dict["impurity"]=initial_impurity
    left , right = create_split(data,sel_col,sel_val)
    tree_dict["left"]={}
    tree_dict["left"]=build_tree(left,label_col,tree_dict["left"],depth+1,max_depth)
    tree_dict["right"]={}
    tree_dict["right"]=build_tree(right,label_col,tree_dict["right"],depth+1,max_depth)

    return tree_dict

In [124]:
# function to predict labels for unlabelled data
def predict(decision_tree,data):
    traverse = True
    while traverse:
        col = decision_tree['col']
        val = decision_tree['val']
        if data[col]>val:
            decision_tree = decision_tree['right']
        else :
            decision_tree = decision_tree['left']
        if not decision_tree['right'] and not decision_tree['left']:
            traverse = False
    if 1.0 in decision_tree['label_count'] and 0.0 in decision_tree['label_count']:
        label_flag = 1 if decision_tree['label_count'][1.0]> decision_tree['label_count'][0.0] else 0
        return label_flag
    elif 1.0 in decision_tree['label_count']:
        return 1
    else:
        return 0
        

# Fit the training data

In [147]:
# fit the data
decision_tree=build_tree(houses_train,"in_sf",{},1,4)

In [125]:
# predict labels for training dataset
houses['predicted_label']=houses.apply(lambda row : predict(decision_tree,row),axis =1)

# Evaluate algorithm

In [136]:
# calculate accuracy on training dataset
train_prediction = houses['in_sf']==houses['predicted_label']
print("Training Accuracy : ",round(train_prediction.mean()*100,2),"%")

Training Accuracy :  89.63 %


In [155]:
# Predict house labels on test dataset
prediction_label=X_test.apply(lambda row : predict(decision_tree,row),axis =1)

In [157]:
# Calculate accuracy on Test Dataset
test_prediction=prediction_label.values==y_test
print("Test Accuracy : ",round(test_prediction.mean()*100,2),"%")

Test Accuracy :  90.54 %
