In [160]:
from random import randrange
from csv import reader
import math
import numpy as np
from numpy import log2 as log
import pandas as pd
import matplotlib.pyplot as plt

In [342]:
df = pd.read_csv('NURSERY_data.csv')
df

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,classname
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority
...,...,...,...,...,...,...,...,...,...
12955,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,priority,spec_prior
12956,great_pret,very_crit,foster,more,critical,inconv,slightly_prob,not_recom,not_recom
12957,great_pret,very_crit,foster,more,critical,inconv,problematic,recommended,spec_prior
12958,great_pret,very_crit,foster,more,critical,inconv,problematic,priority,spec_prior


In [162]:
train_data = df.sample(n = int(2*df.shape[0]/3), replace = True, random_state = 1)
train_data.reset_index(inplace = True) 
train_data=train_data.drop('index', axis=1)
train_data

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,classname
0,usual,proper,completed,1,less_conv,convenient,nonprob,priority,priority
1,great_pret,very_crit,complete,2,less_conv,convenient,slightly_prob,priority,spec_prior
2,pretentious,less_proper,complete,1,convenient,convenient,problematic,not_recom,not_recom
3,usual,less_proper,complete,1,critical,convenient,slightly_prob,not_recom,not_recom
4,great_pret,improper,incomplete,3,critical,inconv,nonprob,not_recom,not_recom
...,...,...,...,...,...,...,...,...,...
8635,pretentious,less_proper,incomplete,3,less_conv,convenient,slightly_prob,recommended,priority
8636,great_pret,critical,incomplete,3,convenient,inconv,problematic,not_recom,not_recom
8637,usual,critical,incomplete,3,less_conv,convenient,problematic,priority,spec_prior
8638,great_pret,improper,completed,3,critical,convenient,problematic,not_recom,not_recom


In [270]:
test_data = df.sample(n = int(df.shape[0]/3), replace = True, random_state = 2)
test_data.reset_index(inplace = True) 
test_data=test_data.drop('index', axis=1)
test_data

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,classname
0,pretentious,critical,completed,more,critical,inconv,nonprob,priority,spec_prior
1,usual,improper,foster,more,critical,convenient,nonprob,priority,priority
2,pretentious,improper,incomplete,3,critical,inconv,slightly_prob,priority,spec_prior
3,great_pret,critical,incomplete,3,less_conv,convenient,problematic,not_recom,not_recom
4,usual,improper,foster,3,less_conv,inconv,slightly_prob,recommended,priority
...,...,...,...,...,...,...,...,...,...
4315,pretentious,improper,completed,more,convenient,convenient,nonprob,not_recom,not_recom
4316,pretentious,improper,incomplete,3,less_conv,convenient,nonprob,priority,spec_prior
4317,pretentious,proper,foster,3,convenient,convenient,problematic,recommended,priority
4318,usual,less_proper,foster,more,critical,inconv,problematic,recommended,priority


In [280]:
def find_entropy(df):
    Class_name = df.keys()[-1]   #To make the code generic, changing target variable class name
    entropy = 0
    values = df[Class_name].unique()
    for value in values:
        fraction = df[Class_name].value_counts()[value]/len(df[Class_name])
        entropy += -fraction*np.log2(fraction)
    return entropy



def find_entropy_attribute(df,attribute):
    eps = np.finfo(float).eps
    classname = df.keys()[-1]   #To make the code generic, changing target variable class name
    target_variables = df.classname.unique()  #This gives all 'Yes' and 'No'
    variables = df[attribute].unique()    #This gives different features in that attribute (like 'Sweet')
    entropy_attribute = 0
    for variable in variables:
        entropy_each_feature = 0
        den = len(df[attribute][df[attribute] == variable])  #denominator
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute] == variable][df.classname == target_variable]) #numerator
            fraction = num / (den + eps)  #pi
            entropy_each_feature += -fraction*log(fraction + eps) #This calculates entropy for one feature like 'Sweet'
        fraction2 = den / len(df)
        entropy_attribute += fraction2 * entropy_each_feature   #Sums up all the entropy ETaste
    return abs(entropy_attribute)
    

def get_subtable(df, node,value):
    return df[df[node] == value].reset_index(drop=True)



def find_winner(df):
    Entropy_att = []
    IG = []
    for key in df.keys()[:-1]:
        IG.append(find_entropy(df)-find_entropy_attribute(df,key))
    return df.keys()[:-1][np.argmax(IG)]

In [281]:
def buildTree(df,count,tree = None): 
    classname = df.keys()[-1]   #To make the code generic, changing target variable class name
    #Here we build our decision tree
    
    if count == 8:
        clValue1,counts1 = np.unique(df['classname'],return_counts = True)
        return clValue1[0]
    #Get attribute with maximum information gain
    node = find_winner(df)
    #Get distinct value of that attribute e.g Salary is node and Low,Med and High are values
    attValue = np.unique(df[node])
    
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree = {}
        tree[node] = {}
    
   #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 
    
    clValue2,counts2 = np.unique(df['classname'],return_counts=True)
    tree[node]['max_prob'] = clValue2[0]

    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable['classname'],return_counts=True)                               
        
        if len(counts) == 1:#Checking purity of subset
            tree[node][value] = clValue[0]
        else:        
            tree[node][value] = buildTree(subtable, count + 1) #Calling the function recursively
                   
    return tree

In [260]:
tree = buildTree(train_data,0)

In [262]:
def predict(inst,tree):
    #This function is used to predict for any input variable 
    
    #Recursively we go through the tree that we built earlier
    for nodes in tree.keys():        
        
        prediction = 0
        value = inst[nodes]
        if value in tree[nodes].keys():
            tree = tree[nodes][value]
            if type(tree) is dict:
                prediction = predict(inst, tree)
            else:
                prediction = tree
                break;    
        else:
            prediction = tree[nodes]['max_prob']
            break;
    return prediction

In [272]:
classnames = df.classname.unique()
i = 0
dictclass = {}
for item in classnames:
    dictclass[item] = i
    i +=1
print(dictclass)

{'recommend': 0, 'priority': 1, 'not_recom': 2, 'very_recom': 3, 'spec_prior': 4}


In [275]:
confusion_matrix = np.zeros([5,5])
for i in range(4320):
    confusion_matrix[dictclass[test_data.classname[i]],dictclass[predict(test_data.iloc[i],tree)]] += 1
df_confusion_matrix = pd.DataFrame(confusion_matrix, columns = classnames)
df_confusion_matrix.set_index(classnames, inplace = True)
print(df_confusion_matrix)

            recommend  priority  not_recom  very_recom  spec_prior
recommend         0.0       0.0        0.0         0.0         0.0
priority          0.0    1387.0        0.0         8.0        10.0
not_recom         0.0       0.0     1446.0         0.0         0.0
very_recom        2.0      16.0        0.0        91.0         0.0
spec_prior        0.0      39.0        0.0         0.0      1321.0


In [287]:
K = [2,3,4,5,6,7,8,9,10]
def calculate_accuracy(df_train,df_test):
    tree = buildTree(df_train,0)
    accuracy = 0
    for i in range(len(df_test["classname"])):
        if predict(df_test.iloc[i],tree) == df_test["classname"][i]:
            accuracy += 1
    accuracy = float(accuracy/len(df_test["classname"]))
    return accuracy

In [288]:
calculate_accuracy(train_data, test_data)


0.9826388888888888

In [381]:
def data_split(df1, k, i):
    if i == k:
        df = df1.copy()
        test_data = df[df.index >= df.index[(i-1)*int(len(df.index)/k)]].reset_index(drop=True)
        df = df1.copy()
        train_data = df[df.index < df.index[(i-1)*int(len(df.index)/k)]].reset_index(drop=True)
    else:
        int1 = (i-1)*int(len(df1.index)/k)
        int2 = i*int(len(df1.index)/k)
        df = df1.copy()
        df = df[df.index < df.index[int2]].reset_index(drop=True)
        test_data = df[df.index >= df.index[int1]].reset_index(drop=True)
        df = df1.copy()
        train_data = df.drop(list(range(int1,int2)),axis = 0)
    return train_data, test_data

In [382]:
def accuracy_kfold(k, df):
    accuracy = 0
    for i in range(k):
        train_datak, test_datak = data_split(df,k,i+1)
        accuracy += calculate_accuracy(train_datak, test_datak)
    accuracy = accuracy/k
    print(accuracy)

In [386]:
accuracy_kfold(3,df)

0.76929012345679


In [387]:
for i in range(8):
    accuracy_kfold(i+2,df)

0.7433641975308642
0.76929012345679
0.7729938271604938
0.7401234567901235
0.8177469135802468
0.8537330223220295
0.8402777777777778
0.8703703703703703


In [389]:
for i in range(10, 21):
    accuracy_kfold(i,df)

0.8339506172839506
0.8860987649799482
0.9166666666666666
0.9143533842329024
0.9266409266409267
0.76929012345679
0.9220679012345678


KeyboardInterrupt: 