In [50]:
import pandas as pd
import numpy as np
eps = np.finfo(float).eps
from numpy import log2 as log

In [51]:
data = pd.read_csv("data.csv", index_col="RID")
data = data.drop('age', axis=1)
data.head(10)

Unnamed: 0_level_0,income,student,credit rating,buys computer
RID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,high,no,excellent,yes
2,high,no,good,yes
3,high,no,excellent,no
4,medium,no,excellent,no
5,low,yes,excellent,no
6,low,yes,good,yes
7,low,yes,good,yes
8,medium,no,excellent,yes
9,low,yes,excellent,yes
10,medium,yes,excellent,no


In [52]:
data['buys computer'].mode().item()

'yes'

In [60]:
def find_entropy(df):
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    entropy = 0
    values = df[Class].unique()
    for value in values:
        fraction = df[Class].value_counts()[value]/len(df[Class])
        entropy += -fraction*np.log2(fraction)
    return entropy

In [61]:
def find_entropy_attribute(df,attribute):
  Class = df.keys()[-1]   #To make the code generic, changing target variable class name
  target_variables = df[Class].unique()  #This gives all 'Yes' and 'No'
  variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in Temperature)
  entropy2 = 0
  for variable in variables:
      entropy = 0
      for target_variable in target_variables:
          num = len(df[attribute][df[attribute]==variable][df[Class] ==target_variable])
          den = len(df[attribute][df[attribute]==variable])
          fraction = num/(den+eps)
          entropy += -fraction*log(fraction+eps)
      fraction2 = den/len(df)
      entropy2 += fraction2*entropy
  return abs(entropy2)

In [62]:
def find_winner(df, att_list):
    IG = []
    for key in att_list:
        IG.append(find_entropy(df)-find_entropy_attribute(df,key))
    return att_list[np.argmax(IG)]

def get_subtable(df, node,value):
  return df[df[node] == value].reset_index(drop=True)

In [63]:
def buildTree(df, att_list, tree=None): 
    Class = df.keys()[-1]   #To make the code generic, changing target variable class name
    
    #Here we build our decision tree
    
    #Get attribute with maximum information gain
    if len(att_list) == 0:
        return data[Class].mode().item() 
    
    node = find_winner(df, att_list)
    att_list.remove(node)
    print(node)
    #Get distinct value of that attribute e.g Salary is node and Low,Med and High are values
    attValue = np.unique(df[node])
    
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree={}
        tree[node] = {}
    
   #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 

    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable[Class],return_counts=True)                        
        print("count:", counts)
        if len(counts)==1:#Checking purity of subset
            tree[node][value] = clValue[0]                                                    
        else:        
            tree[node][value] = buildTree(subtable, att_list.copy()) #Calling the function recursively 
                   
    return tree

In [64]:
att_list = list(data.keys()[:-1])
att_list

['income', 'student', 'credit rating']

In [65]:
att_list = list(data.keys()[:-1])
t = buildTree(data, att_list)

credit rating
count: [11  7]
income
count: [2 4]
student
count: [1 1]
count: [1 3]
count: [4 1]
student
count: [2]
count: [2 1]
count: [5 2]
student
count: [3 1]
count: [2 1]
count: [ 2 10]
student
count: [2 5]
income
count: [2]
count: [1]
count: [1 3]
count: [5]


In [66]:
import pprint

pprint.pprint(t)

{'credit rating': {'excellent': {'income': {'high': {'student': {'no': 'yes',
                                                                 'yes': 'yes'}},
                                            'low': {'student': {'no': 'no',
                                                                'yes': 'yes'}},
                                            'medium': {'student': {'no': 'yes',
                                                                   'yes': 'yes'}}}},
                   'good': {'student': {'no': {'income': {'high': 'yes',
                                                          'low': 'no',
                                                          'medium': 'yes'}},
                                        'yes': 'yes'}}}}
