In [18]:
import pandas as pd
import math

# Column names for the car evaluation data; the raw file carries no header row,
# so the names are supplied explicitly.
colNames = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
carData = pd.read_csv('car.data', names = colNames, header = None)

# Attribute values as they are spelled in the data file (no hyphens, e.g. 'vhigh'):
#
#    buying       vhigh, high, med, low
#    maint        vhigh, high, med, low
#    doors        2, 3, 4, 5more
#    persons      2, 4, more
#    lug_boot     small, med, big
#    safety       low, med, high
#
#    class      N          N[%]
#    -----------------------------
#    unacc     1210     (70.023 %)
#    acc        384     (22.222 %)
#    good        69     ( 3.993 %)
#    vgood       65     ( 3.762 %)
#
#    Number of Instances: 1728
carData.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


$$ Ent(S) = \sum_{i=1}^{c} -p_i \log_{c} p_i = 0.602870485 $$

In [19]:
def gain(data, attribute, initEntropy, classCol = 'class', classes = ['unacc', 'acc', 'good', 'vgood']):
    """Return the information gain obtained by splitting ``data`` on ``attribute``.

    The result is ``initEntropy`` minus the weighted sum of the entropies of
    the subsets that share one value of ``attribute``. All logarithms use
    base ``len(classes)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns ``attribute`` and ``classCol``.
    attribute : str
        Name of the column to split on.
    initEntropy : float
        Entropy of the class distribution before the split.
    classCol : str
        Name of the column holding the class label.
    classes : sequence
        Every class label that may occur in ``classCol``. It is only read,
        never modified.

    Returns
    -------
    float
        ``initEntropy`` reduced by the weighted subset entropies. When
        ``data`` has no rows, ``initEntropy`` is returned unchanged.

    Raises
    ------
    ValueError
        If a row carries a class label that is not contained in ``classes``.
    """
    base = len(classes)
    instances = data.shape[0]

    # Position of each class label. setdefault keeps the first position when a
    # label is listed twice, which is what list.index would return.
    classIndex = {}
    for position, label in enumerate(classes):
        classIndex.setdefault(label, position)

    # attribute value -> [count of class 1, count of class 2, ...],
    # e.g. vhigh: [360, 72, 0, 0]
    classCounts = {}
    # Walk the two relevant columns directly instead of building a Series per
    # row with iterrows().
    for attributeValue, label in zip(data[attribute], data[classCol]):
        if label not in classIndex:
            raise ValueError(repr(label) + ' is not in classes')
        counts = classCounts.setdefault(attributeValue, [0] * base)
        counts[classIndex[label]] += 1

    remaining = initEntropy
    for counts in classCounts.values():
        instancesLeft = sum(counts)
        entropy = 0
        for count in counts:
            # Empty classes contribute nothing, and a class that fills the
            # whole subset has probability 1 and therefore zero entropy.
            # Skipping the latter also avoids math.log(1, 1), which divides
            # by zero when only a single class is configured.
            if 0 < count < instancesLeft:
                prob = count / instancesLeft
                entropy -= prob * math.log(prob, base)
        remaining -= (instancesLeft / instances) * entropy
    return remaining

In [20]:
# Report the information gain of every attribute, using the entropy of the
# full data set (0.602870485) as the starting point.
for attribute in ('buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'):
    attributeGain = gain(carData, attribute, 0.602870485)
    print ('gain \"' + attribute + '\"\t =  ' + str(attributeGain))

gain "buying"	 =  0.04822448457871922
gain "maint"	 =  0.03685197345465521
gain "doors"	 =  0.002242858307228368
gain "persons"	 =  0.10983148166386641
gain "lug_boot"	 =  0.015004070617715026
gain "safety"	 =  0.13109217827104425


### safety -> highest information gain

In [14]:
class Node:
    """A tree node holding a payload and a mapping of named child nodes."""

    def __init__(self, data = 'root'):
        # Every instance gets its own, initially empty, child mapping
        # (edge label -> Node).
        self.childs = dict()
        # Payload shown when the node is printed.
        self.data = data

    def __str__(self):
        # A node is displayed as the text form of its payload.
        text = str(self.data)
        return text

def printTree(node, level = 0, path = ''):
    """Print ``node`` and its whole subtree as a tab-indented outline.

    Parameters
    ----------
    node : object
        Node to print; it must offer ``childs`` (a mapping of edge label to
        child node) and be convertible with ``str``.
    level : int
        Depth of ``node`` in the tree. 0 marks the root, which is printed
        with a ``Tree: `` prefix instead of a branch marker.
    path : object
        Label of the edge leading to ``node``. It is converted with ``str``,
        so non-string keys such as numeric attribute values are accepted.
    """
    if level == 0:
        print('Tree: ' + str(node))
    else:
        # str(path) keeps the concatenation valid when the child key is not a
        # string.
        print ('\t' * level + '└' + '──' + str(path) + '──' + ' ' + str(node))
    # Leaves have an empty (falsy) child mapping, so the recursion stops there.
    if node.childs:
        for key, child in node.childs.items():
            printTree(child, level + 1, key)

# Hand-built example tree to try out printTree: the root has two children,
# and the first of them has three leaves.
root = Node()
root.childs.update({'a1': Node('B'), 'a2': Node('+')})
root.childs['a1'].childs.update({'b1': Node('-'), 'b2': Node('-'), 'b3': Node('*')})
printTree(root)

Tree: root
	└──a1── B
		└──b1── -
		└──b2── -
		└──b3── *
	└──a2── +
