In [1]:
import pandas as pd
import numpy as np

# Load the training examples from the Excel workbook. The path is relative, so the
# file is looked up in the kernel's current working directory.
Data = pd.read_excel(r'PlayTennis.xlsx')
# Show the full table; the recorded output has 14 rows with the columns
# outlook, temperature, humidity, wind and playtennis.
print(Data)

     outlook temperature humidity    wind playtennis
0      sunny         hot     high    weak         no
1      sunny         hot     high  strong         no
2   overcast         hot     high    weak        yes
3       rain        mild     high    weak        yes
4       rain        cool   normal    weak        yes
5       rain        cool   normal  strong         no
6   overcast        cool   normal  strong        yes
7      sunny        mild     high    weak         no
8      sunny        cool   normal    weak        yes
9       rain        mild   normal    weak        yes
10     sunny        mild   normal  strong        yes
11  overcast        mild     high  strong        yes
12  overcast         hot   normal    weak        yes
13      rain        mild     high  strong         no


In [2]:

#Calculating the entropy for the whole dataset (Based on no of yes's and no's)
def totalEntropy(df, labelname, labellist):
    """Return the base-2 Shannon entropy of the class labels in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to measure; must contain the column ``labelname``.
    labelname : str
        Name of the class-label column.
    labellist : iterable
        Class values to account for. A value that does not occur in ``df``
        contributes nothing to the result.

    Returns
    -------
    float
        ``-sum(p_c * log2(p_c))`` over the classes of ``labellist`` present in
        ``df``, where ``p_c`` is the fraction of rows labelled ``c``.
        ``0.0`` when ``df`` has no rows.
    """
    rows = df.shape[0]
    entropy = 0.0

    # An empty frame has no label distribution; avoid dividing by zero below.
    if rows == 0:
        return entropy

    for cname in labellist:
        cnt = df[df[labelname] == cname].shape[0]
        if cnt == 0:
            # p * log2(p) tends to 0 as p -> 0, but evaluating it literally
            # gives 0 * -inf = nan, which would poison the whole sum.
            continue
        classProb = cnt / rows
        entropy -= classProb * np.log2(classProb)

    return entropy

0.9402859586706311


In [6]:
#Calculating the entropy of the class labels within one subset of rows (e.g. the rows that share a single feature value)
def specificEntropy(fval, labelname, labellist):
    """Return the base-2 entropy of the class labels within the rows ``fval``.

    ``fval`` is a DataFrame (typically the rows sharing one feature value),
    ``labelname`` is its class-label column and ``labellist`` the class values
    to count. Classes with no rows in ``fval`` are skipped, so a pure or empty
    subset yields 0.
    """
    size = fval.shape[0]

    # Number of rows carrying each class, in the order given by labellist.
    hits = [fval[fval[labelname] == cname].shape[0] for cname in labellist]

    # Only classes that actually occur have a probability to take a log of.
    shares = [h / size for h in hits if h != 0]

    return sum(-p * np.log2(p) for p in shares)


0.9709505944546686


In [8]:
#Calculating the information gain obtained by splitting the data on a single feature
def infoGain(fname, df, labelname, labellist):
    """Return the information gain of splitting ``df`` on the feature ``fname``.

    The gain is the label entropy of ``df`` minus the weighted average of the
    label entropies of the subsets produced by each distinct value of
    ``fname`` (each weight being that subset's share of the rows).
    """
    total_rows = df.shape[0]

    def weighted_entropy(value):
        # Rows that take this particular value of the feature.
        subset = df[df[fname] == value]
        share = subset.shape[0] / total_rows
        return share * specificEntropy(subset, labelname, labellist)

    remainder = sum((weighted_entropy(v) for v in df[fname].unique()), 0.0)

    return totalEntropy(df, labelname, labellist) - remainder


0.24674981977443933


In [9]:
#Finding the most informative feature 
def findMaxGain(df, labelname, labellist):
    """Return the name of the feature column with the highest information gain.

    Every column except ``labelname`` is a candidate. On a tie the earliest
    column wins. ``None`` is returned when there are no candidates or when no
    candidate's gain exceeds the initial floor of -1.
    """
    best_feature, best_gain = None, -1

    for candidate in df.columns.drop(labelname):
        gain = infoGain(candidate, df, labelname, labellist)
        # Strict comparison: a later feature must beat, not merely match, the best so far.
        if gain > best_gain:
            best_gain, best_feature = gain, candidate

    return best_feature


outlook


In [10]:
#Building the branches for the chosen feature: a value whose rows all share one class becomes a leaf, any other value is marked "?" for further splitting
def makeSubtree(fname, df, labelname, labellist):
    """Build the one-level branch mapping for the feature ``fname``.

    Parameters
    ----------
    fname : str
        Feature column to branch on.
    df : pandas.DataFrame
        Rows reaching this node; must contain ``fname`` and ``labelname``.
    labelname : str
        Name of the class-label column.
    labellist : iterable
        Class values to test each branch against.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``tree`` maps every value of ``fname`` either to the single class that
        all of its rows carry (a leaf) or to the placeholder ``"?"`` when the
        rows are mixed. The returned frame is ``df`` with the rows of all leaf
        values removed, i.e. only the rows that still need splitting.
    """
    fvalcnt = df[fname].value_counts(sort=False)
    tree = {}

    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (value, count) pairs and is available in older releases as well.
    for fval, cnt in fvalcnt.items():
        fvalData = df[df[fname] == fval]

        isPure = False

        for cname in labellist:
            classcnt = fvalData[fvalData[labelname] == cname].shape[0]

            # All rows of this feature value belong to one class -> leaf.
            if classcnt == cnt:
                tree[fval] = cname
                df = df[df[fname] != fval]  # settled rows need no further splitting
                isPure = True
        if not isPure:
            tree[fval] = "?"
    return tree, df


In [11]:
def makeDTree(root, prev, df, labelname, labellist):
    """Recursively grow the decision tree in place inside the dict ``root``.

    Parameters
    ----------
    root : dict
        Dictionary to attach the new node to; it is mutated in place.
    prev : hashable or None
        Key of ``root`` (a feature value marked ``"?"``) that the new node
        replaces, or ``None`` when the node is attached directly to ``root``
        under the chosen feature's name.
    df : pandas.DataFrame
        Rows reaching this node. Nothing happens when it is empty.
    labelname : str
        Name of the class-label column.
    labellist : iterable
        Class values, forwarded to ``findMaxGain`` and ``makeSubtree``.

    Notes
    -----
    The feature used for a split is removed from the data handed to the child
    nodes. Without that, rows with identical feature values but different
    labels would be re-split on the same feature forever. When a mixed branch
    has no feature left to split on, it becomes a leaf holding its most
    frequent label (ties resolved by the sorted order of ``Series.mode``).
    """
    if df.shape[0] == 0:
        return

    if len(df.columns.drop(labelname)) == 0:
        # Every feature on this path is used up but the labels still disagree:
        # settle for the majority class instead of recursing without end.
        if prev is not None:
            root[prev] = df[labelname].mode().iloc[0]
        return

    maxInfo = findMaxGain(df, labelname, labellist)
    tree, df = makeSubtree(maxInfo, df, labelname, labellist)

    if prev is not None:
        # Replace the "?" placeholder with a nested {feature: branches} node.
        root[prev] = {maxInfo: tree}
    else:
        root[maxInfo] = tree
    nextNode = tree

    # Iterate over a snapshot: the recursive calls overwrite entries of nextNode.
    for node, branch in list(nextNode.items()):
        if branch == '?':
            # The split column is constant within this branch, so it carries
            # no further information; dropping it guarantees termination.
            fvaldata = df[df[maxInfo] == node].drop(columns=[maxInfo])
            makeDTree(nextNode, node, fvaldata, labelname, labellist)

In [14]:
def id3DecisionTree(df1, labelname):
    """Build and return the decision tree for ``df1`` as a nested dict.

    ``labelname`` names the class-label column; its distinct values are used
    as the class list. The work is done on a copy of ``df1``.
    """
    working = df1.copy()
    root = {}
    classes = working[labelname].unique()
    # makeDTree fills `root` in place, starting with no parent branch.
    makeDTree(root, None, working, labelname, classes)
    return root

In [15]:
# Build the decision tree from the loaded table, using 'playtennis' as the class-label column.
dtree = id3DecisionTree(Data,'playtennis')
# The result is a nested dict: feature name -> feature value -> either a class
# label (leaf) or another {feature name: ...} dict for a further split.
print(dtree)


{'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}, 'overcast': 'yes', 'rain': {'wind': {'weak': 'yes', 'strong': 'no'}}}}
