In [3]:
import pandas as pd
import numpy as np

# Load the PlayTennis table that the decision tree below is trained on.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# will fail on any other machine until the path is adjusted.
Data = pd.read_csv(r"E:\MIT-6\ML\ML Algos Lab\Data\PlayTennis.csv")
# Print the whole frame so every training row can be checked by eye.
print(Data)

     Outlook Temperature Humidity    Wind Play Tennis
0      Sunny         Hot     High    Weak          No
1      Sunny         Hot     High  Strong          No
2   Overcast         Hot     High    Weak         Yes
3       Rain        Mild     High    Weak         Yes
4       Rain        Cool   Normal    Weak         Yes
5       Rain        Cool   Normal  Strong          No
6   Overcast        Cool   Normal  Strong         Yes
7      Sunny        Mild     High    Weak          No
8      Sunny        Cool   Normal    Weak         Yes
9       Rain        Mild   Normal    Weak         Yes
10     Sunny        Mild   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
13      Rain        Mild     High  Strong          No


In [4]:

#Calculating the entropy for the whole dataset (Based on no of yes's and no's)
def totalEntropy(df, labelname, labellist):
    """Return the base-2 Shannon entropy of the class column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows whose class distribution is measured.
    labelname : hashable
        Name of the column in ``df`` that holds the class label.
    labellist : iterable
        Class values to account for. Values that do not occur in ``df``
        contribute nothing to the result.

    Returns
    -------
    number
        Sum over the classes of ``-p * log2(p)``, where ``p`` is the fraction
        of rows of ``df`` carrying that class. ``0`` for an empty frame.
    """
    rows = df.shape[0]
    entropy = 0

    # An empty frame has no class distribution; avoid dividing by zero.
    if rows == 0:
        return entropy

    for cname in labellist:
        cnt = df[df[labelname] == cname].shape[0]
        if cnt == 0:
            # By convention 0 * log2(0) counts as 0. Evaluating it literally
            # yields nan, which would contaminate the whole sum.
            continue
        classProb = cnt / rows
        entropy += -classProb * np.log2(classProb)

    return entropy

In [5]:
#Calculating entropy for a specific feature
def specificEntropy(fval, labelname, labellist):
    """Return the base-2 entropy of the class column within the rows ``fval``.

    ``fval`` is a DataFrame (the caller passes the rows sharing one feature
    value), ``labelname`` is the class column and ``labellist`` the class
    values to consider. Classes with no rows in ``fval`` add nothing.
    """
    total = fval.shape[0]
    result = 0

    for label in labellist:
        matching = len(fval[fval[labelname] == label])
        if matching == 0:
            # Absent class: its term is taken as zero.
            continue
        share = matching / total
        result += -share * np.log2(share)

    return result


In [6]:
#Calculating split info gain ratio for each feature
def splitInfoGainRatio(fname, df,labelname, labellist):
    """Return the gain ratio obtained by splitting ``df`` on feature ``fname``.

    For every distinct value ``v`` of the feature, with ``p_v`` the fraction
    of rows having that value:

    * the conditional entropy accumulates ``p_v * specificEntropy(rows_v)``
    * the split information accumulates ``-p_v * log2(p_v)``

    The result is ``(totalEntropy(df) - conditional entropy) / split info``.
    When the split information is 0 (the feature takes a single value in
    ``df``) the function returns 0 instead of dividing by zero.
    """
    total_rows = df.shape[0]
    column = df[fname]
    conditional_entropy = 0.0
    split_information = 0.0

    for value in column.unique():
        subset = df[column == value]  # rows carrying this feature value
        weight = subset.shape[0] / total_rows
        conditional_entropy += weight * specificEntropy(subset, labelname, labellist)
        split_information -= weight * np.log2(weight)

    if split_information != 0:
        gain = totalEntropy(df, labelname, labellist) - conditional_entropy
        return gain / split_information

    # Single-valued feature: the ratio is undefined, report 0.
    return 0



In [7]:

#Finding the most informative feature using split info gain ratio
def findMaxGain(df, labelname, labellist):
    """Return the feature column of ``df`` with the highest gain ratio.

    Every column except ``labelname`` is scored with ``splitInfoGainRatio``.
    On ties the column that appears first wins. ``None`` is returned when no
    column scores above the initial threshold of -1 (for instance when there
    are no feature columns at all).
    """
    best_feature, best_score = None, -1

    for candidate in df.columns.drop(labelname):
        score = splitInfoGainRatio(candidate, df, labelname, labellist)
        # Strict comparison keeps the earliest column when scores are equal.
        if score > best_score:
            best_feature, best_score = candidate, score

    return best_feature

In [8]:
#Selecting the next most informative node and adding it as a branch to the dtree
def makeSubtree(fname, df, labelname, labellist):
    """Build the one-level branch map for feature ``fname``.

    Parameters
    ----------
    fname : hashable
        Feature column to branch on.
    df : pandas.DataFrame
        Rows available at this node.
    labelname : hashable
        Name of the class column.
    labellist : iterable
        Class values to test each branch against.

    Returns
    -------
    (dict, pandas.DataFrame)
        The dict maps every value of ``fname`` to a class label when all rows
        with that value share the label, or to the placeholder ``"?"`` when
        the rows are mixed. The frame is ``df`` without the rows of the
        branches that were resolved to a label.
    """
    fvalcnt = df[fname].value_counts(sort=False)
    tree = {}

    # Series.items() is the supported spelling; Series.iteritems() was
    # removed in pandas 2.0.
    for fval, cnt in fvalcnt.items():
        fvalData = df[df[fname] == fval]
        tree[fval] = "?"  # stays a placeholder unless one class owns every row

        for cname in labellist:
            classcnt = fvalData[fvalData[labelname] == cname].shape[0]
            if classcnt == cnt:
                tree[fval] = cname
                # Resolved branch: its rows need no further splitting.
                df = df[df[fname] != fval]
                break

    return tree, df


In [9]:
def makeDTree(root, prev, df, labelname, labellist):
    """Recursively grow the decision tree inside the dict ``root``.

    Parameters
    ----------
    root : dict
        Dict to attach the new node to; it is modified in place.
    prev : hashable or None
        Key in ``root`` (a feature value of the parent node) whose ``"?"``
        placeholder is replaced by the new node. ``None`` for the top call,
        in which case the node is stored under the chosen feature name.
    df : pandas.DataFrame
        Rows that reach this node. Nothing happens when it is empty.
    labelname : hashable
        Name of the class column.
    labellist : iterable
        Class values, passed through to the helper functions.

    Returns
    -------
    None
    """
    if df.shape[0] == 0:
        return

    maxInfo = findMaxGain(df, labelname, labellist)
    if maxInfo is None:
        # No feature left to split on while the rows are still mixed:
        # close the branch with the most frequent class rather than failing.
        if prev is not None:
            root[prev] = df[labelname].mode().iloc[0]
        return

    tree, df = makeSubtree(maxInfo, df, labelname, labellist)

    if prev is not None:
        root[prev] = {maxInfo: tree}
    else:
        root[maxInfo] = tree
    nextNode = tree

    # Iterate over a snapshot because the recursive calls replace the "?"
    # entries of nextNode while we are looping.
    for node, branch in list(nextNode.items()):
        if branch == '?':
            # Remove the feature just used. Inside this subset it is constant,
            # and leaving it in lets it be selected again when no other
            # feature scores above 0, which recurses on the same rows forever.
            fvaldata = df[df[maxInfo] == node].drop(columns=[maxInfo])
            makeDTree(nextNode, node, fvaldata, labelname, labellist)

In [10]:
def id3DecisionTree(df1, labelname):
    """Build and return the decision tree for ``df1`` as a nested dict.

    ``labelname`` names the class column. The input frame is copied first, so
    the caller's frame is not the object handed to the tree builder.
    """
    working = df1.copy()
    classes = working[labelname].unique()  # distinct class values in the data
    root = {}
    makeDTree(root, None, working, labelname, classes)
    return root

In [11]:
# Build the tree from the full table, using 'Play Tennis' as the class column.
dtree = id3DecisionTree(Data,'Play Tennis')
# The tree is a nested dict: feature name -> feature value -> subtree or class label.
print(dtree)


{'Outlook': {'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}, 'Sunny': {'Humidity': {'Normal': 'Yes', 'High': 'No'}}, 'Overcast': 'Yes'}}
