# Import Library

In [1]:
import numpy
from scipy import stats as st
from scipy.stats import entropy
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plotLib
import pandas as pd
import pprint

# Preparing Dataset

## Load Dataset from xls

In [2]:
df = pd.read_excel("../sources/dataset-animal.xlsx")
df = df[['milk', 'aquatic', 'fins', 'breathes', 'legs', 'toothed', 'class_type']]
df.head()

Unnamed: 0,milk,aquatic,fins,breathes,legs,toothed,class_type
0,1,0,0,1,4,1,mammal
1,1,0,0,1,4,1,mammal
2,0,1,1,0,0,1,fish
3,1,0,0,1,4,1,mammal
4,1,0,0,1,4,1,mammal


## Gini Function

In [5]:
def Gini(p, q):
    result = 1 - ((p**2)+(q**2))
    return result

## Function to calculate Parent's entropy

In [6]:
def getParentEntropy(df, target):
    targetDataFrame = df[target].value_counts()
    totalInstance = df[target].value_counts().sum()
    #
    p = targetDataFrame[0]/totalInstance
    q = targetDataFrame[1]/totalInstance
    #
    baseEntropy = entropy([p,q], base=2)
    baseGini = Gini(p, q)
    #
    result = baseEntropy
    return result

## Function to Calculate Children's entropy

In [7]:
def getChildEntropy(df, target, attribute):
    totalInstance = df[target].value_counts().sum()
    #print("Attribute : ", attribute)
    attributeValues = df[attribute].unique()
    #
    sumEntropy = 0
    sumGini = 0
    for value in attributeValues:
        subDataFrame = df.loc[df[attribute]==value]
        sumDataFrame = subDataFrame[selectedTarget].value_counts().sum()
        #
        valueEntropy = 0
        valueGini = 0
        #
        # print("Value : ", value, " , Total : ", sumDataFrame , " of ", totalInstance , " Instances")
        #
        totalIndex = subDataFrame[selectedTarget].value_counts().count()
        if(totalIndex <1):
            p=0
            q=0
        else:
            p = subDataFrame[selectedTarget].value_counts()[0]/sumDataFrame
            if(totalIndex == 1):
                q = 0
            else:
                q = subDataFrame[selectedTarget].value_counts()[1]/sumDataFrame
            #
            valueEntropy = (sumDataFrame/totalInstance)*(entropy([p,q], base=2))
            valueGini = (sumDataFrame/totalInstance)*(Gini(p,q))
        #
        sumEntropy += valueEntropy
        sumGini += valueGini
    
#     print("\n Entropy[", attribute, "] = ", sumEntropy)
#     print("Gini[", attribute, "] = ", sumGini, "\n")
    result = sumEntropy
    return abs(result)

## Function to find best attribute

In [8]:
def findBestAttribute(df, target):
    gainList = []
    #
    parentEntropy = getParentEntropy(df, target)
    for attribute in df.keys()[:-1]:
        childEntropy = getChildEntropy(df, target, attribute)
        gain = parentEntropy - childEntropy
        gainList.append(gain)
        print(attribute, ": ", gain)
     
    print("max = ", max(gainList))
    print("====================================")
    result = df.keys()[numpy.argmax(gainList)]
    return result

## Function to Split Dataset

In [9]:
def getSubBranch(df, attribute, value):
    return df[df[attribute]==value].reset_index(drop=True)

## Function Recursive ID3

In [10]:
def ID3(df, selectedTarget, branch, maxdepth, tree=None):
#     find best attribute for node and get that values
    bestAttribute = findBestAttribute(df, selectedTarget)
    bestAttValues = df[bestAttribute].unique()
    
    # Create empty storage to our tree
    if tree is None:
        tree = {}
        tree[bestAttribute] = {}
        
    # Recursive
    for value in bestAttValues:
        nextBranch = getSubBranch(df, bestAttribute, value)
        
        # Check Impurity of Next Branch
        nextBranchValue, nextBranchCounts = numpy.unique(nextBranch[selectedTarget], return_counts=True)
        
        # See how many target in next branch
        if len(nextBranchCounts)==1:
            # If Next Branch is PURE (Impurity = 0%)
            tree[bestAttribute][value] = nextBranchValue[0]
        else:
            # If Next Branch is not pure
            if branch <= maxdepth:
                ctrBranch = branch +1
                tree[bestAttribute][value] = ID3(nextBranch, selectedTarget, ctrBranch, maxdepth)
            
    result = tree
    return result

# Let's Run Our ID3

In [13]:
selectedTarget = ['class_type']
myTree = ID3(df, selectedTarget, 0, 6)
pprint.pprint(myTree)

milk :  0.33810359405670387
aquatic :  0.03655977689675527
fins :  0.0027493043110222715
breathes :  -0.004435761296349261
legs :  0.24839696734258854
toothed :  0.0636400497383467
max =  0.33810359405670387
milk :  0.0
aquatic :  0.044592747297443114
fins :  0.24796304227087784
breathes :  0.07934258999716237
legs :  0.47746338709222075
toothed :  0.09490818164103232
max =  0.47746338709222075
milk :  0.0
aquatic :  0.265964692606945
fins :  0.4422967385892809
breathes :  0.1339189809589446
legs :  0.0
toothed :  0.23015677810115232
max =  0.4422967385892809
milk :  0.0
aquatic :  0.0059777114237737905
fins :  0.0
breathes :  0.020244207153756077
legs :  0.0
toothed :  0.9852281360342515
max =  0.9852281360342515
milk :  0.0
aquatic :  0.40263290913494487
fins :  0.0
breathes :  0.13118511915064135
legs :  0.0
toothed :  0.11691862342065917
max =  0.40263290913494487
milk :  0.0
aquatic :  0.0
fins :  0.0
breathes :  0.7219280948873623
legs :  0.0
toothed :  0.7219280948873623
max =  