# Import Library

In [1]:
import numpy
from scipy.stats import entropy
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plotLib
import pandas as pd
import pprint

# Preparing Dataset

## Load Dataset from xlsx

In [2]:
# Read the spreadsheet into a DataFrame and preview its first five rows.
df = pd.read_excel(io="../sources/dataset-animal.xlsx")
df.head(n=5)

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,mammal
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,fish
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,mammal
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,mammal


## Get Input Attributes

In [None]:
# Columns treated as input attributes.
# NOTE(review): the df.head() preview earlier in this notebook lists
# animal-dataset columns (hair, feathers, ..., class_type), none of which
# match these names -- confirm which dataset is intended before running.
selectedInput = ['OUTLOOK', 'TEMPERATURE', 'HUMIDITY', 'WIND']
# Keep only those columns.
inputAttributes = df.loc[:, selectedInput]

## Get Input Attributes' values

In [None]:
# Distinct values found in each input column.
outlookValues = inputAttributes["OUTLOOK"].unique()
temperatureValues = inputAttributes["TEMPERATURE"].unique()
humidityValues = inputAttributes["HUMIDITY"].unique()
windValues = inputAttributes["WIND"].unique()

## Get target attributes

In [None]:
# Column(s) treated as the prediction target, kept as a one-column DataFrame.
selectedTarget = ['PLAY']
targetAttribute = df.loc[:, selectedTarget]

## Get target attributes' values

In [None]:
# Distinct values found in the target column.
targetValues = targetAttribute["PLAY"].unique()

## Count Instances and Target Distribution

In [None]:
# Per-class counts of the target column, and their total.
targetDataFrame = df['PLAY'].value_counts()
totalInstance = targetDataFrame.sum()

# Preparing attributes impurity

## Get OUTLOOK and its values

In [None]:
# Rows of df for each OUTLOOK value, in the order overcast, rain, sunny.
getOvercast, getRain, getSunny = (
    df.loc[df["OUTLOOK"] == outlook]
    for outlook in ("overcast", "rain", "sunny")
)

In [None]:
# Target class counts within each OUTLOOK subset.
targetOvercast = getOvercast["PLAY"].value_counts()
targetRain = getRain["PLAY"].value_counts()
targetSunny = getSunny["PLAY"].value_counts()

In [None]:
# Number of counted instances in each OUTLOOK subset.
getOvercastSum, getRainSum, getSunnySum = (
    counts.sum() for counts in (targetOvercast, targetRain, targetSunny)
)

## Get TEMPERATURE and its values

In [None]:
# Rows of df for each TEMPERATURE value.
getHot = df[df['TEMPERATURE'] == "hot"]
getMild = df[df['TEMPERATURE'] == "mild"]
getCool = df[df['TEMPERATURE'] == "cool"]

In [None]:
# Target class counts within each TEMPERATURE subset.
targetHot, targetMild, targetCool = (
    subset["PLAY"].value_counts() for subset in (getHot, getMild, getCool)
)

In [None]:
# Number of counted instances in each TEMPERATURE subset.
getHotSum, getMildSum, getCoolSum = (
    counts.sum() for counts in (targetHot, targetMild, targetCool)
)

## Get HUMIDITY and its values

In [None]:
# Rows of df for each HUMIDITY value, in the order normal, high.
getNormal, getHigh = (
    df.loc[df['HUMIDITY'] == humidity] for humidity in ("normal", "high")
)

In [None]:
# Target class counts within each HUMIDITY subset.
targetNormal = getNormal["PLAY"].value_counts()
targetHigh = getHigh["PLAY"].value_counts()

In [None]:
# Number of counted instances in each HUMIDITY subset.
getNormalSum, getHighSum = (
    counts.sum() for counts in (targetNormal, targetHigh)
)

## Get WIND and its values

In [None]:
# Rows of df for each WIND value.
getWeak = df[df['WIND'] == "weak"]
getStrong = df[df['WIND'] == "strong"]

In [None]:
# Target class counts within each WIND subset.
targetWeak, targetStrong = (
    subset["PLAY"].value_counts() for subset in (getWeak, getStrong)
)

In [None]:
# Number of counted instances in each WIND subset.
getWeakSum, getStrongSum = (
    counts.sum() for counts in (targetWeak, targetStrong)
)

## Gini Function

In [None]:
def Gini(p, q):
    """Return the two-class Gini impurity ``1 - (p**2 + q**2)``.

    p, q: the proportions of the two classes.
    """
    squaredSum = (p ** 2) + (q ** 2)
    return 1 - squaredSum

## Function to calculate Parent's entropy

In [None]:
def getParentEntropy(df, target):
    """Return the base-2 Shannon entropy of the target class distribution.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the target column.
    target : str or list of str
        Label of the target column, or a list of labels (as passed by ID3).

    Returns
    -------
    float
        Entropy in bits over however many classes are present; 0.0 when
        there are no counted target values at all.
    """
    # Work on the raw counts instead of indexing the value_counts() result by
    # position: this handles one, two or many classes, and avoids integer
    # lookups on a label-indexed Series.
    classCounts = df[target].value_counts().to_numpy()
    if classCounts.size == 0:
        return 0.0
    # scipy normalises the counts to probabilities before computing entropy.
    return entropy(classCounts, base=2)

## Function to Calculate Children's entropy

In [None]:
def getChildEntropy(df, target, attribute):
    """Return the weighted base-2 entropy of ``target`` after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy of the target class
    distribution in the matching rows is computed and weighted by that
    subset's share of all counted instances.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing both the attribute and the target column.
    target : str or list of str
        Label (or list of labels) of the target column.
    attribute : str
        Label of the column to split on.

    Returns
    -------
    float
        Sum of the weighted per-value entropies, in bits.
    """
    totalInstance = df[target].value_counts().sum()

    sumEntropy = 0.0
    for value in df[attribute].unique():
        # Use the `target` argument (not a module-level name) so the function
        # works for whatever target the caller passes.
        classCounts = df.loc[df[attribute] == value, target].value_counts().to_numpy()
        subsetSize = classCounts.sum()
        if subsetSize == 0:
            # No counted rows for this value (for example a NaN attribute
            # value never compares equal to itself): contributes nothing.
            continue
        # entropy() normalises the counts, so any number of classes works.
        sumEntropy += (subsetSize / totalInstance) * entropy(classCounts, base=2)

    # abs() is kept from the original; the sum is mathematically non-negative.
    return abs(sumEntropy)

## Function to find best attribute

In [None]:
def findBestAttribute(df, target):
    """Return the column of ``df`` with the highest information gain.

    Gain is the parent entropy minus the weighted child entropy of a split
    on the column. Ties go to the first such column in ``df``'s column order.
    The best gain is also printed as ``max =  <gain>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set containing the candidate columns and the target column.
    target : str or list of str
        Label (or list of labels) of the target column.

    Returns
    -------
    Column label of the attribute with the largest gain.

    Raises
    ------
    ValueError
        If ``df`` has no column other than the target.
    """
    # Exclude the target by name rather than assuming it is the last column.
    targetColumns = [target] if isinstance(target, str) else list(target)
    candidates = [column for column in df.columns if column not in targetColumns]

    parentEntropy = getParentEntropy(df, target)
    gainList = [
        parentEntropy - getChildEntropy(df, target, attribute)
        for attribute in candidates
    ]

    print("max = ", max(gainList))
    return candidates[numpy.argmax(gainList)]

## Function to Split Dataset

In [None]:
def getSubBranch(df, attribute, value):
    """Return the rows of ``df`` where ``attribute`` equals ``value``.

    The result is re-indexed from 0; the old index is discarded.
    """
    matches = df[attribute] == value
    subset = df[matches]
    return subset.reset_index(drop=True)

## Recursive ID3 Function

In [None]:
def ID3(df, selectedTarget, branch, maxdepth, tree=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    The tree has the shape ``{attribute: {value: subtree_or_class, ...}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data for the current node, including the target column.
    selectedTarget : str or list of str
        Label (or list of labels) of the target column.
    branch : int
        Depth counter of the current node; the top-level call passes 0.
    maxdepth : int
        An impure branch is split further only while ``branch <= maxdepth``.
    tree : dict, optional
        Existing dictionary to add this node to; a new one is created when
        omitted.

    Returns
    -------
    dict
        The tree rooted at the best attribute for ``df``.
    """
    # Find the best attribute for this node.
    bestAttribute = findBestAttribute(df, selectedTarget)

    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied tree without this key yet.
    subtree = tree.setdefault(bestAttribute, {})

    for value in df[bestAttribute].unique():
        nextBranch = getSubBranch(df, bestAttribute, value)

        # Class labels present in the branch and how often each occurs.
        nextBranchValue, nextBranchCounts = numpy.unique(
            nextBranch[selectedTarget], return_counts=True
        )

        if len(nextBranchCounts) == 0:
            # No rows matched (for example a NaN attribute value): nothing
            # to learn from, so no leaf is created.
            continue

        if len(nextBranchCounts) == 1:
            # Pure branch: store the single class as a leaf.
            subtree[value] = nextBranchValue[0]
        elif branch <= maxdepth:
            # Impure branch within the depth limit: split it further.
            subtree[value] = ID3(nextBranch, selectedTarget, branch + 1, maxdepth)
        else:
            # Depth limit reached: use the majority class as the leaf, so
            # the value is not left out of the tree.
            subtree[value] = nextBranchValue[numpy.argmax(nextBranchCounts)]

    return tree

# Let's Run Our ID3

In [None]:
# Build the tree for the PLAY target, starting at depth 0 with a depth limit
# of 6, then pretty-print the nested dictionary.
selectedTarget = ['PLAY']
myTree = ID3(df, selectedTarget, branch=0, maxdepth=6)
pprint.pprint(myTree)