# Import Library

In [2]:
import numpy
from scipy.stats import entropy
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plotLib
import pandas as pd
import pprint

# Preparing Dataset

## Load Dataset from an Excel (.xlsx) file

In [3]:
# Read the sunburn sheet and discard the 'Name' column, then display the frame.
df = pd.read_excel("sources/sunburn.xlsx").drop(columns=['Name'])
df

Unnamed: 0,Hair,Height,Weight,Lotion,Result
0,blonde,average,light,no,sunburned
1,blonde,tall,average,yes,none
2,brown,short,average,yes,none
3,blonde,short,average,no,sunburned
4,red,average,heavy,no,sunburned
5,brown,tall,heavy,no,none
6,brown,average,heavy,no,none
7,blonde,short,light,yes,none


## Get Input Attributes

In [4]:
# Predictor column names, and the slice of df restricted to those columns.
selectedInput = ['Hair', 'Height', 'Weight', 'Lotion']
inputAttributes = df.loc[:, selectedInput]

## Get Input Attributes' values

In [5]:
# Distinct values of each predictor column, in order of first appearance.
hairValues = inputAttributes['Hair'].unique()
heightValues = inputAttributes['Height'].unique()
weightValues = inputAttributes['Weight'].unique()
lotionValues = inputAttributes['Lotion'].unique()

# One array per output line.
print(hairValues, heightValues, weightValues, lotionValues, sep="\n")

['blonde' 'brown' 'red']
['average' 'tall' 'short']
['light' 'average' 'heavy']
['no' 'yes']


## Get target attributes

In [6]:
# Target column name (kept as a one-element list) and the matching one-column frame.
selectedTarget = ['Result']
targetAttribute = df.loc[:, selectedTarget]

## Get target attributes' values

In [7]:
# Distinct class labels found in the Result column.
targetValues = targetAttribute['Result'].unique()

## Count Instances and Target Distribution

In [8]:
# Per-class row counts of Result, and their total.
targetDataFrame = df['Result'].value_counts()
totalInstance = targetDataFrame.sum()

# Preparing attributes impurity

## Get Hair and its values

In [9]:
# Rows of df for each hair colour.
getBlonde, getBrown, getRed = (
    df.loc[df["Hair"] == hairColour] for hairColour in ("blonde", "brown", "red")
)

In [10]:
# Result class counts within each hair-colour subset.
targetBlonde, targetBrown, targetRed = (
    subset['Result'].value_counts() for subset in (getBlonde, getBrown, getRed)
)

In [11]:
# Number of rows counted in each hair-colour subset.
getBlondeSum, getBrownSum, getRedSum = (
    classCounts.sum() for classCounts in (targetBlonde, targetBrown, targetRed)
)

## Get Height and its values

In [12]:
# Rows of df for each height category.
getAverage, getTall, getShort = (
    df.loc[df['Height'] == heightLabel] for heightLabel in ("average", "tall", "short")
)

In [13]:
# Result class counts within each height subset.
targetAverage, targetTall, targetShort = (
    subset['Result'].value_counts() for subset in (getAverage, getTall, getShort)
)

In [14]:
# Number of rows counted in each height subset.
getAverageSum, getTallSum, getShortSum = (
    classCounts.sum() for classCounts in (targetAverage, targetTall, targetShort)
)

## Get Weight and its values

In [15]:
# Rows of df for each weight category.
# NOTE(review): `getAverage` is also assigned in the Height cell; this assignment
# rebinds it to the Weight == "average" rows, so the Height subset is no longer
# reachable under that name.
getLight, getAverage, getHeavy = (
    df.loc[df['Weight'] == weightLabel] for weightLabel in ("light", "average", "heavy")
)

In [16]:
# Result class counts within each weight subset.
# NOTE(review): `targetAverage` is also assigned in the Height cell and is rebound here.
targetLight, targetAverage, targetHeavy = (
    subset['Result'].value_counts() for subset in (getLight, getAverage, getHeavy)
)

In [17]:
# Number of rows counted in each weight subset.
# NOTE(review): `getAverageSum` is also assigned in the Height cell and is rebound here.
getLightSum, getAverageSum, getHeavySum = (
    classCounts.sum() for classCounts in (targetLight, targetAverage, targetHeavy)
)

## Get Lotion and its values

In [18]:
# Rows of df split by whether lotion was used.
getNo, getYes = (df.loc[df['Lotion'] == lotionUse] for lotionUse in ("no", "yes"))

In [19]:
# Result class counts within each lotion subset.
targetNo, targetYes = (subset['Result'].value_counts() for subset in (getNo, getYes))

In [20]:
# Number of rows counted in each lotion subset.
getNoSum, getYesSum = (classCounts.sum() for classCounts in (targetNo, targetYes))

# Let's Loop it

In [21]:
# Display the per-class counts of the Result column computed earlier.
targetDataFrame

none         5
sunburned    3
Name: Result, dtype: int64

## Gini Function

In [22]:
def Gini(p, q):
    """Return the two-class Gini impurity ``1 - (p**2 + q**2)``.

    Parameters
    ----------
    p, q : numbers
        The two class proportions.
    """
    sumOfSquares = (p**2) + (q**2)
    return 1 - sumOfSquares

## Calculate DF's base entropy and gini

In [23]:
# Class proportions over the whole data set. targetDataFrame is labelled by the
# class names, so the two entries are selected by position with .iloc; an
# integer key passed to [] on a label-indexed Series is deprecated in pandas.
p = targetDataFrame.iloc[0]/totalInstance
q = targetDataFrame.iloc[1]/totalInstance

# Impurity of the unsplit data set.
baseEntropy = entropy([p,q], base=2)
baseGini = Gini(p, q)

print("Base Entropy:", baseEntropy)
print("Base Gini:", baseGini)

Base Entropy: 0.954434002924965
Base Gini: 0.46875


## Loop each attribute and loop each value

In [24]:
# For every input attribute, print the class counts of each of its values and
# accumulate the size-weighted entropy and Gini impurity of the resulting split.
for attribute in selectedInput:
    print("Attribute: ", attribute)
    attributeValues = df[attribute].unique()
    #
    sumEntropy = 0
    sumGini = 0
    for value in attributeValues:
        subDataFrame = df.loc[df[attribute]==value]
        # Count the classes once and reuse the result below.
        valueCounts = subDataFrame[selectedTarget].value_counts()
        sumDataFrame = valueCounts.sum()
        #
        print("Value: ", value, " , Total: ", sumDataFrame, " of ", totalInstance, " Instances")
        print(valueCounts.to_frame())
        #
        # Number of distinct classes present in this subset.
        totalIndex = valueCounts.count()
        # .iloc selects by position; the counts are labelled by class name.
        p = valueCounts.iloc[0]/sumDataFrame
        if(totalIndex == 1):
            # Pure subset: there is no second class.
            q = 0
        else:
            q = valueCounts.iloc[1]/sumDataFrame
        #
        # Weight each subset's impurity by its share of all instances.
        valueEntropy = (sumDataFrame/totalInstance)*(entropy([p,q], base=2))
        sumEntropy += valueEntropy
        sumGini += (sumDataFrame/totalInstance)*(Gini(p, q))

Attribute:  Hair
Value:  blonde  , Total:  4  of  8  Instances
           0
Result      
sunburned  2
none       2
Value:  brown  , Total:  3  of  8  Instances
        0
Result   
none    3
Value:  red  , Total:  1  of  8  Instances
           0
Result      
sunburned  1
Attribute:  Height
Value:  average  , Total:  3  of  8  Instances
           0
Result      
sunburned  2
none       1
Value:  tall  , Total:  2  of  8  Instances
        0
Result   
none    2
Value:  short  , Total:  3  of  8  Instances
           0
Result      
none       2
sunburned  1
Attribute:  Weight
Value:  light  , Total:  2  of  8  Instances
           0
Result      
sunburned  1
none       1
Value:  average  , Total:  3  of  8  Instances
           0
Result      
none       2
sunburned  1
Value:  heavy  , Total:  3  of  8  Instances
           0
Result      
none       2
sunburned  1
Attribute:  Lotion
Value:  no  , Total:  5  of  8  Instances
           0
Result      
sunburned  3
none       2
Value:  yes  ,

# Let's make it recursive!

## Function to calculate Parent's entropy

In [25]:
def getParentEntropy(df, target):
    """Return the base-2 Shannon entropy of the target class distribution in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) for the current node.
    target : str or list of str
        Target column name, or a list of column names.

    Returns
    -------
    float
        Entropy in bits. 0.0 when ``df`` has no countable target values.

    Notes
    -----
    The class counts are passed to ``scipy.stats.entropy`` directly, which
    normalises them to probabilities. This works for any number of classes,
    including a pure node with a single class, and avoids indexing the counts
    with integer keys.
    """
    classCounts = df[target].value_counts().to_numpy()
    if classCounts.sum() == 0:
        # Nothing to measure (empty frame or all-missing target).
        return 0.0
    return entropy(classCounts, base=2)

## Function to Calculate Children's entropy

In [26]:
def getChildEntropy(df, target, attribute):
    """Return the size-weighted base-2 entropy of ``target`` after splitting on ``attribute``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) for the current node.
    target : str or list of str
        Target column name, or a list of column names.
    attribute : str
        Column whose distinct values define the partitions.

    Returns
    -------
    float
        Sum over the attribute's values of ``(subset size / total size) * entropy(subset)``.
        0.0 when ``df`` has no countable target values.

    Notes
    -----
    The ``target`` argument is used throughout; no module-level variable is read.
    Class counts are passed to ``scipy.stats.entropy`` directly, so any number
    of classes is supported.
    """
    totalInstance = df[target].value_counts().sum()
    if totalInstance == 0:
        return 0.0
    #
    sumEntropy = 0
    for value in df[attribute].unique():
        subDataFrame = df.loc[df[attribute]==value]
        classCounts = subDataFrame[target].value_counts().to_numpy()
        sumDataFrame = classCounts.sum()
        if sumDataFrame == 0:
            # No matching rows (e.g. a missing attribute value): contributes nothing.
            continue
        # Weight the subset's entropy by its share of the node's instances.
        sumEntropy += (sumDataFrame/totalInstance)*(entropy(classCounts, base=2))
    # abs() normalises a possible negative zero coming from pure subsets.
    return abs(sumEntropy)

## Function to find best attribute

In [27]:
def findBestAttribute(df, target):
    """Return the name of the column with the highest information gain.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) for the current node.
    target : str or list of str
        Target column name, or a list of column names.

    Returns
    -------
    The label of the non-target column that maximises
    ``getParentEntropy(df, target) - getChildEntropy(df, target, column)``.
    Ties go to the column that appears first in ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no column other than the target.
    """
    # Exclude the target by name, so it need not be the last column.
    targetColumns = list(target) if isinstance(target, (list, tuple, set)) else [target]
    candidates = [column for column in df.columns if column not in targetColumns]
    if not candidates:
        raise ValueError("no candidate attributes besides the target column(s)")
    #
    parentEntropy = getParentEntropy(df, target)
    gainList = [
        parentEntropy - getChildEntropy(df, target, attribute)
        for attribute in candidates
    ]
    return candidates[numpy.argmax(gainList)]

## Function to Split Dataset

In [28]:
def getSubBranch(df, attribute, value):
    """Return the rows of ``df`` where ``attribute`` equals ``value``, re-indexed from 0."""
    matchesValue = df[attribute] == value
    branch = df.loc[matchesValue]
    return branch.reset_index(drop=True)

## Recursive ID3 function

In [29]:
def ID3(df, tree=None, target=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) for the current node.
    tree : dict, optional
        Existing dictionary to fill in; a new one is created when omitted.
    target : str or list of str, optional
        Target column name(s). Defaults to ``['Result']``.

    Returns
    -------
    dict
        ``{attribute: {value: leaf_label_or_subtree, ...}}`` where the attribute
        is the one chosen by ``findBestAttribute`` for this node.
    """
    selectedTarget = ['Result'] if target is None else target
    bestAttribute = findBestAttribute(df, selectedTarget)
    bestAttValues = df[bestAttribute].unique()
    if tree is None:
        tree = {}
    # Also covers a caller-supplied tree that lacks this attribute key.
    tree.setdefault(bestAttribute, {})
    for value in bestAttValues:
        nextBranch = getSubBranch(df, bestAttribute, value)
        nextBranchValue, nextBranchCounts = numpy.unique(nextBranch[selectedTarget], return_counts=True)
        if len(nextBranchCounts)==1:
            # Pure branch: store the single class label as a leaf.
            tree[bestAttribute][value] = nextBranchValue[0]
        elif len(nextBranch) == len(df):
            # The split removed no rows, so recursing would repeat this exact
            # call forever. Fall back to a majority-class leaf.
            tree[bestAttribute][value] = nextBranchValue[numpy.argmax(nextBranchCounts)]
        else:
            tree[bestAttribute][value] = ID3(nextBranch, target=selectedTarget)

    return tree

# Let's Run Our ID3

In [30]:
# Build the tree from the full data set and pretty-print the nested dictionary.
myTree = ID3(df)
print(pprint.pformat(myTree))

{'Hair': {'blonde': {'Lotion': {'no': 'sunburned', 'yes': 'none'}},
          'brown': 'none',
          'red': 'sunburned'}}
