# Import Libraries

In [2]:
import numpy
from scipy.stats import entropy
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plotLib
import pandas as pd
import pprint

# Preparing Dataset

## Load Dataset from xlsx

In [3]:
# Read the data set from the Excel workbook; the path is relative to the
# directory the notebook kernel runs in.
# NOTE(review): the file name and the saved output below describe a "wellbeing"
# data set (EXPENSE/SALARY/RELATIONSHIP/WORKTIME/WELLBEING), while the following
# cells index OUTLOOK/TEMPERATURE/HUMIDITY/WIND/PLAY — confirm which workbook is intended.
df = pd.read_excel("../sources/dataset-wellbeing.xlsx")
df

Unnamed: 0,EXPENSE,SALARY,RELATIONSHIP,WORKTIME,WELLBEING
0,low,high,married,normal,yes
1,low,high,married,overtime,yes
2,moderate,high,married,overtime,yes
3,high,high,married,overtime,yes
4,high,low,single,overtime,yes
5,moderate,low,single,overtime,no
6,low,moderate,married,overtime,yes
7,low,low,single,overtime,no
8,high,moderate,single,overtime,no
9,low,moderate,single,overtime,no


## Get Input Attributes

In [11]:
# Labels of the predictor columns, and the slice of ``df`` holding just those columns.
selectedInput = ['OUTLOOK', 'TEMPERATURE', 'HUMIDITY', 'WIND']
inputAttributes = df.loc[:, selectedInput]

## Get Input Attributes' values

In [12]:
# Distinct values of each predictor column, in order of first appearance.
outlookValues, temperatureValues, humidityValues, windValues = (
    inputAttributes[column].unique()
    for column in ("OUTLOOK", "TEMPERATURE", "HUMIDITY", "WIND")
)

## Get target attributes

In [13]:
# Label of the class column (kept as a one-element list) and the matching one-column frame.
selectedTarget = ['PLAY']
targetAttribute = df.loc[:, selectedTarget]

## Get target attributes' values

In [14]:
# Distinct class labels found in the PLAY column.
targetValues = targetAttribute["PLAY"].unique()

## Count Instances and Target Distribution

In [15]:
# Rows per class label, and the overall number of counted rows.
targetDataFrame = df['PLAY'].value_counts()
totalInstance = targetDataFrame.sum()

# Preparing attributes impurity

## Get OUTLOOK and its values

In [8]:
# One sub-frame per OUTLOOK value.
getOvercast, getRain, getSunny = (
    df.loc[df["OUTLOOK"] == outlook] for outlook in ("overcast", "rain", "sunny")
)

In [10]:
# PLAY label counts inside each OUTLOOK sub-frame.
targetOvercast, targetRain, targetSunny = (
    subset.PLAY.value_counts() for subset in (getOvercast, getRain, getSunny)
)

In [17]:
# Number of rows behind each OUTLOOK value.
getOvercastSum, getRainSum, getSunnySum = (
    counts.sum() for counts in (targetOvercast, targetRain, targetSunny)
)

## Get TEMPERATURE and its values

In [18]:
# One sub-frame per TEMPERATURE value.
getHot, getMild, getCool = (
    df.loc[df['TEMPERATURE'] == temperature] for temperature in ("hot", "mild", "cool")
)

In [19]:
# PLAY label counts inside each TEMPERATURE sub-frame.
targetHot, targetMild, targetCool = (
    subset.PLAY.value_counts() for subset in (getHot, getMild, getCool)
)

In [20]:
# Number of rows behind each TEMPERATURE value.
getHotSum, getMildSum, getCoolSum = (
    counts.sum() for counts in (targetHot, targetMild, targetCool)
)

## Get HUMIDITY and its values

In [21]:
# One sub-frame per HUMIDITY value.
getNormal, getHigh = (
    df.loc[df['HUMIDITY'] == humidity] for humidity in ("normal", "high")
)

In [22]:
# PLAY label counts inside each HUMIDITY sub-frame.
targetNormal, targetHigh = (
    subset.PLAY.value_counts() for subset in (getNormal, getHigh)
)

In [23]:
# Number of rows behind each HUMIDITY value.
getNormalSum, getHighSum = (
    counts.sum() for counts in (targetNormal, targetHigh)
)

## Get WIND and its values

In [24]:
# One sub-frame per WIND value.
getWeak, getStrong = (
    df.loc[df['WIND'] == wind] for wind in ("weak", "strong")
)

In [25]:
# PLAY label counts inside each WIND sub-frame.
targetWeak, targetStrong = (
    subset.PLAY.value_counts() for subset in (getWeak, getStrong)
)

In [26]:
# Number of rows behind each WIND value.
getWeakSum, getStrongSum = (
    counts.sum() for counts in (targetWeak, targetStrong)
)

# Let's Loop it

In [27]:
# Show the per-class row counts of the PLAY column computed earlier.
targetDataFrame

yes    9
no     5
Name: PLAY, dtype: int64

## Gini Function

In [28]:
def Gini(p, q):
    """Return the Gini impurity ``1 - (p**2 + q**2)`` of a two-class split.

    ``p`` and ``q`` are the proportions of the two classes.
    """
    squaredShares = (p**2) + (q**2)
    return 1 - squaredShares

## Calculate DF's base entropy and gini

In [29]:
# Proportions of the two classes over the whole data set.
# ``.iloc`` selects by position explicitly: the counts are labelled by class
# name, and integer keys on such a Series are only a deprecated positional
# fallback in recent pandas releases.
p = targetDataFrame.iloc[0]/totalInstance
q = targetDataFrame.iloc[1]/totalInstance

baseEntropy = entropy([p,q], base=2)
baseGini = Gini(p, q)

print("Base Entropy:", baseEntropy)
print("Base Gini:", baseGini)

Base Entropy: 0.940285958670631
Base Gini: 0.4591836734693877


## Loop each attribute and loop each value

In [30]:
for attribute in selectedInput:
    print("Attribute: ", attribute)
    attributeValues = df[attribute].unique()
    # Running totals of the weighted impurities over this attribute's values.
    sumEntropy = 0
    sumGini = 0
    for value in attributeValues:
        subDataFrame = df.loc[df[attribute]==value]
        # Count the target classes once and reuse the result below.
        valueTargetCounts = subDataFrame[selectedTarget].value_counts()
        sumDataFrame = valueTargetCounts.sum()
        #
        print("Value: ", value, " , Total: ", sumDataFrame, " of ", totalInstance, " Instances")
        print(valueTargetCounts.to_frame())
        #
        totalIndex = valueTargetCounts.count()
        # ``.iloc`` makes the positional look-up explicit; the counts are
        # labelled by class, not by integer position.
        p = valueTargetCounts.iloc[0]/sumDataFrame
        if(totalIndex == 1):
            # Only one class present for this value: the second share is zero.
            q = 0
        else:
            q = valueTargetCounts.iloc[1]/sumDataFrame
        #
        # Weight each value's impurity by its share of all instances.
        valueEntropy = (sumDataFrame/totalInstance)*(entropy([p,q], base=2))
        sumEntropy += valueEntropy
        # Accumulate the Gini total too, so ``sumGini`` does not stay at zero.
        sumGini += (sumDataFrame/totalInstance)*(Gini(p, q))

Attribute:  OUTLOOK
Value:  sunny  , Total:  5  of  14  Instances
      0
PLAY   
no    3
yes   2
Value:  overcast  , Total:  4  of  14  Instances
      0
PLAY   
yes   4
Value:  rain  , Total:  5  of  14  Instances
      0
PLAY   
yes   3
no    2
Attribute:  TEMPERATURE
Value:  hot  , Total:  4  of  14  Instances
      0
PLAY   
yes   2
no    2
Value:  mild  , Total:  6  of  14  Instances
      0
PLAY   
yes   4
no    2
Value:  cool  , Total:  4  of  14  Instances
      0
PLAY   
yes   3
no    1
Attribute:  HUMIDITY
Value:  high  , Total:  7  of  14  Instances
      0
PLAY   
no    4
yes   3
Value:  normal  , Total:  7  of  14  Instances
      0
PLAY   
yes   6
no    1
Attribute:  WIND
Value:  weak  , Total:  8  of  14  Instances
      0
PLAY   
yes   6
no    2
Value:  strong  , Total:  6  of  14  Instances
      0
PLAY   
yes   3
no    3


# Let's make it recursive!

## Function to calculate Parent's entropy

In [31]:
def getParentEntropy(df, target):
    """Return the base-2 Shannon entropy of the target column(s) of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) whose class distribution is measured.
    target : str or list of str
        Label of the target column, or a list of labels such as ``['PLAY']``.

    Returns
    -------
    float
        Entropy in bits. ``0.0`` when ``df`` holds no target values or only
        a single class.
    """
    classCounts = df[target].value_counts().to_numpy()
    # Zero or one class means no uncertainty. Returning early also covers the
    # pure-node case, where there is no second class count to look up.
    if len(classCounts) <= 1:
        return 0.0
    # ``entropy`` normalises the counts to probabilities itself, so any number
    # of classes is handled, not just two.
    return float(entropy(classCounts, base=2))

## Function to Calculate Children's entropy

In [15]:
def getChildEntropy(df, target, attribute):
    """Return the weighted base-2 entropy of ``df`` after splitting on ``attribute``.

    Each distinct value of ``attribute`` defines one partition; the entropy of
    the target classes inside a partition is weighted by the partition's share
    of all counted target values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) to evaluate.
    target : str or list of str
        Label of the target column, or a list of labels such as ``['PLAY']``.
    attribute : str
        Label of the column to split on.

    Returns
    -------
    float
        Non-negative weighted entropy in bits; ``0.0`` when ``df`` holds no
        target values.
    """
    totalInstance = df[target].value_counts().sum()
    if totalInstance == 0:
        return 0.0
    #
    sumEntropy = 0.0
    for value in df[attribute].unique():
        subDataFrame = df.loc[df[attribute]==value]
        # Read the labels through the ``target`` argument so the result never
        # depends on a variable defined elsewhere in the notebook.
        classCounts = subDataFrame[target].value_counts().to_numpy()
        # Empty or single-class partitions contribute no entropy.
        if len(classCounts) <= 1:
            continue
        weight = classCounts.sum()/totalInstance
        # ``entropy`` normalises the counts, so any number of classes works.
        sumEntropy += weight*entropy(classCounts, base=2)
    return float(sumEntropy)

## Function to find best attribute

In [33]:
def findBestAttribute(df, target):
    """Return the label of the column of ``df`` with the highest information gain.

    Every column that is not a target column is a candidate. Gain is the
    parent entropy minus the weighted child entropy of the split; ties go to
    the first candidate in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set (or subset) to evaluate.
    target : str or list of str
        Label of the target column, or a list of labels such as ``['PLAY']``.

    Returns
    -------
    Column label of the best attribute.

    Raises
    ------
    ValueError
        If ``df`` has no column other than the target.
    """
    targetColumns = list(target) if isinstance(target, (list, tuple)) else [target]
    # Pick candidates by name rather than by position, so the target does not
    # have to be the last column of ``df``.
    candidates = [column for column in df.columns if column not in targetColumns]
    if not candidates:
        raise ValueError("findBestAttribute needs at least one non-target column")
    #
    parentEntropy = getParentEntropy(df, target)
    gainList = [
        parentEntropy - getChildEntropy(df, target, attribute)
        for attribute in candidates
    ]
    return candidates[numpy.argmax(gainList)]

## Function to Split Dataset

In [34]:
def getSubBranch(df, attribute, value):
    """Return the rows of ``df`` where ``attribute`` equals ``value``, re-indexed from 0."""
    matches = df[attribute] == value
    branch = df.loc[matches]
    return branch.reset_index(drop=True)

## Recursive ID3 Function

In [35]:
def ID3(df, tree=None, target=None):
    """Build a decision tree from ``df`` with the ID3 algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows. Every non-target column is a candidate attribute and is
        split on each of its distinct values.
    tree : dict, optional
        Dictionary to add this node to; a new one is created when omitted.
    target : str or list of str, optional
        Label(s) of the target column. Defaults to ``['PLAY']``.

    Returns
    -------
    dict or class label
        ``{attribute: {value: subtree_or_label}}``. A bare class label is
        returned instead when ``df`` holds a single class, or when no remaining
        attribute can split it (then the most frequent class is used).

    Raises
    ------
    ValueError
        If ``df`` contains no target values.
    """
    if target is None:
        selectedTarget = ['PLAY']
    elif isinstance(target, (list, tuple)):
        selectedTarget = list(target)
    else:
        selectedTarget = [target]

    classValues, classCounts = numpy.unique(df[selectedTarget], return_counts=True)
    if len(classValues) == 0:
        raise ValueError("ID3 needs at least one row with a target value")
    if len(classValues) == 1:
        # Pure node: the only class present is the leaf.
        return classValues[0]

    # A column with a single distinct value cannot partition the rows. Choosing
    # one would hand the same rows to the next call and never terminate.
    splittable = [
        column for column in df.columns
        if column not in selectedTarget and df[column].nunique() > 1
    ]
    if not splittable:
        # Rows that agree on every attribute but disagree on the class: fall
        # back to the most frequent class.
        return classValues[numpy.argmax(classCounts)]

    # Target columns go last, after the remaining candidate attributes.
    bestAttribute = findBestAttribute(df[splittable + selectedTarget], selectedTarget)
    if tree is None:
        tree = {}
    # ``setdefault`` also covers a caller-supplied tree lacking this key.
    tree.setdefault(bestAttribute, {})
    # Missing values never compare equal to themselves, so their branch would
    # be empty; they are left out of the split.
    for value in df[bestAttribute].dropna().unique():
        nextBranch = getSubBranch(df, bestAttribute, value)
        tree[bestAttribute][value] = ID3(nextBranch, target=selectedTarget)

    return tree

# Let's Run Our ID3

In [37]:
# Build the decision tree from the full data set and pretty-print the nested dictionary.
myTree = ID3(df)
pprint.pprint(myTree)

{'OUTLOOK': {'overcast': 'yes',
             'rain': {'WIND': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'HUMIDITY': {'high': 'no', 'normal': 'yes'}}}}
