In [224]:
# import needed library
from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import math

In [225]:
#load iris dataset
data_iris = load_iris()
iris_X, iris_y = load_iris(return_X_y=True)
feature_iris = data_iris['feature_names']

#load play tennis dataset
play_tennis =  pd.read_csv('play_tennis.csv')
play_tennis = play_tennis.drop('day',axis=1)

In [226]:
#transform iris into dataframe
iris_X=pd.DataFrame(iris_X)
iris_y=pd.DataFrame(iris_y)

In [227]:
#create index so be merge
iris_X=iris_X.reset_index()
iris_y=iris_y.reset_index()

In [228]:
iris_y.rename(columns = {0:4}, inplace = True) 

In [229]:
#merge dataset iris
iris=iris_X.merge(iris_y)

In [230]:
#drop index
iris.drop("index",axis=1,inplace=True)

In [231]:
iris.rename(columns = {0:feature_iris[0],1:feature_iris[1],2:feature_iris[2],3:feature_iris[3],4:"target"}, inplace = True)

In [232]:
def entropy(parsed_data, target_attribute):
    parsed_value_target = {}
    total_value_target = 0
  
    for i in parsed_data[target_attribute]:
        if i is not None:
            if i not in parsed_value_target:
                parsed_value_target[i] = 1
            else:
                parsed_value_target[i] += 1

            total_value_target += 1
  
    log_result = 0

    for i in parsed_value_target:
        log_result += float(parsed_value_target[i])/total_value_target * math.log((float(parsed_value_target[i])/total_value_target), 2)
  
    return -1 * log_result

In [233]:
# hasn't handle after universal entropy
def information_gain(data, gain_attribute, target_attribute):
    gain_result = 0
    attribute_entropy_result = 0
    parsed_attribute_count = {}
    total_attribute_count = 0
    
    for i in data[gain_attribute]:
        if i is not None:
            if i not in parsed_attribute_count:
                parsed_attribute_count[i] = 1
            else:
                parsed_attribute_count[i] += 1
            
            total_attribute_count += 1
    
    for i in parsed_attribute_count:
        parsed_data = data.loc[data[gain_attribute]==i]
        attribute_entropy_result += float(parsed_attribute_count[i])/total_attribute_count * entropy(parsed_data, target_attribute)    

    gain_result += entropy(data,target_attribute) + (-1 * attribute_entropy_result)
    return gain_result

In [234]:
def best_attribute(data,target_attribute):
    gain_attribute = {
        'value': 0,
        'name': ''
    }
    
    
    for i in data.columns:
        if (i != target_attribute):
            if information_gain(data, i, target_attribute) > gain_attribute['value']:
                gain_attribute['value'] = information_gain(data, i, target_attribute)
                gain_attribute['name'] = i

    return gain_attribute['name']

In [235]:
entropy(play_tennis,"play")

0.9402859586706309

In [236]:
information_gain(play_tennis, "outlook", "play")

0.2467498197744391

In [237]:
best_attribute(play_tennis,'play')

'outlook'

In [238]:
import math

class Node:
    def __init__(self, attribute=None, label=None):
        self.attribute = attribute
        self.label = label
        self.children = {}
        self.isDaun = False
  
    def setIsDaun(self, isDaun):
        self.isDaun = isDaun
        
    def setAttribute(self, attribute):
        self.attribute = attribute

    def setLabel(self, label):
        self.label = label
  
    def addChildren(self, attributeValue, node):
        self.children[attributeValue] = node
    
    def getChildren(self):
        return self.children
    
    def isEmpty(self):
        return len(self.children)==0
    
    def getLabel(self):
        return self.label

In [239]:
def get_most_common_label(data, target_attribute):
    parsed_value_target = {}
  
    for i in data[target_attribute]:
        if i is not None:
            if i not in parsed_value_target:
                parsed_value_target[i] = 1
            else:
                parsed_value_target[i] += 1

    most_common = {
        'value': 0,
        'name': ''
    }
    
    for i in parsed_value_target:
        if parsed_value_target[i] > most_common['value']:
            most_common['value'] = parsed_value_target[i]
            most_common['name'] = i
    
    return most_common['name']

In [240]:
play_tennis['play'].unique()[0]

'No'

In [241]:
def id3(data, target_attribute):
    node = Node()
    if data[target_attribute].nunique()==1:
        node.setLabel(data[target_attribute].unique()[0])
        return node
     
    elif len(play_tennis.columns)==1:
        node.setLabel(get_most_common_label(data, target_attribute))
        return node
    
    else:
        best_attribute_ = best_attribute(data,target_attribute)
        node.setAttribute(best_attribute_)
        for i in data[best_attribute_].unique():
            node.addChildren(i,id3(data.loc[data[best_attribute_]==i],target_attribute))
            
    return node

In [242]:
play_tennis['play'][0]

'No'

In [243]:
id3(play_tennis,"play")

<__main__.Node at 0x20f24021780>

In [244]:
def print_tree(node,depth):
    if node.label is not None: 
        print("    "*(depth+1) +node.label)
    else:
        print("    "*depth + "["+ node.attribute +"]")
        for i in node.children:
            print("----"*(depth+1) +i)
            print_tree(node.children[i],depth+1)        
        

In [245]:
print_tree(id3(play_tennis, "play"),0)

[outlook]
----Sunny
    [humidity]
--------High
            No
--------Normal
            Yes
----Overcast
        Yes
----Rain
    [wind]
--------Weak
            Yes
--------Strong
            No


In [246]:
print(id3(play_tennis, "play").children['Sunny'].children['High'].label)

No


In [247]:
test=play_tennis[0:3]

In [248]:
test_X = test.drop('play',axis=1)

In [249]:
def check_tree(node,data,index,result):
    if node.label is not None: 
        result.append(node.getLabel())
    else:
        for i in node.children:
            if i==data.loc[index,node.attribute]:
                check_tree(node.children[i],data,index,result)

In [250]:
def pred(data,model):
    result = []
    for i in range(len(data)):
        check_tree(model,data[i:i+1],i,result)
    return pd.DataFrame(result)
    

In [251]:
model=id3(play_tennis, "play")
res = pred(test_X,model)

In [252]:
res

Unnamed: 0,0
0,No
1,No
2,Yes


In [253]:
def accuracy(pred,data,target_attribute):
    cnt = 0
    for i in range(len(pred)):
        if pred.loc[i] == data.loc[i]:
            cnt+=1
    return cnt*100/len(pred)

In [254]:
accuracy(play_tennis['play'],x,'play')

35.714285714285715