# 연습문제 3번

## Decision Tree, Information Entropy

### 4.3 Data Load

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Path of the chapter 4 dataset CSV, relative to the current working directory.
data_dir = Path('./data/chapter4_dataset.csv')
# Load the CSV into the DataFrame used by every cell below.
df = pd.read_csv(data_dir)

In [10]:
# Remove the 'Idx' column so it is not among the columns later handed to
# TreeGenerate as candidate attributes.
df = df.drop(['Idx'], axis=1)

In [11]:
# Show the frame after the drop (notebook cell output).
df

Unnamed: 0,color,root,knocks,texture,navel,touch,density,sugar_ratio,label
0,dark_green,curl_up,little_heavily,distinct,sinking,hard_smooth,0.697,0.46,1
1,black,curl_up,heavily,distinct,sinking,hard_smooth,0.774,0.376,1
2,black,curl_up,little_heavily,distinct,sinking,hard_smooth,0.634,0.264,1
3,dark_green,curl_up,heavily,distinct,sinking,hard_smooth,0.608,0.318,1
4,light_white,curl_up,little_heavily,distinct,sinking,hard_smooth,0.556,0.215,1
5,dark_green,little_curl_up,little_heavily,distinct,little_sinking,soft_stick,0.403,0.237,1
6,black,little_curl_up,little_heavily,little_blur,little_sinking,soft_stick,0.481,0.149,1
7,black,little_curl_up,little_heavily,distinct,little_sinking,hard_smooth,0.437,0.211,1
8,black,little_curl_up,heavily,little_blur,little_sinking,hard_smooth,0.666,0.091,0
9,dark_green,stiff,clear,distinct,even,soft_stick,0.243,0.267,0


## Custom Decision Tree

### training data

In [77]:
class Node:
    """A single node of the decision tree.

    A leaf only carries ``label``.  An internal node additionally records the
    attribute it splits on (``feature_name``), the cut point when the split is
    a threshold split (``t``, otherwise ``None``), and two index-aligned
    lists: the branch keys (``feature_val_lst``) and the sub-trees reached
    through them (``child_node_lst``).
    """

    def __init__(self, label=None):
        # Split description: attribute name and, for threshold splits, the cut point.
        self.feature_name, self.t = None, None

        # Branch keys and the child nodes they lead to (same index, same branch).
        self.feature_val_lst, self.child_node_lst = [], []

        # Class label carried by this node; None until one is assigned.
        self.label = label

In [108]:
import math

def AllEqual(D, A):
    """Return True when no attribute in ``A`` takes more than one distinct value in ``D``.

    An empty ``A`` yields True.
    """
    return all(len(D[name].unique()) <= 1 for name in A)

def Entropy(D):
    """Return the base-2 information entropy of the ``label`` column of ``D``.

    Accumulates ``-p * log2(p)`` over the relative frequency ``p`` of each
    distinct label.  A frame with no rows gives 0.
    """
    # Relative frequency of every distinct label value.
    class_share = D['label'].value_counts() / len(D)

    entropy = 0
    for share in class_share:
        entropy = entropy - share * math.log(share, 2)
    return entropy

def Gain(D, a):
    """Return the information gain of splitting ``D`` on the discrete attribute ``a``.

    Starts from the entropy of ``D`` and subtracts, for every distinct value
    of ``a``, the entropy of the matching rows weighted by their share of ``D``.
    """
    total = len(D)
    remaining = Entropy(D)

    for value in D[a].unique():
        subset = D.loc[D[a] == value]
        remaining -= Entropy(subset) * len(subset) / total

    return remaining

def Gain_float(D, a):
    """Find the best binary threshold split of ``D`` on the continuous attribute ``a``.

    Candidate thresholds are the midpoints between consecutive sorted distinct
    values of ``a``.  For each candidate ``t`` the gain is the entropy of ``D``
    minus the size-weighted entropies of the rows with ``a < t`` and of the
    rows with ``a > t``.

    Args:
        D: DataFrame containing column ``a`` and a ``label`` column.
        a: Name of the continuous attribute column.

    Returns:
        A ``(gain, t)`` tuple for the candidate with the largest gain (the
        first such candidate on ties).  When ``a`` has fewer than two distinct
        values there is nothing to split on and ``(float('-inf'), None)`` is
        returned, so this attribute cannot win a max-by-gain comparison
        against an attribute that has a real gain.
    """
    distinct_vals = np.sort(D[a].unique())
    if len(distinct_vals) < 2:
        # No midpoint exists.  Without this guard max() below would be called
        # on an empty list and raise ValueError.
        return float('-inf'), None

    total = len(D)
    base_entropy = Entropy(D)  # same for every candidate, so computed once
    candidates = []  # (gain, t)

    for lower, upper in zip(distinct_vals[:-1], distinct_vals[1:]):
        t = (lower + upper) / 2
        gain = base_entropy

        # Rows strictly below the threshold, then rows strictly above it.
        for side in (D[D[a] < t], D[D[a] > t]):
            gain -= Entropy(side) * len(side) / total
        candidates.append((gain, t))

    return max(candidates, key=lambda pair: pair[0])

        
def BestAtt(D, A):
    """Pick the attribute in ``A`` with the highest information gain on ``D``.

    Columns whose dtype is ``float64`` are scored with ``Gain_float`` (which
    also yields a threshold); every other column is scored with ``Gain``.

    Returns:
        ``(gain, attribute, t)`` for the best attribute, where ``t`` is the
        threshold for continuous attributes and ``None`` otherwise.  On ties
        the first attribute encountered while iterating ``A`` is kept.
    """
    def score(att):
        # Continuous attributes come with their own threshold; discrete ones do not.
        if D[att].dtype == 'float64':
            gain, t = Gain_float(D, att)
            return gain, att, t
        return Gain(D, att), att, None

    scored = [score(att) for att in A]  # (gain, attribute, t)
    return max(scored, key=lambda entry: entry[0])
    

def TreeGenerate(D, A):
    """Recursively build a decision tree from ``D`` using the attributes in ``A``.

    Args:
        D: DataFrame holding the attribute columns and a ``label`` column.
        A: Set of attribute (column) names still available for splitting.
            It must be a ``set`` because the discrete branch removes the
            chosen attribute with ``A - {best_att}``.

    Returns:
        The root ``Node`` of the (sub)tree.  Leaves have ``label`` set;
        internal nodes have ``feature_name``, ``t`` and the parallel
        ``feature_val_lst`` / ``child_node_lst`` lists filled in.
    """
    node = Node()
    labels = D['label']

    # Every sample has the same class: make a leaf for that class.
    if len(labels.unique()) == 1:
        node.label = labels.unique()[0]
        return node

    # The samples cannot be told apart by any remaining attribute:
    # make a leaf for the majority class.
    if AllEqual(D, A):
        node.label = labels.value_counts().idxmax()
        return node

    _, best_att, t = BestAtt(D, A)
    node.feature_name = best_att
    node.t = t

    # Compare against None rather than relying on truthiness: a threshold of
    # exactly 0.0 is a valid cut point but is falsy.
    if t is not None:
        # Threshold split.  Branch key 0 holds the rows below t and key 1 the
        # rows above it; predict_tree decodes these keys.  A is passed on
        # unchanged, so the same attribute may be split again further down.
        column = D[best_att]
        for key, mask in enumerate((column < t, column > t)):
            node.feature_val_lst.append(key)
            node.child_node_lst.append(TreeGenerate(D[mask], A))
    else:
        # Discrete split: one branch per value observed in D, and the
        # attribute is not reused below this node.
        for att_val in D[best_att].unique():
            D_v = D[D[best_att] == att_val]
            node.feature_val_lst.append(att_val)

            if len(D_v) == 0:
                # No row matched this value (e.g. a NaN, which never compares
                # equal to itself): fall back to the majority class of D.
                child_node = Node(label=labels.value_counts().idxmax())
            else:
                child_node = TreeGenerate(D_v, A - {best_att})
            node.child_node_lst.append(child_node)
    return node

In [109]:
# Build the tree on the full frame, offering every column except 'label'
# as a candidate attribute.
root = TreeGenerate(df, set(df.columns) - {'label'})

In [113]:
# Inspect the label stored two levels down: the second child of the root's first child.
root.child_node_lst[0].child_node_lst[1].label

1

### predict

In [152]:
def predict_tree(data, root):
    """Predict the label of one sample by walking the tree from ``root``.

    Args:
        data: DataFrame whose first row is the sample; it is indexed by
            column name and only ``.values[0]`` of a column is read.
        root: Node to start from.  It needs the attributes ``feature_name``,
            ``t``, ``feature_val_lst``, ``child_node_lst`` and ``label``.

    Returns:
        The label found by following matching branches down the tree.  If no
        branch of a node matches the sample (for example a discrete value
        that is not among the node's branch keys), that node's own ``label``
        is returned.
    """
    # A node without branches is a leaf; its feature_name is not looked up.
    if not root.feature_val_lst:
        return root.label

    value = data[root.feature_name].values[0]

    # Compare against None rather than relying on truthiness: a threshold of
    # exactly 0.0 is valid but falsy and must not be read as "discrete".
    is_threshold_split = root.t is not None

    for feature_val, child in zip(root.feature_val_lst, root.child_node_lst):
        if is_threshold_split:
            # Branch key 0 is the lower side, key 1 the upper side.  A value
            # exactly on the threshold goes to the lower side so that it
            # still receives a prediction instead of matching no branch.
            matched = value <= root.t if feature_val == 0 else value > root.t
        else:
            matched = value == feature_val

        if matched:
            return predict_tree(data, child)

    return root.label

In [156]:
# Predict for the row at position 1; the slice keeps it a one-row DataFrame,
# which is what predict_tree indexes by column name.
predict_tree(df.iloc[1:2], root)

1

In [155]:
# Predict for the row at position 11, again passed as a one-row DataFrame.
predict_tree(df.iloc[11:12], root)

0