# 연습문제 4번

## Decision Tree ,  Gini index ,  pruning  

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

data_dir = Path('./data/chapter4_dataset.csv')
df = pd.read_csv(data_dir)
df = df.drop(['density', 'sugar_ratio'], axis=1)

In [2]:
train_idx = [1, 2, 3, 6, 7, 10, 14, 15, 16, 17]
test_idx = [4, 5, 8, 9, 11, 12, 13]

In [3]:
train = df[np.isin(df['Idx'], train_idx)]
test = df[np.isin(df['Idx'], test_idx)]

In [4]:
train = train.drop(['Idx'], axis=1)
test = test.drop(['Idx'], axis=1)

## custom Decision Tree

### training data

In [6]:
class Node:
    def __init__(self, label=None):
        self.feature_name = None
        
        self.feature_val_lst = []
        self.child_node_lst = []
        
        self.label = label

In [28]:
import math

def AllEqual(D, A):
    for att in A:
        if len(D[att].unique()) > 1:
            return False
    return True

def Gini(D):
    gini = 1
    length = len(D)
    
    for count in D['label'].value_counts():
        p = count / length
        gini -= p * p
    return gini

def Gini_index(D, a):
    gain = 0
    length = len(D)
    
    for att_val in D[a].unique():
        D_v = D[D[a] == att_val]
        gain += Gini(D_v) * len(D_v) / length
    
    return gain
        
def BestAtt(D, A):   
    gini_index_lst = [] # (gain, attribute)
    
    for att in A:
        gini_index = Gini_index(D, att)
        gini_index_lst += [(gini_index, att)]
            
    return min(gini_index_lst, key=lambda x: x[0])
    

def TreeGenerate(D, A):  # D: data, A: 속성  A_type: set 
    node = Node()
    
    if len(D['label'].unique()) == 1:
        node.label = D['label'].unique()[0]
        return node
    if AllEqual(D, A):
        node.label = D['label'].value_counts().idxmax()
        return node
    
    _, best_att = BestAtt(D, A)
    
    node.feature_name = best_att
    
    for att_val in D[best_att].unique():
        D_v = D[D[best_att] == att_val]
        node.feature_val_lst += [att_val]

        if len(D_v) == 0:
            child_node = Node(label=D['label'].value_counts().idxmax())
            node.child_node_lst += [child_node]
            print(1)
        else:
            node.child_node_lst += [TreeGenerate(D_v, A - {best_att})]
    return node

In [29]:
root = TreeGenerate(train, set(train.columns) - {'label'})

### predict

In [15]:
def predict_tree(data, root):
    label = root.label
    for i in range(len(root.feature_val_lst)):
        feature_val = root.feature_val_lst[i]
        
        if data[root.feature_name].values[0] == feature_val:
            label = predict_tree(data, root.child_node_lst[i])
            break
    return label

In [19]:
pred_lst = []
for i in range(len(test)):
    pred = predict_tree(test.iloc[i:i+1], root)
    pred_lst += [pred]

In [20]:
pred_lst

[0, 0, 1, 1, 0, 0, 1]

In [25]:
acc = np.mean(np.array(pred_lst) == test['label'])

In [26]:
acc

0.42857142857142855

## pruning Decision Tree

### pre-pruning

In [5]:
class Node:
    def __init__(self, label=None):
        self.feature_name = None
        
        self.feature_val_lst = []
        self.child_node_lst = []
        
        self.label = label

In [46]:
import math

def AllEqual(D, A):
    for att in A:
        if len(D[att].unique()) > 1:
            return False
    return True

def Gini(D):
    gini = 1
    length = len(D)
    
    for count in D['label'].value_counts():
        p = count / length
        gini -= p * p
    return gini

def Gini_index(D, a):
    gain = 0
    length = len(D)
    
    for att_val in D[a].unique():
        D_v = D[D[a] == att_val]
        gain += Gini(D_v) * len(D_v) / length
    
    return gain
        
def BestAtt(D, A):   
    gini_index_lst = [] # (gain, attribute)
    
    for att in A:
        gini_index = Gini_index(D, att)
        gini_index_lst += [(gini_index, att)]
            
    return min(gini_index_lst, key=lambda x: x[0])

def correct_count(D):
    if D.empty:
        return 0
    return D['label'].value_counts().max()
    

def TreeGenerate_pre_pruning(D, A, D_test):  # D: data, A: 속성  A_type: set 
    node = Node()
    
    if len(D['label'].unique()) == 1:
        node.label = D['label'].unique()[0]
        return node
    if AllEqual(D, A) or D_test.empty:
        node.label = D['label'].value_counts().idxmax()
        return node
    
    _, best_att = BestAtt(D, A)
    
    node.feature_name = best_att
    D_v_lst = []
    D_v_test_lst = []
    
    for att_val in D[best_att].unique():
        D_v = D[D[best_att] == att_val]
        D_v_lst += [D_v]
        
        D_v_test = D_test[D_test[best_att] == att_val]
        D_v_test_lst += [D_v_test]
        
        node.feature_val_lst += [att_val]
    child_correct_sum = sum(list(map(correct_count, D_v_test_lst)))
    if correct_count(D) < child_correct_sum:
        for i in len(D_v_lst):
            node.child_node_lst += [TreeGenerate(D_v_lst[i], A-{best_att}, 
                                                D_v_test_lst[i])]
    else:
        node.feature_val_lst = []
        node.label = D['label'].value_counts().idxmax()
    
    return node

In [13]:
test['label'].value_counts().min()

4

In [43]:
a = test[test['label'] == 2]

In [45]:
[a, test]

[Empty DataFrame
 Columns: [color, root, knocks, texture, navel, touch, label]
 Index: [],
           color            root          knocks      texture           navel  \
 3    dark_green         curl_up         heavily     distinct         sinking   
 4   light_white         curl_up  little_heavily     distinct         sinking   
 7         black  little_curl_up  little_heavily     distinct  little_sinking   
 8         black  little_curl_up         heavily  little_blur  little_sinking   
 10  light_white           stiff           clear         blur            even   
 11  light_white         curl_up  little_heavily         blur            even   
 12   dark_green  little_curl_up  little_heavily  little_blur         sinking   
 
           touch  label  
 3   hard_smooth      1  
 4   hard_smooth      1  
 7   hard_smooth      1  
 8   hard_smooth      0  
 10  hard_smooth      0  
 11   soft_stick      0  
 12  hard_smooth      0  ]

In [40]:
a[0] 

Unnamed: 0,color,root,knocks,texture,navel,touch,label


In [36]:
b

nan

In [37]:
sum([1, b])

nan

In [83]:
import math

def AllEqual(D, A):
    for att in A:
        if len(D[att].unique()) > 1:
            return False
    return True

def Entropy(D):
    ent = 0
    length = len(D)
    
    for count in D['label'].value_counts():
        p = count / length
        ent -= p * math.log(p, 2)
    return ent

def Gain(D, a):
    gain = Entropy(D)
    length = len(D)
    
    for att_val in D[a].unique():
        D_v = D[D[a] == att_val]
        gain -= Entropy(D_v) * len(D_v) / length
    
    return gain
        
def BestAtt(D, A):   
    gain_lst = [] # (gain, attribute)
    
    for att in A:
        gain = Gain(D, att)
        gain_lst += [(gain, att)]
            
    return max(gain_lst, key=lambda x: x[0])

def correct_count(D):
    if D.empty:
        return 0
    return D['label'].value_counts().max()
    

def TreeGenerate_pre_pruning(D, A, D_test):  # D: data, A: 속성  A_type: set 
    node = Node()
    
    if len(D['label'].unique()) == 1:
        node.label = D['label'].unique()[0]
        return node
    if AllEqual(D, A) or D_test.empty:
        node.label = D['label'].value_counts().idxmax()
        return node
    
    _, best_att = BestAtt(D, A)
    print(best_att)
    
    node.feature_name = best_att
    D_v_lst = []
    D_v_test_lst = []
    
    for att_val in D[best_att].unique():
        D_v = D[D[best_att] == att_val]
        D_v_lst += [D_v]
        
        D_v_test = D_test[D_test[best_att] == att_val]
        D_v_test_lst += [D_v_test]
        
        node.feature_val_lst += [att_val]
        
    child_correct_sum = sum(list(map(correct_count, D_v_test_lst)))
    
    if correct_count(D_test) < child_correct_sum:
        for i in range(len(D_v_lst)):
            node.child_node_lst += [TreeGenerate_pre_pruning(D_v_lst[i], A-{best_att}, 
                                                D_v_test_lst[i])]
    else:
        node.feature_val_lst = []
        node.label = D['label'].value_counts().idxmax()
    
    return node

In [84]:
root = TreeGenerate_pre_pruning(train, set(train.columns) - {'label'}, test)

navel
root
touch


In [85]:
root.feature_val_lst[2]

'even'

In [86]:
root.child_node_lst[2].feature_name

In [81]:
train

Unnamed: 0,color,root,knocks,texture,navel,touch,label
0,dark_green,curl_up,little_heavily,distinct,sinking,hard_smooth,1
1,black,curl_up,heavily,distinct,sinking,hard_smooth,1
2,black,curl_up,little_heavily,distinct,sinking,hard_smooth,1
5,dark_green,little_curl_up,little_heavily,distinct,little_sinking,soft_stick,1
6,black,little_curl_up,little_heavily,little_blur,little_sinking,soft_stick,1
9,dark_green,stiff,clear,distinct,even,soft_stick,0
13,light_white,little_curl_up,heavily,little_blur,sinking,hard_smooth,0
14,black,little_curl_up,little_heavily,distinct,little_sinking,soft_stick,0
15,light_white,curl_up,little_heavily,blur,even,hard_smooth,0
16,dark_green,curl_up,heavily,little_blur,little_sinking,hard_smooth,0
