<a href="https://colab.research.google.com/github/seoyeon7/ML/blob/main/Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [186]:
import math
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [187]:
#iris 데이터셋 불러오기

data = pd.read_csv('/content/iris.data',header=None, names=['sepal_len','sepal_width','petal_len','petal_width','class'])

In [188]:
# 범주형 알고리즘 사용을 위해 데이터 정수화
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(data['sepal_len'])
data["sepal_len"] = encoder.transform(data["sepal_len"])

encoder.fit(data['sepal_width'])
data['sepal_width'] = encoder.transform(data['sepal_width'])

encoder.fit(data['petal_len'])
data['petal_len'] = encoder.transform(data['petal_len'])

encoder.fit(data['petal_width'])
data['petal_width'] = encoder.transform(data['petal_width'])

encoder.fit(data['class'])
data['class'] = encoder.transform(data['class'])

In [189]:
#기술 속성(descriptive features)과 대상 속성 분류(target feature)
index=data[['sepal_len','sepal_width','petal_len','petal_width']]
target = data['class']
data.head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class
0,8,14,4,1,0
1,6,9,4,1,0
2,4,11,3,1,0
3,3,10,5,1,0
4,7,15,4,1,0


In [190]:
data = data.sample(frac=1).reset_index(drop=True)
data['seq'] = data.index
data.head()

Unnamed: 0,sepal_len,sepal_width,petal_len,petal_width,class,seq
0,33,17,41,18,2,0
1,3,13,4,2,0,1
2,14,4,26,16,2,2
3,8,17,5,2,0,3
4,21,6,29,15,2,4


In [191]:
#엔트로피 계산
def entropy(target_col):
    elements,counts = np.unique(target_col,return_counts = True)
    entropy = -np.sum([(counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy

In [192]:
# 정보이득 계산

def Gain(data,gain_data,target_name="class"):
    # 전체 데이터 셋에 대한 엔트로피 계산
    entropy_all = entropy(data[target_name])
    
    # 정보이득을 구할 데이터 셋에 대한 엔트로피 계산
    vals,counts= np.unique(data[gain_data],return_counts=True)
    Entropy_tar = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[gain_data]==vals[i])) for i in range(len(vals))])
    
    #정보이득 계산
    Infogain = entropy_all - Entropy_tar
    return Infogain

In [193]:
from enum import unique
# ID3 알고리즘 사용
def ID3(data,original,features,target_name="class",parent_node = None):

  # 유일한 값을 가지는 경우 종료 후 속성 반환
  if len(np.unique(data[target_name]))<=1:
    return np.unique(data[target_name])[0]

  # 데이터가 없는 경우 원본 데이터 중 최대 값을 가지는 속성 반환
  elif len(data)==0:
    return np.unique(original[target_name])[np.argmax(np.unique(original[target_name],return_counts=True)[1])]

  # features 가 없을 경우 부모 노드의 속성으로 반환
  elif len(features)==0:
    return parent_node
  
  # 부모 노드의 대상 속성 정의
  else:
    parent_node = np.unique(data[target_name])[np.argmax(np.unique(data[target_name],return_counts=True)[1])]

    # 데이터를 분할할 속성 선택
    values = [Gain(data,feature,target_name) for feature in features]
    best_index = np.argmax(values)
    best_feature = features[best_index]

    # 트리 구조 생성
    tree={best_feature:{}}

    # 최대 정보 이득인 속성 제외
    features = [i for i in features if i !=best_feature]

    # 데이터 분할 / 결측값 제거
    for value in np.unique(data[best_feature]):
      sub_data = data.where(data[best_feature] == value).dropna()

      subtree = ID3(sub_data, data, features, target_name, parent_node)
      tree[best_feature][value] = subtree

    return (tree)

In [194]:
def predict(query, tree, default=1):
  for key in list(query.keys()):
    if key in list(tree.keys()):
      try:
        result = tree[key][query[key]] 
      except:
        return default
     
      result = tree[key][query[key]]
      
      if isinstance(result,dict):
          return predict(query,result)
      else:
          return result

In [195]:
def split(dataset):
    train_size = int(data.shape[0]*0.80)
    test_size = int(data.shape[0]*0.20)
    
    traindata = dataset.iloc[test_size:].reset_index(drop=True)
    testdata = dataset.iloc[:test_size].reset_index(drop=True)
    return traindata,testdata

train = split(data)[0]
test = split(data)[1] 

In [196]:
def testset(data,tree):
    
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    predicted = pd.DataFrame(columns=["predicted"]) 
    
    # 정확도 계산
    for i in range(len(data)):
        predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0) 
    print('The prediction accuracy is: ',(np.sum(predicted["predicted"] == data["class"])/len(data))*100,'%')

In [197]:
from pprint import pprint

tree = ID3(train,train,train.columns[:-1])
pprint(tree)

{'petal_len': {0: 0.0,
               2: 0.0,
               3: 0.0,
               4: 0.0,
               5: 0.0,
               6: 0.0,
               7: 0.0,
               8: 0.0,
               9: 1.0,
               10: 1.0,
               11: 1.0,
               12: 1.0,
               13: 1.0,
               14: 1.0,
               15: 1.0,
               16: 1.0,
               17: 1.0,
               19: 1.0,
               20: 1.0,
               21: {'sepal_width': {1.0: 1.0,
                                    4.0: 2.0,
                                    8.0: 1.0,
                                    9.0: 1.0,
                                    11.0: 1.0,
                                    13.0: 1.0}},
               22: 1.0,
               23: 1.0,
               24: {'sepal_len': {16.0: 1.0, 17.0: 2.0, 19.0: 2.0, 25.0: 1.0}},
               25: {'sepal_width': {4.0: 1.0,
                                    6.0: 2.0,
                                    7.0: 2.0,
       

In [198]:
testset(test,tree)

The prediction accuracy is:  90.0 %
