# 기본분류모델(Decision Tree)

## 라이브러리 로딩

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

## 데이터 로딩

In [2]:
# Load the iris dataset bundle (feature matrix, targets, feature names).
iris = load_iris()
# Build one table: the feature columns first, then the class id as 'label'.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    label=iris.target
)
# Preview the first five rows.
iris_df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## 데이터 분할

In [3]:
# Features are the first four columns of the table; the target is 'label'.
X = iris_df.loc[:, iris_df.columns[:4]]
y = iris_df.loc[:, 'label']

# Split into training and test sets. No test_size is passed, so
# train_test_split's default proportions apply. stratify=y keeps the class
# ratio the same in both parts, and random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

## 학습

In [4]:
# Create the decision tree classifier; random_state makes the fitted tree
# reproducible between runs.
tree_model = DecisionTreeClassifier(random_state=42)
# Train on the training split only (the fitted estimator is displayed).
tree_model.fit(X=X_train, y=y_train)

## 평가

In [5]:
# Evaluate the fitted tree on the held-out test split; for a classifier,
# score() reports mean accuracy.
score = tree_model.score(X=X_test, y=y_test)
score

0.8947368421052632

## 예측

In [6]:
import numpy as np

# Three new flower measurements, one row per sample, given in the same
# column order as the training features.
# The model was fitted on a DataFrame, so the samples are wrapped in a
# DataFrame carrying the same column names; passing a bare ndarray makes
# recent scikit-learn versions warn that X has no valid feature names.
data = pd.DataFrame(
    np.array([
        [5.4, 4, 1.5, 0.2],
        [6.2, 2.7, 5.1, 1.6],
        [6.5, 3.1, 5.2, 2.0],
    ]),
    columns=X.columns,
)
# Predicted class id for each of the three samples.
y_pred = tree_model.predict(data)
y_pred



array([0, 1, 2])

# 교차검증(Cross Validation)

In [7]:
from sklearn.model_selection import cross_validate

# cv=3: the data is split into 3 train/test folds and the model is fitted
# and scored once per fold. return_estimator=True additionally keeps the
# estimator fitted on each fold under the 'estimator' key of the result.
scores = cross_validate(
    estimator=tree_model,
    X=X,
    y=y,
    cv=3,
    return_estimator=True,
)
scores

{'fit_time': array([0.0051229 , 0.0051887 , 0.00614786]),
 'score_time': array([0.        , 0.00720763, 0.        ]),
 'estimator': [DecisionTreeClassifier(random_state=42),
  DecisionTreeClassifier(random_state=42),
  DecisionTreeClassifier(random_state=42)],
 'test_score': array([0.98, 0.94, 0.96])}

In [9]:
# Print the predictions of every classifier fitted during cross-validation.
# Iterating the estimator list directly keeps this loop correct for any
# number of folds instead of hard-coding the fold count.
for fold_estimator in scores['estimator']:
    print(fold_estimator.predict(data))

[0 1 2]
[0 2 2]
[0 2 2]




# 하이퍼파라미터 최적화(GridSearchCV)

In [10]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter candidates, keyed by the estimator's parameter names.
parameters = dict(max_depth=[1, 2, 3], min_samples_split=[2, 3])

# param_grid: the hyperparameter candidates to search over.
# cv=3: every candidate combination is evaluated on 3 train/test folds.
grid_trees = GridSearchCV(estimator=tree_model, param_grid=parameters, cv=3)

# Fit and evaluate each hyperparameter combination in param_grid, using the
# training split only.
grid_trees.fit(X=X_train, y=y_train)

In [11]:
# Inspect the raw cross-validation results recorded on the fitted grid
# search (timings, the parameter combinations tried, and their scores).
grid_trees.cv_results_

{'mean_fit_time': array([0.00657479, 0.00266552, 0.00162641, 0.00521191, 0.00520825,
        0.00523504]),
 'std_fit_time': array([0.00127141, 0.00376961, 0.00230009, 0.00737075, 0.00736558,
        0.00740346]),
 'mean_score_time': array([0.00153613, 0.00530704, 0.00266592, 0.        , 0.00545661,
        0.        ]),
 'std_score_time': array([0.00196626, 0.00375269, 0.00377018, 0.        , 0.00723638,
        0.        ]),
 'param_max_depth': masked_array(data=[1, 1, 2, 2, 3, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 1, 'min_samples_split': 2},
  {'max_depth': 1, 'min_samples_split': 3},
  {'max_depth': 2, 'min_samples_split': 2},
  {'max_depth': 2, 'min_samples_split': 3},
  {'max_depth': 3, 'min_samples_split': 2},
  {'max_depth': 3, 'min_samples_split': 3}],
 ...}

In [12]:
# Turn the grid-search results dictionary into a DataFrame, one row per
# hyperparameter combination, so it can be read as a table.
scores_df = pd.DataFrame(data=grid_trees.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006575,0.001271,0.001536,0.001966,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.657895,0.648649,0.675676,0.66074,0.011216,5
1,0.002666,0.00377,0.005307,0.003753,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.657895,0.648649,0.675676,0.66074,0.011216,5
2,0.001626,0.0023,0.002666,0.00377,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.947368,0.918919,0.945946,0.937411,0.013089,1
3,0.005212,0.007371,0.0,0.0,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.947368,0.918919,0.945946,0.937411,0.013089,1
4,0.005208,0.007366,0.005457,0.007236,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.921053,0.918919,0.945946,0.928639,0.012269,3
5,0.005235,0.007403,0.0,0.0,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.921053,0.918919,0.945946,0.928639,0.012269,3


In [13]:
# Report the best hyperparameter combination and its mean cross-validated
# score (printed to four decimal places).
print('GridSearchCV 최적 파라미터:', grid_trees.best_params_)
print(f'GridSearchCV 최고 정확도: {grid_trees.best_score_:.4f}')
# Score the best-performing classifier on the held-out test split.
score = grid_trees.best_estimator_.score(X_test, y_test)
score

GridSearchCV 최적 파라미터: {'max_depth': 2, 'min_samples_split': 2}
GridSearchCV 최고 정확도: 0.9374


0.9210526315789473