In [1]:
## 사이킷런의 기초인 의사결정나무 모형부터 실행해보기로 한다. 데이터셋은 iris 데이터를 이용한다.

## 붓꽃 품종 예측하기
import sklearn
import pandas as pd
## 의사결정나무 트리 사용
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the iris dataset and keep the feature matrix and the target vector
# under their own names for the cells below.
iris = load_iris()
iris_data, iris_label = iris.data, iris.target

# Build a DataFrame view of the same data: one column per feature name,
# plus a 'label' column holding the target, and preview the first rows.
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names).assign(label=iris_label)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Hold out 20% of the samples as the test set; the split is seeded (random_state=11).
X_train, X_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=11,
)

In [4]:
# Create a seeded decision-tree classifier and train it on the training split.
# The fit() call is the cell's last expression, so the notebook displays its return value.
dt_clf = DecisionTreeClassifier(random_state=11)
dt_clf.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=11)

In [5]:
# Predict labels for the held-out test features with the trained tree.
pred = dt_clf.predict(X=X_test)

In [6]:
# Measure accuracy: compare the predictions against the true test labels.
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

0.9333333333333333


In [7]:
# Re-split the data with 30% held out for testing (70% for training),
# using random_state=121 for the split.
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label,
                                                    test_size=0.3, random_state=121)

# Seed the tree too (same seed as the split, mirroring the earlier random_state=11 cell).
# An unseeded DecisionTreeClassifier is not guaranteed to build the same tree on every
# run, so the printed accuracy could change between "Restart & Run All" executions.
dt_clf = DecisionTreeClassifier(random_state=121)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

# NOTE(review): the saved output (0.9555555555555556) came from a run with an unseeded
# tree; re-run this cell and refresh the recorded value now that the seed is fixed.
print(accuracy_score(y_test, pred))

0.9555555555555556


In [8]:
# Cross-validation with plain K-fold. This cell imports the extra libraries it needs.
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

# Reload iris and start from a fresh, seeded classifier.
iris = load_iris()
features, label = iris.data, iris.target
dt_clf = DecisionTreeClassifier(random_state=156)

# KFold object that splits the data into 5 folds (no shuffle argument is passed);
# taccuracy collects the accuracy of each fold.
kfold = KFold(n_splits=5)
taccuracy = []

for train_index, test_index in kfold.split(features):
    # Slice out this fold's training and validation rows.
    X_train, y_train = features[train_index], label[train_index]
    X_test, y_test = features[test_index], label[test_index]
    # fit() returns the estimator itself, so training and prediction can be chained.
    pred = dt_clf.fit(X_train, y_train).predict(X_test)
    # Keep the fold accuracy rounded to 4 decimals.
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    taccuracy.append(accuracy)

print('각 폴드별 정확도', taccuracy)

각 폴드별 정확도 [1.0, 0.9667, 0.8667, 0.9333, 0.7333]


In [9]:
# Stratified K-fold: same evaluation loop as the plain K-fold cell, but the
# splitter is also given the labels when generating the folds.
from sklearn.model_selection import StratifiedKFold

dt_clf = DecisionTreeClassifier(random_state=156)
skfold = StratifiedKFold(n_splits=3)
taccuracy = []

for train_index, test_index in skfold.split(features, label):
    # Slice out this fold's training and validation rows.
    X_train, y_train = features[train_index], label[train_index]
    X_test, y_test = features[test_index], label[test_index]
    # fit() returns the estimator itself, so training and prediction can be chained.
    pred = dt_clf.fit(X_train, y_train).predict(X_test)
    # Unlike the K-fold cell, the accuracy is stored unrounded.
    accuracy = accuracy_score(y_test, pred)
    taccuracy.append(accuracy)

print('교차 검증별 정확도', taccuracy)

교차 검증별 정확도 [0.98, 0.94, 0.98]


In [10]:
# A more convenient way to cross-validate: cross_val_score().
# The previous cells built the folds and ran fit/predict inside a for loop;
# here a single call returns the per-fold scores instead.
from sklearn.model_selection import cross_val_score, cross_validate

scores = cross_val_score(
    dt_clf,
    features,
    label,
    scoring='accuracy',
    cv=3,
)
print(scores)

[0.98 0.94 0.98]


In [11]:
# Use grid search to tune the hyperparameters and cross-validate in one step.
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the data; the search below only sees the training split.
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label,
                                                    test_size=0.2, random_state=121)

# Seed the base estimator (same seed as the split) so that every candidate is fitted
# deterministically and the cv_results_ table is reproducible across re-runs.
dt_clf = DecisionTreeClassifier(random_state=121)

# Candidate hyperparameter values, given as a dictionary: 3 x 2 = 6 combinations.
parameters = {'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]}

# 3-fold cross-validation per combination; refit=True retrains on all of X_train
# with the best combination once the search finishes.
grid = GridSearchCV(dt_clf, param_grid=parameters, cv=3, refit=True, return_train_score=True)
grid.fit(X_train, y_train)

# Show the per-combination results: parameters, mean CV score, rank, and per-split scores.
# NOTE(review): the saved output table came from a run with an unseeded tree; refresh it
# after re-running with the seed fixed.
scores = pd.DataFrame(grid.cv_results_)
scores[['params','mean_test_score','rank_test_score','split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [12]:
# Report the winning parameter combination and its mean cross-validated score.
# In the recorded run the best score is 0.975 with max_depth=3, min_samples_split=2;
# the results table shows min_samples_split=3 tied with it at rank 1.
print(grid.best_params_, grid.best_score_, sep='\n')

{'max_depth': 3, 'min_samples_split': 2}
0.975
