#### 붓꽃 품종 분류
- 목표 : 붓꽃의 3개 품종을 분류하기
- 데이터셋 :  내장 데이터셋
- 피쳐 : 4개
- 타겟 : 품종 1개
- 학습 : 지도학습 > 분류

In [114]:
### 1. 데이터 준비
# 모듈로딩
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [115]:
# 내장 데이터 셋 로딩
data = load_iris(as_frame=True)

In [116]:
# Bunch인스턴스 -> dict와 유사한 형태
data.keys() 

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [117]:
featureDF = data['data']
targetSR = data['target']

In [118]:
featureDF.shape, targetSR.shape

((150, 4), (150,))

In [119]:
featureDF.head(1), targetSR.head(1)

(   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                5.1               3.5                1.4               0.2,
 0    0
 Name: target, dtype: int32)

##### 2. 학습을 위한 데이터 셋 준비
- 필요한 것 = 학습용, 테스트용, 검증용 필요

In [120]:
# 학습용 & 테스트용 분리
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    stratify=targetSR)

In [121]:
# 학습용 & 검증용 분리
x_train, x_Val, y_train, y_Val = train_test_split(x_train,
                                                y_train,
                                                stratify=y_train)

In [122]:
print(f'Train DS : {x_train.shape[0]}   {x_train.shape[0]/featureDF.shape[0]}%')
print(f'Val DS : {x_Val.shape[0]}   {x_Val.shape[0]/featureDF.shape[0]:.2f}%')
print(f'test DS : {x_test.shape[0]}   {x_test.shape[0]/featureDF.shape[0]:.2f}%')

Train DS : 84   0.56%
Val DS : 28   0.19%
test DS : 38   0.25%


책 p105~107 참고 
- 회귀는 필요없고 분류는 필요 [KFold]

##### 3. 교차검증 방식

In [123]:
# 모듈 로딩
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [124]:
# 모델 인스턴스 생성
dtc_model = DecisionTreeClassifier()

In [125]:
# 3-1 KFold 기반
# 정확도 저장 리스트 
accuracys = []

In [126]:
# KFold 인스턴스 생성 [기본 k=5]
kfold=KFold()

In [127]:
## 학습
# k번 만큼 k개 데이터셋으로 학습 진행
# k등분 후 학습용 데이터셋 인덱스, 검증용 데이터셋 인덱스
for idx, (train_index, val_index) in enumerate(kfold.split(featureDF)):
 

    print(f'train_index : {train_index.tolist()}')

    # X_train, X_val 데이터셋 설정
    x_train, y_train = featureDF.iloc[train_index.tolist()], targetSR[train_index.tolist()]
    x_val, y_val = featureDF.iloc[val_index.tolist()], targetSR[val_index.tolist()]

    # 학습진행
    dtc_model.fit(x_train, y_train)

    # 평가 : 분류의 경우 score()메서드 => 정확도 반환
    train_acc = dtc_model.score(x_train, y_train)
    accuracy = dtc_model.score(x_val,y_val)

    accuracys.append([train_acc, accuracy])
    print(f'[{idx}번째] train 정확도 : {train_acc},  val 정확도: {accuracy:.2f}')
    

train_index : [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[0번째] train 정확도 : 1.0,  val 정확도: 1.00
train_index : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 12

In [128]:
accuracys

[[1.0, 1.0],
 [1.0, 1.0],
 [1.0, 0.8666666666666667],
 [1.0, 0.9333333333333333],
 [1.0, 0.8]]

In [129]:
# 평균 계산
train_mean = sum ( [value[0] for value in accuracys])/ kfold.n_splits
test_mean = sum ( [value[1] for value in accuracys])/ kfold.n_splits

print(f'Train 정확도 : {train_mean},  val 정확도 : {test_mean:.2f}')

Train 정확도 : 1.0,  val 정확도 : 0.92


#### ===> 3-2. StratifiedKFold

In [130]:
#### ===> 3-2. StratifiedKFold : 정답/ 레이블/ 타겟의 비율을 고려해서 데이터 나눔

accuracys = []
skfold = StratifiedKFold()

# k번 만큼 k개 데이터셋으로 학습 진행
# -> k등분 후 학습용 데이터셋 인덱스, 검증용 데이터셋 인덱스
for idx, (train_index, val_index) in enumerate(skfold.split(featureDF,targetSR),1):

    # X_train, X_val 데이터셋 설정
    x_train, y_train = featureDF.iloc[train_index.tolist()], targetSR[train_index.tolist()]
    x_val, y_val = featureDF.iloc[val_index.tolist()], targetSR[val_index.tolist()]

    # 학습진행
    dtc_model.fit(x_train, y_train)

    # 평가 : 분류의 경우 score()메서드 => 정확도 반환
    train_acc = dtc_model.score(x_train, y_train)
    accuracy = dtc_model.score(x_val,y_val)

    accuracys.append([train_acc, accuracy])
    print(f'[{idx}번째] train 정확도 : {train_acc},  val 정확도: {accuracy:.2f}')
    

[1번째] train 정확도 : 1.0,  val 정확도: 0.97
[2번째] train 정확도 : 1.0,  val 정확도: 0.97
[3번째] train 정확도 : 1.0,  val 정확도: 0.90
[4번째] train 정확도 : 1.0,  val 정확도: 1.00
[5번째] train 정확도 : 1.0,  val 정확도: 1.00


In [131]:
# 평균 계산
train_mean = sum ( [value[0] for value in accuracys])/ skfold.n_splits
test_mean = sum ( [value[1] for value in accuracys])/ skfold.n_splits

print(f'Train 정확도 : {train_mean},  val 정확도 : {test_mean:.2f}')

Train 정확도 : 1.0,  val 정확도 : 0.97


#### 교차 검증 및 성능 평가 동시 진행 함수 
    - => cross_val_score,cross_val_predict
    - => cross_validate
    

In [132]:
from sklearn.model_selection import cross_val_predict,cross_val_score,cross_validate

In [133]:
# 전체 DS ==> 학습용 과 테스트용 DS 분리
x_train, x_test, y_train, y_test = train_test_split(featureDF,
                                                    targetSR,
                                                    stratify=targetSR)

In [134]:
cross_val_predict(x_train, y_train)

TypeError: Cannot clone object '     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
19                 5.1               3.8                1.5               0.3
87                 6.3               2.3                4.4               1.3
12                 4.8               3.0                1.4               0.1
94                 5.6               2.7                4.2               1.3
78                 6.0               2.9                4.5               1.5
..                 ...               ...                ...               ...
21                 5.1               3.7                1.5               0.4
32                 5.2               4.1                1.5               0.1
26                 5.0               3.4                1.6               0.4
76                 6.8               2.8                4.8               1.4
145                6.7               3.0                5.2               2.3

[112 rows x 4 columns]' (type <class 'pandas.core.frame.DataFrame'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

In [None]:
cross_val_score(x_train, y_train)

In [135]:
result = cross_validate(dtc_model, x_train, y_train
                        return_train_score = True,
                        return_estimator = True)

SyntaxError: invalid syntax (2416119574.py, line 2)

In [136]:
#### cross_val_predict
predict = cross_val_predict(dtc_model, featureDF, targetSR)


In [137]:
cross_val_score(dtc_model, featureDF, targetSR)

array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])

In [138]:
result = cross_validate(dtc_model, featureDF, targetSR,
                        cv =10, # 기본 5개
                        return_train_score = True,
                        return_estimator = True)

In [139]:
result

{'fit_time': array([0.00298858, 0.00191021, 0.        , 0.        , 0.        ]),
 'score_time': array([0.00099659, 0.        , 0.        , 0.        , 0.01571178]),
 'estimator': [DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [140]:
resultDF = pd.DataFrame(result).loc[:,['test_score', 'train_score']]

In [141]:
resultDF # 3번째 4번째 모델이 가장 좋다 >>

Unnamed: 0,test_score,train_score
0,0.966667,1.0
1,0.966667,1.0
2,0.9,1.0
3,0.966667,1.0
4,1.0,1.0


In [142]:
# 최적화된 모델 추출

In [143]:
best_model = resultDF['estimator'][1]

KeyError: 'estimator'

In [None]:
테스트 데이터로 확인