#### 붓꽃 품종 분류
- 목표 : 붓꽃의 3개 품종을 분류하기
- 데이터셋 :  내장 데이터셋
- 피쳐 : 4개
- 타겟 : 품종 1개
- 학습 : 지도학습 > 분류

In [80]:
### 1. 데이터 준비
# 모듈로딩
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [81]:
# Load the built-in iris dataset; as_frame=True asks for pandas objects
data = load_iris(as_frame=True)

In [82]:
# load_iris returns a Bunch instance, which behaves much like a dict
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [83]:
# Pull the feature table and the target series out of the Bunch
featureDF, targetSR = data['data'], data['target']

In [84]:
# Shapes of the feature table and of the target series
featureDF.shape, targetSR.shape

((150, 4), (150,))

In [85]:
# Peek at the first feature row
featureDF.head(1)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2


In [86]:
# Peek at the first target value
targetSR.head(1)

0    0
Name: target, dtype: int32

##### 2. 학습을 위한 데이터 셋 준비
- 필요한 것 : 학습용, 검증용, 테스트용 데이터셋

In [87]:
# Split off the test set from the full data.
# stratify=targetSR keeps the species ratio balanced in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    featureDF, targetSR, stratify=targetSR
)

In [88]:
# Split the remaining training data again into training and validation parts,
# stratified on the training labels.
x_train, x_Val, y_train, y_Val = train_test_split(
    x_train, y_train, stratify=y_train
)

In [89]:
# Report each split's row count and its share of the full dataset.
# ':.2%' multiplies the ratio by 100 and appends '%', so the printed number is
# a real percentage (the raw ratio used to be printed next to a '%' sign, and
# the Train line had no rounding at all).
total_rows = featureDF.shape[0]
for split_name, split_df in (('Train', x_train), ('Val', x_Val), ('test', x_test)):
    n_rows = split_df.shape[0]
    print(f'{split_name} DS : {n_rows}   {n_rows / total_rows:.2%}')

Train DS : 84   0.56%
Val DS : 28   0.19%
test DS : 38   0.25%


책 p105~107 참고
- stratify => 회귀에는 필요 없고 분류에는 필요
  >> 한 품종에서만 데이터를 뽑아버리면 다른 품종에는 맞지 않아 부정확
  >> stratify를 사용해서 품종 비율을 맞추어 데이터를 골고루 나눔

##### 3. 교차검증 방식

In [90]:
# Load modules. KFold splits purely by row position; StratifiedKFold also uses the target to keep the class ratio in every fold.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [91]:
# Create the model instance (no hyperparameters passed, so defaults apply)
dtc_model = DecisionTreeClassifier()

In [92]:
# 3-1 KFold based
# List that collects [train accuracy, validation accuracy] for each fold
accuracys = []

In [93]:
# Create the KFold splitter; 5 folds is also what the no-argument call uses
kfold = KFold(n_splits=5)

In [94]:
## Training
# Fit and evaluate the model once per fold. For each fold, kfold.split yields
# the row positions used for training and the row positions used for
# validation.
for idx, (train_index, val_index) in enumerate(kfold.split(featureDF)):

    print(f'train_index : {train_index.tolist()}')

    # The indices are positions, not labels, so select rows with .iloc on both
    # the features and the target. (Plain targetSR[...] is a label lookup that
    # only matches while the index happens to be 0..n-1.)
    # Fold-local names are used so the hold-out x_train / y_train split made
    # earlier is not overwritten by this loop.
    fold_x_train, fold_y_train = featureDF.iloc[train_index], targetSR.iloc[train_index]
    fold_x_val, fold_y_val = featureDF.iloc[val_index], targetSR.iloc[val_index]

    # Fit on this fold's training part
    dtc_model.fit(fold_x_train, fold_y_train)

    # Evaluate: for a classifier, score() returns the accuracy
    train_acc = dtc_model.score(fold_x_train, fold_y_train)
    accuracy = dtc_model.score(fold_x_val, fold_y_val)

    accuracys.append([train_acc, accuracy])
    print(f'[{idx}번째] train 정확도 : {train_acc},  val 정확도: {accuracy:.2f}')
    

train_index : [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[0번째] train 정확도 : 1.0,  val 정확도: 1.00
train_index : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 12

In [95]:
# Show the recorded [train accuracy, validation accuracy] pairs, one per fold
accuracys

[[1.0, 1.0],
 [1.0, 1.0],
 [1.0, 0.8666666666666667],
 [1.0, 0.9333333333333333],
 [1.0, 0.7666666666666667]]

=> 2, 3, 4번 폴드: train 정확도는 1.0인데 val 정확도가 더 낮음 → 과대적합! (KFold는 데이터를 섞지 않고 순서대로 나누므로 폴드마다 품종 비율이 달라지는 영향도 있음)

In [96]:
# Mean accuracy over the recorded folds.
# Divide by the number of entries actually stored in `accuracys` rather than
# by kfold.n_splits: if the training cell is re-run without resetting the list,
# the list holds more than n_splits entries and the old divisor would give a
# wrong (too large) mean.
fold_count = len(accuracys)
train_mean = sum(train_score for train_score, _ in accuracys) / fold_count
test_mean = sum(val_score for _, val_score in accuracys) / fold_count

print(f'Train 정확도 : {train_mean},  val 정확도 : {test_mean:.2f}')

Train 정확도 : 1.0,  val 정확도 : 0.91


#### ===> 3-2. StratifiedKFold

- 정답/ 레이블/ 타겟의 비율을 고려해서 데이터 나눔

In [97]:
# Start a fresh list for the per-fold accuracies
accuracys = []

# Create the StratifiedKFold splitter; 5 folds is also the default
skfold = StratifiedKFold(n_splits=5)

In [98]:
# Fit and evaluate the model once per fold.
# skfold.split is given the target as well as the features and yields the
# training and validation row positions for each fold;
# enumerate(..., 1) numbers the folds starting from 1.
for idx, (train_index, val_index) in enumerate(skfold.split(featureDF, targetSR), 1):

    print(f'train_index : { train_index.tolist()}')

    # The indices are positions, not labels, so select rows with .iloc on both
    # the features and the target.
    fold_x_train, fold_y_train = featureDF.iloc[train_index], targetSR.iloc[train_index]
    fold_x_val, fold_y_val = featureDF.iloc[val_index], targetSR.iloc[val_index]

    # Fit on this fold's training part
    dtc_model.fit(fold_x_train, fold_y_train)

    # Evaluate: for a classifier, score() returns the accuracy
    train_accuracy = dtc_model.score(fold_x_train, fold_y_train)
    accuracy = dtc_model.score(fold_x_val, fold_y_val)

    # Record this fold's own training accuracy (the previous code appended
    # `train_acc`, a leftover variable from the earlier KFold loop).
    accuracys.append([train_accuracy, accuracy])
    print(f'[{idx}번째] train 정확도 : {train_accuracy},  val 정확도: {accuracy:.2f}')
    

train_index : [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[1번째] train 정확도 : 1.0,  val 정확도: 0.97
train_index : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 13

In [99]:
# Mean accuracy over the recorded folds.
# Divide by the number of entries actually stored in `accuracys` rather than
# by skfold.n_splits, so the mean stays correct even if the training cell was
# run more than once without resetting the list.
fold_count = len(accuracys)
train_mean = sum(train_score for train_score, _ in accuracys) / fold_count
test_mean = sum(val_score for _, val_score in accuracys) / fold_count

print(f'Train 정확도 : {train_mean},  val 정확도 : {test_mean:.2f}')

Train 정확도 : 1.0,  val 정확도 : 0.97


#### 교차 검증 및 성능 평가 동시 진행 함수 
 - cross_val_score,cross_val_predict
 - cross_validate
    

In [100]:
from sklearn.model_selection import cross_val_predict,cross_val_score,cross_validate

In [101]:
### cross_val_predict
# Cross-validated predictions with 3 folds; the result holds one predicted
# label per input row.
predict = cross_val_predict(estimator=dtc_model, X=featureDF, y=targetSR, cv=3)

In [102]:
# Print the predicted labels right after the 'predict:' tag (no separator)
print('predict:', predict, sep='')

predict:[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [103]:
### cross_val_score
# Returns one validation score per fold
cross_val_score(estimator=dtc_model, X=featureDF, y=targetSR)

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

In [104]:
### cross_validate
# Basic call: the result dict holds fit_time, score_time and test_score
result = cross_validate(estimator=dtc_model, X=featureDF, y=targetSR)

In [105]:
# Show the cross_validate result dict
result

{'fit_time': array([0.00299048, 0.00199437, 0.        , 0.0028739 , 0.        ]),
 'score_time': array([0.00099587, 0.00128961, 0.0048337 , 0.00108242, 0.        ]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])}

In [106]:
### cross_validate
# Same call, additionally requesting the training score of every fold
result = cross_validate(
    estimator=dtc_model,
    X=featureDF,
    y=targetSR,
    return_train_score=True,
)


In [107]:
# Show the result dict, now including train_score
result

{'fit_time': array([0.00316787, 0.00312853, 0.00099421, 0.00100803, 0.00099683]),
 'score_time': array([0.        , 0.0009973 , 0.00099754, 0.00099611, 0.00199318]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [108]:
### cross_validate
# Also keep the fitted estimator of every fold so one can be reused afterwards
result = cross_validate(
    estimator=dtc_model,
    X=featureDF,
    y=targetSR,
    return_train_score=True,
    return_estimator=True,
)

In [109]:
# Show the result dict, now including the per-fold estimators
result

{'fit_time': array([0.        , 0.01050282, 0.00201726, 0.00299072, 0.00199389]),
 'score_time': array([0.        , 0.        , 0.00199366, 0.00199223, 0.00149465]),
 'estimator': [DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

In [110]:
# Tabulate the result and keep only the two score columns
score_columns = ['test_score', 'train_score']
resultDF = pd.DataFrame(result)[score_columns]

In [111]:
resultDF  # in this run the models at index 3 and 4 have the highest test_score

Unnamed: 0,test_score,train_score
0,0.966667,1.0
1,0.966667,1.0
2,0.9,1.0
3,1.0,1.0
4,1.0,1.0


##### 최적화된 모델 추출
return_estimator=True로 지정하면 result['estimator'] 안에 폴드별로 학습된 모델이 담겨 있어 꺼내 쓸 수 있음!  
test_score를 보고 최적의 모델을 꺼내 쓸 수 있음!

In [112]:
# Pick the estimator with the highest validation score instead of a hard-coded
# position: the fold scores differ between runs of the same call, so a fixed
# index is not guaranteed to point at the best model.
# argmax returns the first position when several folds share the top score.
best_index = result['test_score'].argmax()
bestmodel = result['estimator'][best_index]

In [113]:
# Predict the species of the held-out test rows with the selected model.
# NOTE(review): the cross_validate call that produced this model was given the
# full featureDF, and x_test was split off from that same featureDF, so the
# model may already have been trained on some of these rows.
bestmodel.predict(x_test)

array([2, 0, 1, 2, 0, 0, 1, 2, 0, 0, 2, 0, 1, 2, 2, 1, 2, 1, 1, 0, 2, 2,
       2, 1, 1, 0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 1, 1, 2])

In [114]:
# Accuracy of the selected model on the held-out test set.
# NOTE(review): cross_validate was run on the full featureDF, which contains
# the x_test rows, so this score is likely optimistic - consider
# cross-validating on the training split only before scoring on x_test.
bestmodel.score(x_test,y_test)

1.0