<a href="https://colab.research.google.com/github/rbdus0715/kaggle/blob/main/study/sklearn/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **K-Fold**
    모델의 일반화 능력을 더 정확하게 측정하는 데 도움
    모든 데이터를 학습과 검증에 번갈아 사용하므로 데이터를 최대한 활용할 수 있음
    폴드 수만큼 학습을 반복하므로 계산 비용과 시간이 더 소요됨
    데이터의 불균형 : 데이터셋의 클래스 분포가 불균형할 경우 폴드마다 클래스 비율이 달라져 폴드별 모델 성능이 크게 다를 수 있음

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

In [2]:
# Load iris and keep the feature matrix and the target vector side by side.
iris = load_iris()
features, label = iris.data, iris.target
dt_clf = DecisionTreeClassifier(random_state=156)

kfold = KFold(n_splits=5)
cv_accuracy = []  # one accuracy value per fold
n_iter = 0        # number of folds processed so far

# kfold.split() takes the data to be split and yields, for every fold,
# the row indices of the training part and of the test part.
for n_iter, (train_index, test_index) in enumerate(kfold.split(features), start=1):
    # Index arrays select the rows that belong to this fold.
    X_train, y_train = features[train_index], label[train_index]
    X_test, y_test = features[test_index], label[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # accuracy_score(ground truth, prediction), rounded to 4 decimals per fold
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(accuracy)

print(np.mean(cv_accuracy))

0.9


## **Stratified K fold**
    각 폴드가 전체 데이터와 비슷한 클래스 비율을 유지하도록 분할하므로, 불균형한 클래스 분포에서도 강건한 성능 평가 가능
    샘플링 오버헤드 : 데이터 샘플링에서 발생하는 추가적 비용 혹은 부담
        - 추가적인 계산 비용
        - 데이터 중복

In [3]:
from sklearn.model_selection import StratifiedKFold

In [4]:
# Same evaluation loop as the plain K-Fold cell, but with a stratified
# 3-fold splitter. Reuses dt_clf, features and label from the earlier cell.
skf = StratifiedKFold(n_splits=3)
cv_accuracy = []
n_iter = 0

# split() receives the labels as well as the features here.
for n_iter, (train_index, test_index) in enumerate(skf.split(features, label), start=1):
    X_train, y_train = features[train_index], label[train_index]
    X_test, y_test = features[test_index], label[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)

    # Per-fold accuracy, rounded to 4 decimals before it is collected.
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(accuracy)

mean_accuracy = np.mean(cv_accuracy)
print(np.round(mean_accuracy, 4))

0.9667


## **Cross-validation 간편하게 하기**

In [19]:
from sklearn.model_selection import cross_val_score, cross_validate

In [20]:
# One call replaces the manual split / fit / predict / score loop:
# returns one accuracy value per fold (3 folds) for dt_clf on the iris data.
scores = cross_val_score(
    estimator=dt_clf,
    X=features,
    y=label,
    scoring='accuracy',
    cv=3,
)
print(np.round(scores.mean(), 4))

0.9667


## **GridSearchCV - 교차 검증 + 하이퍼 파라미터 튜닝**

In [21]:
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Hold out 20% of iris as a test set; the grid search below only ever sees
# the remaining 80% (X_train / y_train).
iris_data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.data, iris_data.target, test_size=0.2, random_state=121
)

# Seed the tree so the grid-search scores are reproducible on
# Restart & Run All. 156 is the same seed used for the tree in the
# earlier cross-validation cells.
dtree = DecisionTreeClassifier(random_state=156)

# Candidate hyperparameters: 3 depths x 2 split thresholds = 6 combinations.
parameters = {
    'max_depth' : [1, 2, 3],
    'min_samples_split' : [2, 3]
}

# Each combination is scored with 3-fold cross-validation. refit=True
# retrains the best combination afterwards, which is what makes
# best_estimator_ available to the cells below.
grid_dtree = GridSearchCV(dtree, param_grid=parameters, cv=3, refit=True)
grid_dtree.fit(X_train, y_train)

In [22]:
# Turn the grid-search results into a DataFrame and show only the columns
# of interest: the parameter set, its mean score and rank, and the score
# on each of the three CV splits.
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [23]:
# Best parameter combination found by the grid search and its mean CV score.
print(grid_dtree.best_params_)
print(grid_dtree.best_score_)

# Evaluate the refit best model on the held-out test split.
estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=pred))

{'max_depth': 3, 'min_samples_split': 2}
0.975
0.9666666666666667


In [24]:
# Repeats the hold-out evaluation from the previous cell; here the accuracy
# is the cell's last expression, so it is displayed rather than printed.
estimator = grid_dtree.best_estimator_
pred = estimator.predict(X_test)

accuracy_score(y_true=y_test, y_pred=pred)

0.9666666666666667