<a href="https://colab.research.google.com/github/rms5010/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [92]:
import pandas as pd

# Load the wine dataset from a remote CSV file into a DataFrame
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [93]:
# Inspect the first 5 rows of wine
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [94]:
# Total number of rows (samples) in wine
print(len(wine))

6497


In [95]:
# Summary statistics of wine (per-feature mean, standard deviation, min, max, etc.)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [96]:
# Number of samples in each class (white wine vs. red wine)
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [97]:
# Features: keep only the 'alcohol', 'sugar' and 'pH' columns, converted to a NumPy array
data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
# Target: the 'class' column, converted to a NumPy array
target = wine['class'].to_numpy()

In [98]:
from sklearn.model_selection import train_test_split

# Hold out 20% of all samples as the test set (test_size=0.2);
# random_state=42 fixes the random split so it is reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [99]:
# Split the training set again: sub_input is the data actually used for fitting,
# val_input is the validation set used for tuning and evaluating the model.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [100]:
# Shapes of the training subset and of the validation set
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [101]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (seeded, otherwise default settings) on the training subset;
# fit() returns the estimator itself, so construction and fitting can be chained.
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Training accuracy
print(dt.score(sub_input, sub_target))
# Validation accuracy
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


## 교차 검증

In [102]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training set; the result is a dict holding
# per-fold 'fit_time', 'score_time' and 'test_score' arrays
scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.00898433, 0.00816822, 0.00806355, 0.00820732, 0.00784278]), 'score_time': array([0.00137258, 0.00100183, 0.00089812, 0.00089359, 0.00088143]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [103]:
import numpy as np

# Mean accuracy over the folds; 'test_score' here is the score on each held-out
# fold of train_input, not on test_input
print(np.mean(scores['test_score']))

0.855300214703487


In [104]:
from sklearn.model_selection import StratifiedKFold

# Run cross-validation with an explicitly supplied StratifiedKFold splitter
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
# Mean accuracy over the held-out folds
print(np.mean(scores['test_score']))

0.855300214703487


In [105]:
# 10-fold stratified splitter that shuffles the samples; the seed keeps the folds reproducible
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Cross-validate the decision tree model with this splitter
scores = cross_validate(dt, train_input, train_target, cv=splitter)
# Mean score over the held-out folds
print(np.mean(scores['test_score']))

0.8574181117533719


## 하이퍼파라미터 튜닝

In [106]:
from sklearn.model_selection import GridSearchCV

# Candidate values for min_impurity_decrease; the smaller the value, the more complex the tree becomes
params = {'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [107]:
# Grid search over params with a decision tree as the estimator;
# n_jobs=-1 uses every available CPU core for parallel processing
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [108]:
# Run the grid search on the training set
gs.fit(train_input, train_target)

In [109]:
# Take the best model found by the grid search and score it on the training set
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [110]:
# Parameter setting selected by the grid search
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [111]:
# Mean cross-validation score of each candidate in the grid
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [112]:
# Locate the candidate with the highest mean cross-validation score by hand
# and print its parameters
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [113]:
# Larger grid: min_impurity_decrease candidates come from a NumPy array (np.arange),
# max_depth and min_samples_split from integer ranges
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }

In [114]:
# Grid search over the larger grid, using all CPU cores, fitted on the training set
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [115]:
# Best hyperparameter combination found by the grid search
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [116]:
# Highest of the mean cross-validation scores
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [117]:
# Mean fit time of each candidate during cross-validation
gs.cv_results_['mean_fit_time']

array([0.01632571, 0.0159234 , 0.01795468, ..., 0.00652347, 0.00663581,
       0.0065546 ])

### 랜덤 서치

In [118]:
from scipy.stats import uniform, randint

In [119]:
# Uniform sampling of integers: randint(0, 10) draws from 0..9; take 10 samples
rgen = randint(0, 10)
rgen.rvs(10)

array([7, 6, 9, 3, 8, 5, 9, 6, 0, 9])

In [120]:
# Draw 1000 samples and show each distinct value together with its frequency
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 86, 104, 104,  99,  91,  96,  95, 107, 111, 107]))

In [121]:
# Uniform sampling of real numbers between 0 and 1; take 10 samples
ugen = uniform(0, 1)
ugen.rvs(10)

array([0.94438885, 0.70444166, 0.27649947, 0.64021977, 0.4357775 ,
       0.5206377 , 0.22785547, 0.11047532, 0.92060014, 0.77707058])

In [122]:
# Search space for the randomized search: only distributions (ranges) are given,
# not explicit candidate lists.
# NOTE(review): scipy's uniform(loc, scale) covers [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011] here — confirm this is the intended range.
params = {'min_impurity_decrease': uniform(0.0001, 0.001),
          'max_depth': randint(20, 50),
          'min_samples_split': randint(2, 25),
          'min_samples_leaf': randint(1, 25),
          }

In [123]:
from sklearn.model_selection import RandomizedSearchCV # randomized search

# Sample n_iter=100 parameter settings from params and cross-validate each one,
# using all CPU cores; random_state fixes the sampling
rs = RandomizedSearchCV(DecisionTreeClassifier(random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

In [124]:
# Best parameter values found by the randomized search
print(rs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [125]:
# Highest mean cross-validation score among the sampled candidates
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [126]:
# Evaluate the best model from the randomized search on the held-out test set
dt = rs.best_estimator_

print(dt.score(test_input, test_target))

0.86


In [127]:
# Mean fit (training) time of each sampled candidate
rs.cv_results_['mean_fit_time']

array([0.0067275 , 0.00697408, 0.00781302, 0.00789714, 0.00728765,
       0.00910039, 0.00721598, 0.0074039 , 0.00727444, 0.0069262 ,
       0.00668588, 0.00691481, 0.00784502, 0.00713649, 0.00662732,
       0.00725541, 0.00688996, 0.00757303, 0.0077795 , 0.00812583,
       0.00983562, 0.00692105, 0.00808167, 0.0069088 , 0.0072794 ,
       0.00781689, 0.00671988, 0.00687442, 0.0067647 , 0.00655713,
       0.00652289, 0.00669055, 0.006392  , 0.0073905 , 0.0076654 ,
       0.00685906, 0.00656371, 0.0081697 , 0.00648179, 0.00663781,
       0.0066905 , 0.00713243, 0.00646272, 0.00772071, 0.00767417,
       0.00669556, 0.00720272, 0.00669327, 0.00740309, 0.00679388,
       0.0077868 , 0.00736275, 0.00637918, 0.00632577, 0.00647874,
       0.00684233, 0.0065917 , 0.00803528, 0.00659547, 0.00659728,
       0.00869265, 0.00656395, 0.00658836, 0.00731902, 0.00623212,
       0.01101899, 0.00644608, 0.00675588, 0.00738502, 0.00697093,
       0.00637655, 0.00773144, 0.00694013, 0.0063889 , 0.00665

In [128]:
# Average of the mean fit times over all sampled candidates
print(np.mean(rs.cv_results_['mean_fit_time']))

0.007187811374664307


### 결정트리 분할 옵션 변경

In [129]:
# Same randomized search as before (same params, n_iter and seed), except that the
# decision tree is created with splitter='random'
rs2 = RandomizedSearchCV(DecisionTreeClassifier(splitter='random', random_state=42), params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs2.fit(train_input, train_target)

In [130]:
# Best model of the splitter='random' search
dt = rs2.best_estimator_

# Report, in order: the best parameters, the best mean cross-validation score,
# and the best model's accuracy on the test set
print(rs2.best_params_)
print(np.max(rs2.cv_results_['mean_test_score']))
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [131]:
# Mean fit time of each sampled candidate in the splitter='random' search
rs2.cv_results_['mean_fit_time']

array([0.00328364, 0.00336056, 0.00350776, 0.00391111, 0.00335665,
       0.00336261, 0.00295291, 0.00290003, 0.00457416, 0.00309525,
       0.00304012, 0.00405645, 0.00411787, 0.00512128, 0.00310154,
       0.0040946 , 0.00633864, 0.00320177, 0.00467911, 0.00310917,
       0.00317039, 0.0029603 , 0.00305595, 0.00289168, 0.00292859,
       0.00324998, 0.00310287, 0.00317698, 0.00297642, 0.00299683,
       0.00285845, 0.00313468, 0.0027595 , 0.00421014, 0.00325584,
       0.00306635, 0.00299129, 0.00462012, 0.00296021, 0.00290475,
       0.00292234, 0.00322728, 0.00304193, 0.00296779, 0.00308557,
       0.00372334, 0.00300493, 0.00299897, 0.00299501, 0.00307364,
       0.0042274 , 0.00293813, 0.00295711, 0.00277901, 0.00290551,
       0.00307441, 0.00325894, 0.00314927, 0.00295892, 0.00290594,
       0.00326829, 0.00281734, 0.00289288, 0.00421863, 0.00287604,
       0.00276551, 0.00274653, 0.00323915, 0.00296588, 0.00352225,
       0.00302   , 0.00291781, 0.00296836, 0.00290136, 0.00361

In [132]:
# Average of the mean fit times over all candidates of the splitter='random' search
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.003265621185302734


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.

- 결정 트리 분할 방식 변경 (splitter='best') → (splitter='random')

- 분할 방식이 바뀌면서 작업 속도가 빨라짐