<a href="https://colab.research.google.com/github/rjsdn2308/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [None]:
import pandas as pd

# Download the wine dataset (a hosted CSV) straight into a DataFrame.
WINE_CSV_URL = 'https://bit.ly/wine-date'
wine = pd.read_csv(WINE_CSV_URL)

### 문제 1 : wine 데이터 확인

In [None]:
# Peek at the first five rows of the wine data.
wine.head(n=5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [None]:
# Report only the total number of rows in `wine`.
# Passing the DataFrame itself to print() would dump the whole table as plain
# text and bury the row count at the very end of the output.
print(len(wine))

      alcohol  sugar    pH  class
0         9.4    1.9  3.51    0.0
1         9.8    2.6  3.20    0.0
2         9.8    2.3  3.26    0.0
3         9.8    1.9  3.16    0.0
4         9.4    1.9  3.51    0.0
...       ...    ...   ...    ...
6492     11.2    1.6  3.27    1.0
6493      9.6    8.0  3.15    1.0
6494      9.4    1.2  2.99    1.0
6495     12.8    1.1  3.34    1.0
6496     11.8    0.8  3.26    1.0

[6497 rows x 4 columns] 6497


In [None]:
# Summary statistics for every column (mean, standard deviation, min, max, ...).
wine_stats = wine.describe()
wine_stats

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [None]:
# Count the white-wine and red-wine samples via the `class` label column.
class_counts = wine['class'].value_counts()
class_counts

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [None]:
# Separate the three feature columns from the label column, both as NumPy arrays.
feature_columns = ['alcohol', 'sugar', 'pH']
data = wine[feature_columns].to_numpy()
target = wine['class'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed makes the
# split repeatable across runs.
outer_split = train_test_split(data, target, test_size=0.2, random_state=42)
train_input, test_input, train_target, test_target = outer_split

In [None]:
# Split the training portion again: 20% of it becomes the validation set,
# the rest is the sub-training set actually used for fitting.
inner_split = train_test_split(
    train_input, train_target, test_size=0.2, random_state=42)
sub_input, val_input, sub_target, val_target = inner_split

In [None]:
# Show the sizes of the sub-training and validation splits side by side.
sub_shape, val_shape = sub_input.shape, val_input.shape
print(sub_shape, val_shape)

(4157, 3) (1040, 3)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree (only the seed is set) on the sub-training split, then
# print its accuracy on that split followed by the validation split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)

for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))

0.9971133028626413
0.864423076923077


## 교차 검증

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the whole training set with the default `cv`.
# The result is a dict holding fit times, score times and per-fold test scores.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.00862312, 0.00969887, 0.00864172, 0.00833845, 0.00990725]), 'score_time': array([0.0013485 , 0.0012753 , 0.00123119, 0.00117111, 0.00144601]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [None]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
fold_scores = scores['test_score']
print(fold_scores.mean())

0.855300214703487


In [None]:
from sklearn.model_selection import StratifiedKFold

# Repeat the cross-validation with an explicitly constructed StratifiedKFold
# (default settings) passed as the splitter.
stratified_cv = StratifiedKFold()
scores = cross_validate(dt, train_input, train_target, cv=stratified_cv)
print(scores['test_score'].mean())

0.855300214703487


In [None]:
# 10-fold stratified cross-validation with shuffling; the seed keeps the
# fold assignment repeatable.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
mean_score = scores['test_score'].mean()
print(mean_score)

0.8574181117533719


## 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the single hyperparameter searched in the first grid.
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
)

In [None]:
# Grid search over `params` using a seeded decision tree as the base model;
# n_jobs=-1 lets the search run its fits in parallel.
base_tree = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(base_tree, param_grid=params, n_jobs=-1)

In [None]:
# Run the grid search on the training set (cross-validation happens inside).
gs.fit(X=train_input, y=train_target)

In [None]:
# Take the best tree found by the search and report its accuracy on the
# training set.
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [None]:
# Hyperparameter combination selected by the grid search.
best_params = gs.best_params_
print(best_params)

{'min_impurity_decrease': 0.0001}


In [None]:
# Mean cross-validation score of each candidate, in grid order.
mean_scores = gs.cv_results_['mean_test_score']
print(mean_scores)

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [None]:
# Locate the best candidate by hand: index of the highest mean score, then
# look up the parameters stored at that index.
cv_results = gs.cv_results_
best_index = cv_results['mean_test_score'].argmax()
print(cv_results['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [None]:
# Larger grid: three hyperparameters searched jointly.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [None]:
# Rebuild the grid search with the larger grid and fit it on the training set.
tree_for_grid = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(tree_for_grid, param_grid=params, n_jobs=-1)
gs.fit(train_input, train_target)

In [None]:
# Best combination found in the larger grid.
grid_best_params = gs.best_params_
print(grid_best_params)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [None]:
# Highest mean cross-validation score among all grid candidates.
grid_mean_scores = gs.cv_results_['mean_test_score']
print(grid_mean_scores.max())

0.8683865773302731


In [None]:
# Show the mean fit time recorded for each candidate during cross-validation.
grid_fit_times = gs.cv_results_['mean_fit_time']
grid_fit_times

array([0.01226125, 0.01055703, 0.00879521, ..., 0.00689254, 0.01136703,
       0.01111293])

### 랜덤 서치

In [None]:
from scipy.stats import uniform, randint

In [None]:
# Uniform sampling of integers: randint(low, high) draws from low..high-1,
# so this yields values 0 through 9.
rgen = randint(low=0, high=10)
rgen.rvs(size=10)

array([7, 1, 5, 9, 9, 8, 0, 6, 2, 9])

In [None]:
# Draw 1000 integers and show each distinct value together with its frequency.
samples = rgen.rvs(1000)
np.unique(samples, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 99, 114,  84,  93, 107, 101,  98,  93, 109, 102]))

In [None]:
# Continuous uniform distribution on [0, 1]; draw ten sample values.
ugen = uniform(loc=0, scale=1)
ugen.rvs(size=10)

array([0.32715719, 0.39121332, 0.94679957, 0.1165579 , 0.41658759,
       0.93253991, 0.3654998 , 0.56467226, 0.05150648, 0.60580049])

In [None]:
# Sampling distributions for the randomized search.
# NOTE: scipy's uniform(loc, scale) covers [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011]; randint(low, high)
# excludes `high`.
params = dict(
    min_impurity_decrease=uniform(loc=0.0001, scale=0.001),
    max_depth=randint(low=20, high=50),
    min_samples_split=randint(low=2, high=25),
    min_samples_leaf=randint(low=1, high=25),
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: evaluate 100 parameter settings sampled from `params`,
# with the sampling itself seeded for repeatability.
tree_for_random_search = DecisionTreeClassifier(random_state=42)
rs = RandomizedSearchCV(tree_for_random_search, params,
                        n_iter=100, n_jobs=-1, random_state=42)
rs.fit(train_input, train_target)

In [None]:
# Parameter setting selected by the randomized search.
random_best_params = rs.best_params_
print(random_best_params)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [None]:
# Highest mean cross-validation score among the sampled candidates.
random_mean_scores = rs.cv_results_['mean_test_score']
print(random_mean_scores.max())

0.8695428296438884


In [None]:
# Evaluate the best tree from the randomized search on the held-out test set.
dt = rs.best_estimator_

test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.86


In [None]:
# Mean fit time recorded for each sampled candidate.
rs_fit_times = rs.cv_results_['mean_fit_time']
rs_fit_times

array([0.0302949 , 0.03161325, 0.02659397, 0.0212039 , 0.0132566 ,
       0.01628833, 0.00904408, 0.0092598 , 0.01473298, 0.02466807,
       0.01551538, 0.0066103 , 0.01471825, 0.01142607, 0.01390157,
       0.01290131, 0.01369004, 0.01523304, 0.01751723, 0.01596427,
       0.01325254, 0.00729432, 0.01317983, 0.02658486, 0.01459808,
       0.02117772, 0.02386112, 0.0114243 , 0.0140327 , 0.0164237 ,
       0.016329  , 0.01443052, 0.01947365, 0.01694746, 0.01256905,
       0.00758657, 0.00695734, 0.0148159 , 0.01848526, 0.01325288,
       0.00782542, 0.00727901, 0.01067586, 0.01557226, 0.009586  ,
       0.01417532, 0.00766964, 0.0100338 , 0.01052237, 0.01554008,
       0.00980129, 0.00783691, 0.01164298, 0.01513834, 0.00826211,
       0.00719976, 0.00840406, 0.01893163, 0.01616688, 0.00972075,
       0.01562099, 0.0068594 , 0.00679379, 0.01282935, 0.00695753,
       0.00977449, 0.00691614, 0.02300448, 0.01488643, 0.01351075,
       0.00687575, 0.00857301, 0.00824137, 0.01322923, 0.00926

In [None]:
# Average of the per-candidate mean fit times for the randomized search.
rs_mean_fit_times = rs.cv_results_['mean_fit_time']
print(rs_mean_fit_times.mean())

0.013076673507690434


### 결정트리 분할 옵션 변경

In [None]:
# Same randomized search as before, except the base tree is created with
# splitter='random'.
random_split_tree = DecisionTreeClassifier(splitter='random', random_state=42)
rs2 = RandomizedSearchCV(random_split_tree, params,
                         n_iter=100, n_jobs=-1, random_state=42)
rs2.fit(train_input, train_target)

In [None]:
# Summarize the splitter='random' search: chosen parameters, best mean
# cross-validation score, then accuracy of the best tree on the test set.
rs2_results = rs2.cv_results_
print(rs2.best_params_)
print(rs2_results['mean_test_score'].max())

dt = rs2.best_estimator_
rs2_test_accuracy = dt.score(test_input, test_target)
print(rs2_test_accuracy)

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [None]:
# Mean fit time recorded for each candidate of the splitter='random' search.
rs2_fit_times = rs2.cv_results_['mean_fit_time']
rs2_fit_times

array([0.00548592, 0.00502524, 0.00879078, 0.00509176, 0.00321302,
       0.00362606, 0.00457344, 0.00527225, 0.00653648, 0.00486279,
       0.00605574, 0.00357766, 0.00334029, 0.003193  , 0.00687747,
       0.00626926, 0.00449667, 0.00909214, 0.00566039, 0.00788536,
       0.00410299, 0.00615487, 0.00806122, 0.00331979, 0.00475416,
       0.00365248, 0.0043283 , 0.00627308, 0.00816278, 0.00777731,
       0.00501804, 0.0069097 , 0.00898447, 0.00488858, 0.00803556,
       0.007512  , 0.00849857, 0.00884848, 0.00586348, 0.00320425,
       0.00352254, 0.00686021, 0.00680594, 0.00410166, 0.00827084,
       0.00880685, 0.00512381, 0.00330462, 0.00328445, 0.0057219 ,
       0.00698185, 0.00630255, 0.00318284, 0.00295844, 0.00340943,
       0.00327654, 0.00334382, 0.00346017, 0.00375032, 0.00338097,
       0.00358143, 0.0031354 , 0.00407801, 0.00333838, 0.0032505 ,
       0.00327792, 0.00295839, 0.00340557, 0.00317917, 0.00318685,
       0.00330358, 0.00361691, 0.0032392 , 0.00313272, 0.00325

In [None]:
# Average of the per-candidate mean fit times for the splitter='random' search.
rs2_mean_fit_times = rs2.cv_results_['mean_fit_time']
print(rs2_mean_fit_times.mean())

0.004693257331848144


문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.
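- 차이점 1 (코드): 결정 트리를 만들 때 `splitter='random'` 옵션을 사용합니다. 즉, 노드 분할 전략이 기본값 `'best'`에서 `'random'`으로 바뀝니다.
- 차이점 2 (결과): random splitter는 빠르지만 정확도가 낮은 경향이 있습니다. 평균 학습 시간(fit time)은 약 0.0131초에서 약 0.0047초로 줄었지만, 모델 성능(정확도)은 교차 검증 평균 약 0.870 → 약 0.846, 테스트 세트 0.86 → 약 0.787로 낮아졌습니다.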