<a href="https://colab.research.google.com/github/rjsdn2308/machine-learning-practice/blob/main/250416_cross_validation_hyperparameter_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 교차 검증과 그리드 서치

## 검증 데이터셋

In [None]:
import pandas as pd

# Load the wine dataset from the remote CSV into a pandas DataFrame.
wine = pd.read_csv('https://bit.ly/wine-date')

### 문제 1 : wine 데이터 확인

In [None]:
# Show the first 5 rows of the wine DataFrame.
# (The exercise placeholder `wine.?` is filled in; it was not valid Python.)
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [None]:
# Print the dimensions of wine as (rows, columns); the first entry is the total row count.
# (The exercise placeholder `wine.?` is filled in; it was a SyntaxError inside print().)
print(wine.shape)

(6497, 4)


In [None]:
# Show summary statistics per column (count, mean, standard deviation, min, quartiles, max).
# (The exercise placeholder `wine.?` is filled in; it was not valid Python.)
wine.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


In [None]:
# Count how many white-wine and red-wine rows there are (one count per 'class' value).
wine['class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
1.0,4898
0.0,1599


### 데이터셋 분류

In [None]:
# Feature matrix: the three measurement columns as a NumPy array.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
# Label vector: the 'class' column as a NumPy array.
target = wine.loc[:, 'class'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the split reproducible.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Split the training portion once more: 20% becomes the validation set,
# the remainder is the sub-training set used for fitting.
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shapes of the sub-training and validation feature arrays.
print(sub_input.shape, val_input.shape)

(4157, 3) (1040, 3)


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a seeded decision tree on the sub-training set (fit() returns the estimator itself).
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Score on the data the tree was fitted on, then on the held-out validation split.
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

0.9971133028626413
0.864423076923077


## 교차 검증

In [None]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the full training set with the default splitting strategy
# and print the returned dict (fit_time, score_time and test_score arrays).
scores = cross_validate(dt, train_input, train_target)
print(scores)

{'fit_time': array([0.00972986, 0.0090487 , 0.00908279, 0.00876617, 0.01264167]), 'score_time': array([0.00177312, 0.00160956, 0.00150371, 0.00154233, 0.00270724]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [None]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
print(np.mean(scores['test_score']))

0.855300214703487


In [None]:
from sklearn.model_selection import StratifiedKFold

# Repeat the cross-validation, passing a default-configured StratifiedKFold as the splitter.
scores = cross_validate(dt, train_input, train_target, cv=StratifiedKFold())
# Mean of the per-fold test scores.
print(scores['test_score'].mean())

0.855300214703487


In [None]:
# 10-fold stratified splitter that shuffles the samples first; seeded for reproducibility.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(dt, train_input, train_target, cv=splitter)
# Mean of the ten per-fold test scores.
print(scores['test_score'].mean())

0.8574181117533719


## 하이퍼파라미터 튜닝

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values of min_impurity_decrease to try in the grid search.
params = dict(
    min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
)

In [None]:
# Build a grid search over `params` around a seeded decision tree; n_jobs=-1 is passed for parallelism.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)

In [None]:
# Run the grid search on the full training set.
gs.fit(train_input, train_target)

In [None]:
# Rebind dt to the best estimator found by the grid search and score it on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [None]:
# Print the parameter combination selected by the grid search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [None]:
# Print the mean cross-validation test score of every candidate in the grid.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [None]:
# Find the index of the highest mean test score and print the parameter set stored at that index.
best_index = np.argmax(gs.cv_results_['mean_test_score'])
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [None]:
# Larger grid over three tree hyperparameters.
# min_impurity_decrease: 0.0001 up to (but excluding) 0.001 in steps of 0.0001.
# max_depth: integers 5 through 19.
# min_samples_split: 2, 12, 22, ... below 100.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [None]:
# Rebuild the grid search with the larger `params` grid and run it on the full training set.
gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [None]:
# Print the parameter combination selected by the larger grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': np.float64(0.0004), 'min_samples_split': 12}


In [None]:
# Print the highest mean cross-validation test score among all candidates.
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [None]:
# Show the mean_fit_time entries recorded in the grid search's cv_results_.
gs.cv_results_['mean_fit_time']

array([0.00405931, 0.00369716, 0.00387855, 0.00409913, 0.00412331,
       0.00497417, 0.00396147, 0.00350294, 0.00688505, 0.00350347,
       0.00343981, 0.00333652, 0.00365953, 0.00376692, 0.00972219,
       0.00503626, 0.00853009, 0.00394964, 0.00416293, 0.0034729 ,
       0.00874949, 0.00617785, 0.00399237, 0.0070591 , 0.00354743,
       0.00408497, 0.00361629, 0.00370383, 0.00354567, 0.00355749,
       0.0035049 , 0.0039741 , 0.00344515, 0.00389323, 0.00408635,
       0.00518494, 0.00353842, 0.0038866 , 0.00342941, 0.00370793,
       0.00392361, 0.00502682, 0.00683908, 0.00365534, 0.00395241,
       0.00398746, 0.00568366, 0.00383263, 0.00534773, 0.00385351,
       0.00395107, 0.00435214, 0.00369143, 0.0033453 , 0.00368805,
       0.00352316, 0.00389199, 0.00368395, 0.00378795, 0.0035398 ,
       0.00382657, 0.00395384, 0.00438023, 0.00363026, 0.0035387 ,
       0.00350723, 0.00337934, 0.00390058, 0.0036098 , 0.00349288,
       0.0034688 , 0.00380707, 0.00446758, 0.00364032, 0.00378

### 랜덤 서치

In [None]:
from scipy.stats import uniform, randint

In [None]:
# Sample integers uniformly from 0 to 9 (randint's upper bound, 10, is exclusive).
rgen = randint(low=0, high=10)
rgen.rvs(size=10)

array([9, 6, 0, 7, 5, 8, 6, 4, 8, 0])

In [None]:
# Draw 1000 integers and tabulate the distinct values that occurred.
np.unique(rgen.rvs(1000), return_counts=True) # also return how often each value occurs

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 96,  95, 106, 119,  89,  81, 115,  92, 107, 100]))

In [None]:
# Continuous uniform distribution starting at 0 with width 1; draw 10 samples from it.
ugen = uniform(loc=0, scale=1)
ugen.rvs(size=10)

array([0.93983997, 0.53128381, 0.43556136, 0.26962354, 0.810993  ,
       0.76154316, 0.52165356, 0.04449456, 0.32111683, 0.45746275])

In [None]:
# Sampling distributions for the randomized search.
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011], not [0.0001, 0.001] --
# confirm this is the intended range.
# randint(low, high) excludes `high`: max_depth is 20..49,
# min_samples_split is 2..24 and min_samples_leaf is 1..24.
params = dict(
    min_impurity_decrease=uniform(loc=0.0001, scale=0.001),
    max_depth=randint(low=20, high=50),
    min_samples_split=randint(low=2, high=25),
    min_samples_leaf=randint(low=1, high=25),
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: try 100 sampled parameter settings around a seeded decision tree.
# random_state fixes the sampling; n_jobs=-1 is passed for parallelism.
rs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rs.fit(train_input, train_target)

In [None]:
# Print the parameter combination selected by the randomized search.
print(rs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': np.float64(0.00034102546602601173), 'min_samples_leaf': 7, 'min_samples_split': 13}


In [None]:
# Print the highest mean cross-validation test score among the sampled candidates.
print(np.max(rs.cv_results_['mean_test_score']))

0.8695428296438884


In [None]:
# Rebind dt to the best estimator from the randomized search.
dt = rs.best_estimator_

# Score that estimator on the held-out test set.
print(dt.score(test_input, test_target))

0.86


In [None]:
# Show the mean_fit_time entries recorded in the randomized search's cv_results_.
rs.cv_results_['mean_fit_time']

array([0.01657877, 0.0160141 , 0.01195364, 0.01351438, 0.01244822,
       0.01211324, 0.00903821, 0.00911164, 0.01173949, 0.01053877,
       0.01124744, 0.01776438, 0.00789323, 0.00819793, 0.01200619,
       0.01216316, 0.01829543, 0.0149539 , 0.00864182, 0.00742979,
       0.00841079, 0.01388412, 0.01683125, 0.00813603, 0.01261859,
       0.01531944, 0.01815567, 0.01761055, 0.0113997 , 0.00755467,
       0.01668153, 0.02039285, 0.02361913, 0.0257298 , 0.02498398,
       0.01616716, 0.0187676 , 0.00938272, 0.02516437, 0.02547956,
       0.0231277 , 0.02255468, 0.02263598, 0.01902833, 0.02305522,
       0.01720943, 0.01982594, 0.02502975, 0.0247961 , 0.01981111,
       0.01937079, 0.02815495, 0.02191768, 0.02258735, 0.0234199 ,
       0.0217566 , 0.02251477, 0.02911596, 0.02345252, 0.02250881,
       0.03209581, 0.02747669, 0.0228168 , 0.01875033, 0.01809392,
       0.02015734, 0.01835113, 0.02013183, 0.01602001, 0.02034063,
       0.03008008, 0.04135356, 0.02872701, 0.02489119, 0.03206

In [None]:
# Average the mean_fit_time entries into one number for comparison between searches.
print(np.mean(rs.cv_results_['mean_fit_time']))

0.01927811145782471


### 결정트리 분할 옵션 변경

In [None]:
# Same randomized search as `rs`, except the tree is built with splitter='random'.
rs2 = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(splitter='random', random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
rs2.fit(train_input, train_target)

In [None]:
# Rebind dt to the best estimator from the splitter='random' search.
dt = rs2.best_estimator_

# Report, in order: the selected parameters, the best mean cross-validation
# test score, and the score on the held-out test set.
print(rs2.best_params_)
print(rs2.cv_results_['mean_test_score'].max())
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': np.float64(0.00011407982271508446), 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077


In [None]:
# Show the mean_fit_time entries recorded in the splitter='random' search's cv_results_.
rs2.cv_results_['mean_fit_time']

array([0.01289811, 0.01797409, 0.01654997, 0.01813755, 0.01097593,
       0.00877347, 0.00938807, 0.00785475, 0.01761689, 0.01618929,
       0.01278939, 0.01422429, 0.01223807, 0.0200088 , 0.01892457,
       0.01364374, 0.01831226, 0.00558653, 0.00725965, 0.01483965,
       0.01382599, 0.01690898, 0.0130774 , 0.0140698 , 0.01534462,
       0.0048758 , 0.00687528, 0.02050066, 0.01379333, 0.01312418,
       0.01267099, 0.01260157, 0.01416883, 0.00848207, 0.01172428,
       0.0123877 , 0.00390797, 0.00391412, 0.00533118, 0.00845098,
       0.00553985, 0.00623717, 0.00536213, 0.0160171 , 0.01086283,
       0.00713348, 0.00809202, 0.00383968, 0.00507903, 0.0058032 ,
       0.00774374, 0.0067008 , 0.01107984, 0.00814052, 0.01094589,
       0.00860825, 0.00643892, 0.00809917, 0.00929236, 0.00971813,
       0.00935555, 0.01494222, 0.01039305, 0.00854278, 0.006423  ,
       0.00878878, 0.00575867, 0.00678978, 0.00766106, 0.00706048,
       0.00655928, 0.01229296, 0.01386757, 0.00701771, 0.00393

In [None]:
# Average the mean_fit_time entries of the splitter='random' search.
print(np.mean(rs2.cv_results_['mean_fit_time']))

0.00974303340911865


### 문제 2 : 위 코드가 기존 랜덤 서치 코드와 다른 점을 2가지 적어보세요.