In [1]:
import pandas as pd

wine = pd.read_csv('https://bit.ly/wine_csv_data')
wine.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [2]:
data = wine[['alcohol', 'sugar', 'pH']]
target = wine[['class']]

In [3]:
from sklearn.model_selection import train_test_split

train_input, test_input, train_target, test_target = \
train_test_split(data, target, random_state=42)

In [12]:
sub_input, val_input, sub_target, val_target = \
train_test_split(train_input, train_target, random_state=42)

In [13]:
print(data.shape, train_input.shape, test_input.shape)
print(sub_input.shape, val_input.shape)

(6497, 3) (4872, 3) (1625, 3)
(3654, 3) (1218, 3)


In [14]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(sub_input, sub_target)
print(dt.score(sub_input, sub_target))
print(dt.score(val_input, val_target))

#과대적합 확인 가능

0.9978106185002736
0.8571428571428571


### 교차 검증
- cross_validate

In [23]:
from sklearn.model_selection import cross_validate

score = cross_validate(dt, train_input, train_target)
score

# fit_time-> 훈련에 걸린 시간
# score_time -> 검증에 걸린 시간
# 'test_score' -> 검증 폴드를 수행한 결과 => 평균치를 내야함

{'fit_time': array([0.00897026, 0.00697517, 0.00601912, 0.00497484, 0.00596118]),
 'score_time': array([0.00299335, 0.00198746, 0.00199485, 0.00200701, 0.00200534]),
 'test_score': array([0.85128205, 0.84820513, 0.8788501 , 0.85112936, 0.84394251])}

In [24]:
score['test_score']

array([0.85128205, 0.84820513, 0.8788501 , 0.85112936, 0.84394251])

In [25]:
import numpy as np
np.mean(score['test_score'])

np.float64(0.8546818301479492)

In [31]:
#만약 훈련세트가 정렬이 되어있는 dt라면 섞어주는 함수 사용 필요// KFold

from sklearn.model_selection import StratifiedKFold

splitter = StratifiedKFold(n_splits=10, shuffle=True)
score = cross_validate(dt, train_input, train_target, cv =splitter)
score

{'fit_time': array([0.00704503, 0.00892782, 0.00634909, 0.00597882, 0.00597978,
        0.00696325, 0.00816512, 0.00597191, 0.00597787, 0.00901055]),
 'score_time': array([0.00199437, 0.00202513, 0.0020504 , 0.00200772, 0.00100517,
        0.00201702, 0.00402164, 0.00200915, 0.00195122, 0.00099587]),
 'test_score': array([0.875     , 0.875     , 0.86858316, 0.84599589, 0.87063655,
        0.85626283, 0.84394251, 0.87063655, 0.87063655, 0.84394251])}

## 하이퍼파라미터 튜닝

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
params = {
    'min_impurity_decrease' : [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]
}

In [37]:
dt = DecisionTreeClassifier(random_state=42)
gs = GridSearchCV(dt, params)

In [38]:
gs.fit(train_input, train_target)

In [39]:
gs.best_estimator_

In [40]:
gs.best_params_

{'min_impurity_decrease': 0.0003}

In [41]:
gs.cv_results_

{'mean_fit_time': array([0.00550718, 0.00420036, 0.0037961 , 0.00505261, 0.00338802]),
 'std_fit_time': array([0.00050405, 0.00041241, 0.00040087, 0.00246923, 0.00048786]),
 'mean_score_time': array([0.0025136 , 0.00213199, 0.00178761, 0.00240126, 0.00199361]),
 'std_score_time': array([4.42376493e-04, 2.97208642e-04, 3.92377166e-04, 1.02696381e-03,
        6.28991411e-07]),
 'param_min_impurity_decrease': masked_array(data=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
              mask=[False, False, False, False, False],
        fill_value=1e+20),
 'params': [{'min_impurity_decrease': 0.0001},
  {'min_impurity_decrease': 0.0002},
  {'min_impurity_decrease': 0.0003},
  {'min_impurity_decrease': 0.0004},
  {'min_impurity_decrease': 0.0005}],
 'split0_test_score': array([0.87384615, 0.87076923, 0.87282051, 0.86461538, 0.86051282]),
 'split1_test_score': array([0.86666667, 0.86871795, 0.87794872, 0.88512821, 0.87794872]),
 'split2_test_score': array([0.88603696, 0.88295688, 0.8798768 , 0.87

In [45]:
import numpy as np

params = {
    'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
    'max_depth': range(5, 20, 1),
    'min_samples_split': range(2, 100, 10),
}

In [46]:
dt = DecisionTreeClassifier(random_state=42)

gs = GridSearchCV(dt, params, n_jobs=-1)  #n_jobs=-1 -> CPU 다 쓰겠다는 뜻

In [47]:
gs.fit(train_input, train_target)

In [49]:
gs.best_params_

{'max_depth': 15,
 'min_impurity_decrease': np.float64(0.0001),
 'min_samples_split': 22}

In [52]:
gs.cv_results_['mean_test_score']

array([0.85837161, 0.85837161, 0.85837161, ..., 0.86309693, 0.86309693,
       0.86309693])

### 랜덤 서치
균등하게 데이터를 뽑는 것이 목표
uniform - 실수
randint - 정수

In [58]:
from scipy.stats import uniform, randint

In [59]:
randint(0, 10).rvs(10)   #0~10 숫자 중 랜덤 10개 추출

array([0, 4, 1, 5, 9, 7, 7, 0, 3, 4])

In [60]:
rgen = randint(0, 10)
np.unique(rgen.rvs(1000), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([110, 100,  91,  94, 109, 102,  95,  87, 104, 108]))

In [61]:
params = {
    'min_impurity_decrease': uniform(0.0001, 0.001),
    'max_depth': randint(20, 50),
    'min_samples_split': randint(2, 25),
    'min_samples_leaf': randint(1, 25),
}

In [62]:
from sklearn.model_selection import RandomizedSearchCV

In [63]:
#n_iter -> n번 반복
gs = RandomizedSearchCV(dt, params, n_iter=100, n_jobs=-1)
gs.fit(train_input, train_target)

In [64]:
gs.best_estimator_

In [65]:
gs.best_params_

{'max_depth': 44,
 'min_impurity_decrease': np.float64(0.00027856728394606324),
 'min_samples_leaf': 5,
 'min_samples_split': 20}