In [1]:
import pandas as pd

# Source of the wine dataset; it is downloaded as CSV each time this cell runs.
WINE_CSV_URL = "https://bit.ly/wine_csv_data"
wine = pd.read_csv(WINE_CSV_URL)

In [2]:
# Feature matrix: the three input columns converted to a NumPy array.
feature_columns = ["alcohol", "sugar", "pH"]
data = wine[feature_columns].to_numpy()
# Label vector: the "class" column converted to a NumPy array.
target = wine["class"].to_numpy()

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed keeps the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Split the training data once more, into a sub-training set and a validation set (20%).
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Print the shape of each split: inputs on the first line, targets on the second.
for splits in (
    (sub_input, val_input, test_input),
    (sub_target, val_target, test_target),
):
    print(*(part.shape for part in splits))

(4157, 3) (1040, 3) (1300, 3)
(4157,) (1040,) (1300,)


In [6]:
# Train a decision tree on the sub-training set, then print its accuracy on
# the sub-training set followed by its accuracy on the validation set.
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)
for features, labels in ((sub_input, sub_target), (val_input, val_target)):
    print(dt.score(features, labels))

0.9971133028626413
0.864423076923077


In [7]:
# cross_validate(): cross-validation helper; returns a dict of per-fold results.
# Its cv parameter sets the number of folds; it is left at its default here.
from sklearn.model_selection import cross_validate

scores = cross_validate(estimator=dt, X=train_input, y=train_target)
scores

{'fit_time': array([0.00800967, 0.0069809 , 0.00733924, 0.00720835, 0.00731969]),
 'score_time': array([0.00072312, 0.00062609, 0.00063372, 0.00064731, 0.00106931]),
 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}

In [8]:
import numpy as np

# Average the per-fold scores into a single cross-validation score.
print(scores["test_score"].mean())

0.855300214703487


In [9]:
# StratifiedKFold: splitter that distributes the target classes evenly across folds.
from sklearn.model_selection import StratifiedKFold

stratified_cv = StratifiedKFold()
scores = cross_validate(dt, train_input, train_target, cv=stratified_cv)
print(scores["test_score"].mean())

0.855300214703487


In [10]:
# n_splits sets how many folds the data is divided into; shuffling is enabled
# and seeded so the folds are the same on every run.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [11]:
# Show the splitter's configuration.
print(splitter)

StratifiedKFold(n_splits=10, random_state=42, shuffle=True)


In [12]:
# Cross-validate the tree using the custom splitter and print the mean fold score.
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
print(scores["test_score"].mean())

0.8574181117533719


## 하이퍼파라미터 튜닝 (p.248)

In [13]:
# GridSearchCV 클래스 : 하이퍼파라미터 탐색과 교차 검증을 한 번에 수행해줌
from sklearn.model_selection import GridSearchCV
# Search grid: maps each parameter name to the list of values to try.
params = {
    "min_impurity_decrease": [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
}

In [14]:
# Build the grid-search object from the model to tune and the params grid.
# n_jobs: number of CPU cores to use for parallel execution.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [15]:
# Run the grid search on the training set.
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'min_impurity_decrease': [0.0001, 0.0002, 0.0003,
                                                   0.0004, 0.0005]})

In [16]:
# best_estimator_: among the candidate models, the one with the highest validation score.
dt = gs.best_estimator_
train_accuracy = dt.score(train_input, train_target)
print(train_accuracy)

0.9615162593804117


In [17]:
# best_params_: the best parameter values found by the search.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [18]:
# cv_results_["mean_test_score"]: mean cross-validation score for each parameter setting tried.
print(gs.cv_results_["mean_test_score"])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [19]:
# argmax(): index at which the largest value occurs.
mean_scores = gs.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
# Look up the parameter setting at that index, then show the index itself.
best_setting = gs.cv_results_["params"][best_index]
print(best_setting)
print(best_index)

{'min_impurity_decrease': 0.0001}
0


In [20]:
# Search grid over three parameters, each mapped to its candidate values.
params = dict(
    # np.arange() accepts fractional start/stop/step values.
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    # Python's built-in range() accepts integers only.
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [21]:
# n_jobs=-1: use every available CPU core for the search.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': range(5, 20),
                         'min_impurity_decrease': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
       0.0009]),
                         'min_samples_split': range(2, 100, 10)})

In [22]:
import multiprocessing
# Ask multiprocessing how many CPUs it detects on this machine and print the count.
cores = multiprocessing.cpu_count()
print(cores)

2


In [23]:
# Best parameter combination found by the three-parameter grid search.
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [24]:
# Highest mean cross-validation score among all parameter settings tried.
print(np.max(gs.cv_results_["mean_test_score"]))

0.8683865773302731


### 랜덤 서치

In [25]:
# scipy.stats.uniform: samples real values from a uniform distribution.
# scipy.stats.randint: samples integer values from a uniform distribution.
# rvs(): draws the requested number of samples.
from scipy.stats import uniform, randint

rgen = randint(0, 10)  # integers from 0 through 9
# Seeded like every other random step in this notebook, so the sample shown
# below is the same on each run instead of changing with every execution.
int_samples = rgen.rvs(10, random_state=42)
int_samples

array([6, 1, 2, 1, 7, 5, 5, 1, 5, 8])

In [26]:
# Draw 1000 integers and count how often each value occurs.
# The draw is seeded so the displayed counts are reproducible across runs.
np.unique(rgen.rvs(1000, random_state=42), return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 83,  94,  99,  91, 108, 118,  92, 107, 118,  90]))

In [27]:
# Uniform distribution over real values between 0 and 1.
ugen = uniform(0, 1)
# Seeded so the displayed sample is reproducible across runs.
float_samples = ugen.rvs(10, random_state=42)
float_samples

array([0.77425797, 0.44679388, 0.49087688, 0.63281312, 0.266556  ,
       0.61332114, 0.19169349, 0.66268168, 0.20705753, 0.27794959])

In [28]:
# Sampling distributions for the randomized search, one per parameter.
# NOTE(review): scipy's uniform(loc, scale) spans [loc, loc + scale], so
# min_impurity_decrease is drawn from [0.0001, 0.0011]. If an upper bound of
# 0.001 was intended, the scale should be 0.0009 — confirm before changing.
# randint(low, high) draws integers from low through high - 1.
params = dict(
    min_impurity_decrease=uniform(0.0001, 0.001),
    max_depth=randint(20, 50),
    min_samples_split=randint(2, 25),
    min_samples_leaf=randint(1, 25),
)

In [29]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized search: tries 100 parameter settings sampled from params,
# runs on all CPU cores, and is seeded so the sampled settings are repeatable.
gs = RandomizedSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_distributions=params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7966bf1190>,
                                        'min_impurity_decrease': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7966bf1410>,
                                        'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7967510b90>,
                                        'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f7966bf1250>},
                   random_state=42)

In [30]:
# Best parameter combination found by the randomized search.
print(gs.best_params_)

{'max_depth': 39, 'min_impurity_decrease': 0.00034102546602601173, 'min_samples_leaf': 7, 'min_samples_split': 13}


In [31]:
# Highest mean cross-validation score among the sampled parameter settings.
print(np.max(gs.cv_results_["mean_test_score"]))

0.8695428296438884


In [32]:
# Take the best model from the search and print its accuracy on the held-out test set.
dt = gs.best_estimator_
test_accuracy = dt.score(test_input, test_target)
print(test_accuracy)

0.86


예제

In [33]:
# Exercise: repeat the randomized search with trees that use splitter="random",
# then print the best parameters, the best mean cross-validation score, and
# the best model's accuracy on the test set.
random_split_tree = DecisionTreeClassifier(splitter="random", random_state=42)
gs = RandomizedSearchCV(
    random_split_tree,
    params,
    n_iter=100,
    n_jobs=-1,
    random_state=42,
)
gs.fit(train_input, train_target)

print(gs.best_params_)
print(np.max(gs.cv_results_["mean_test_score"]))

dt = gs.best_estimator_
print(dt.score(test_input, test_target))

{'max_depth': 43, 'min_impurity_decrease': 0.00011407982271508446, 'min_samples_leaf': 19, 'min_samples_split': 18}
0.8458726956392981
0.786923076923077
