### GridSearchCV

- 유방암(breast cancer) 데이터

In [5]:
from sklearn.datasets import load_breast_cancer
import pandas as pd

In [2]:
# Load the breast cancer dataset that ships with scikit-learn (sklearn.datasets).
lbc = load_breast_cancer()

1. 데이터 탐색 및 전처리

In [4]:
# List the fields available on the loaded dataset object.
lbc.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [14]:
# Tabular view of the dataset: one column per feature name, with the class
# label appended as the last column, `target`.
df = pd.DataFrame(lbc.data, columns=lbc.feature_names).assign(target=lbc.target)
df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [9]:
# Class balance: number of samples for each label value.
df['target'].value_counts()

1    357
0    212
Name: target, dtype: int64

In [13]:
# (rows, columns) of the frame: the feature columns plus the added `target` column.
df.shape

(569, 31)

2. 훈련/테스트 데이터 분리

In [15]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the samples for testing. The split is stratified on the
# labels and uses a fixed seed so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    lbc.data,
    lbc.target,
    test_size=0.2,
    random_state=2023,
    stratify=lbc.target,
)
# Display the shape of each of the four resulting arrays.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((455, 30), (114, 30), (455,), (114,))

3. 학습

In [18]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with no hyperparameters overridden; only the random seed is fixed.
dtc = DecisionTreeClassifier(random_state=2023)

In [19]:
# Show every hyperparameter of the estimator together with its current value.
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [20]:
# Train the tree on the training split.
dtc.fit(X_train, y_train)

4. 예측

In [21]:
# Predict class labels for the held-out test split.
pred = dtc.predict(X_test)

In [23]:
# Side-by-side table of the true and the predicted labels for the test split.
res_df = pd.DataFrame(
    {
        'y 실제값': y_test,
        'y 예측값': pred,
    }
)
res_df

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
109,0,1
110,1,1
111,1,1
112,1,1


In [24]:
# Class names of the dataset; in the recorded output index 0 is 'malignant' and index 1 is 'benign'.
# NOTE(review): presumably these indices match the 0/1 values in `target` - confirm against lbc.DESCR.
lbc.target_names

array(['malignant', 'benign'], dtype='<U9')

5. 평가

In [25]:
from sklearn.metrics import accuracy_score
# Accuracy of the untuned tree on the test split (true labels first, predictions second).
accuracy_score(y_test, pred)

0.9210526315789473

#### GridSearchCV로 수행

- 학습(훈련) 단계에서 최적의 하이퍼파라미터 조합을 찾을 때 사용

In [26]:
# Hyperparameter grid for the search: every max_depth value is combined with
# every min_samples_split value (3 x 3 = 9 candidates).
params = dict(
    max_depth=[2, 5, 8],
    min_samples_split=[2, 3, 4],
)

In [27]:
from sklearn.model_selection import GridSearchCV

In [28]:
# Grid search over `params` for the decision tree `dtc`:
#   - candidates are ranked by accuracy
#   - each candidate is evaluated with 5 cross-validation sets
# Total evaluations: 3 (max_depth) * 3 (min_samples_split) * 5 (cross validation) = 45.
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, scoring='accuracy', cv=5)

In [30]:
# Run the search: fit on the training split for the parameter combinations in the grid.
grid_dt.fit(X_train, y_train)

In [31]:
# Best parameter combination found by the search.
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [36]:
# Score achieved by the best parameter combination (scoring='accuracy', cv=5 as configured for grid_dt).
grid_dt.best_score_

0.9472527472527472

- 나머지 파라미터 검증 (max_depth를 앞에서 찾은 최적값 5 주변인 4~6으로 좁혀 재탐색)

In [37]:
# Narrowed grid: max_depth is restricted to 4-6, while min_samples_split
# keeps the values 2-4 (3 x 3 = 9 candidates).
params = dict(
    max_depth=[4, 5, 6],
    min_samples_split=[2, 3, 4],
)

In [38]:
# Second grid search over the current `params` grid, again ranking the
# candidates for `dtc` by accuracy with cv=5.
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, scoring='accuracy', cv=5)

In [39]:
# Run the second search on the training split.
grid_dt.fit(X_train, y_train)

In [40]:
# Best parameter combination found by the second search.
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [41]:
# Score achieved by the best combination of the second search (scoring='accuracy').
grid_dt.best_score_

0.9472527472527472

- 최적의 분류기로 예측 및 평가

In [42]:
# Best classifier selected by the grid search. With the best_params_ recorded for this
# search, it corresponds to
# DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=2023).
best_dt = grid_dt.best_estimator_

In [43]:
# Evaluate the selected classifier on the held-out test split.
# NOTE(review): in the recorded run this score (0.895) is lower than the untuned tree's
# test accuracy (0.921) - the cross-validation gain did not carry over to this split.
best_dt.score(X_test, y_test)

0.8947368421052632