### GridSearchCV
##### 유방암(Breast Cancer) 데이터

In [29]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

1) 데이터 탐색 및 전처리

In [30]:
cancer.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [31]:
import pandas as pd
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target

df.head()


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [32]:
df.shape, df.describe

((569, 31),
 <bound method NDFrame.describe of      mean radius  mean texture  mean perimeter  mean area  mean smoothness   
 0          17.99         10.38          122.80     1001.0          0.11840  \
 1          20.57         17.77          132.90     1326.0          0.08474   
 2          19.69         21.25          130.00     1203.0          0.10960   
 3          11.42         20.38           77.58      386.1          0.14250   
 4          20.29         14.34          135.10     1297.0          0.10030   
 ..           ...           ...             ...        ...              ...   
 564        21.56         22.39          142.00     1479.0          0.11100   
 565        20.13         28.25          131.20     1261.0          0.09780   
 566        16.60         28.08          108.30      858.1          0.08455   
 567        20.60         29.33          140.10     1265.0          0.11780   
 568         7.76         24.54           47.92      181.0          0.05263   
 
    

In [33]:
df.target.value_counts()

target
1    357
0    212
Name: count, dtype: int64

2) 훈련/테스트 데이터 셋 분리

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target,
    test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((455, 30), (114, 30), (455,), (114,))

3) 학습

In [35]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2023)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [36]:
dtc.fit(X_train, y_train)

4) 예측

In [37]:
pred = dtc.predict(X_test)

In [38]:
res_df = pd.DataFrame({'y 실제값':y_test, 'y 예측값': pred})
res_df.head()

Unnamed: 0,y 실제값,y 예측값
0,0,0
1,1,1
2,1,1
3,1,1
4,1,1


5) 평가


In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.9210526315789473

GridSearchCV로 수행
- 학습/훈련시 수행

In [40]:
params = {
    'max_depth' : [2, 5, 8],
    'min_samples_split' : [2, 3, 4]
}

In [41]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(
    dtc,                # estimator, Decision Tree Classifier
    param_grid=params,  # 파라메터의 조합
    scoring='accuracy', # 평가 방법 - 정확도
    cv=5                # 교차검증 세트 수
)

# 총 3 X 3 X 5 = 45회 훈련

In [42]:
# 학습
grid_dt.fit(X_train, y_train)

In [43]:
# 베스트 파라메터 조합
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [44]:
# 베스트 스코아
grid_dt.best_score_

0.9472527472527472

- 나머지 파라미터에 적용

In [45]:
params = {
    'max_depth' : [4, 5, 6],
    'min_samples_split' : [1, 2, 3]
}

In [46]:
grid_dt = GridSearchCV(
    dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5
)
grid_dt.fit(X_train, y_train)

15 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\YONSAI\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\YONSAI\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 889, in fit
    super().fit(
  File "c:\Users\YONSAI\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "c:\Users\YONSAI\anaconda3\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\YONSAI\anaconda3\lib\site

In [47]:
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

- 최적의 분류기로 예측 및 평가

In [52]:
# 최적 분류기
# best_dt = DecisionTreeClassifier( max_depth=5, min_samples_split=2, random_state=2023)
best_dt = grid_dt.best_estimator_
print(best_dt)


DecisionTreeClassifier(max_depth=5, random_state=2023)


In [49]:
best_dt.score(X_test, y_test)

0.8947368421052632