# Model Selection 모듈

- 훈련/테스트 데이터로 분리하지 않고 머신러닝 수행

In [4]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [5]:
iris = load_iris()
dtc = DecisionTreeClassifier(random_state=2021)
dtc.fit(iris.data, iris.target)
dtc.score(iris.data, iris.target)


1.0

In [8]:
from sklearn.model_selection import cross_validate 
dtc = DecisionTreeClassifier()
res = cross_validate(dtc, iris.data, iris.target)
res

{'fit_time': array([0.0030458 , 0.0010004 , 0.00206685, 0.00198174, 0.00199413]),
 'score_time': array([0.00094366, 0.00099945, 0.00194001, 0.00099969, 0.0009973 ]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])}

In [9]:
res['test_score']

array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])

- cross_val_score()

In [10]:
from sklearn.model_selection import cross_val_score
cross_val_score(dtc, iris.data, iris.target, scoring='accuracy', cv=5)

array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

In [11]:
cross_val_score(dtc, iris.data, iris.target, scoring='accuracy', cv=3)

array([0.98, 0.92, 0.96])

- GridSearchCV클래스 : 하이퍼 파라메터 튜닝 + 교차 검증

In [12]:
# Train/Test dataset 분리
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)

In [13]:
# 분류기와 그에 해당하는 하이퍼 파라메터
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)
dtc.get_params()


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [14]:
params = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3,4]
}

In [15]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(
    dtc, param_grid=params, scoring='accuracy', cv=5
)

In [16]:
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [18]:
# 최적의 파라메터
grid_dt.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [19]:
# 최적의 파라메터로 학습한 분류기
best_clf = grid_dt.best_estimator_

In [20]:
# 최적의 파라메터로 학습한 분류기로 예측 및 평가
best_clf.score(X_test, y_test)

0.9

- Support Vector Machine

In [21]:
from sklearn.svm import SVC
svc = SVC(random_state=2021)
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2021,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [22]:
params = {'C': [0.01, 0,1, 1, 10, 100]}

In [25]:
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 277, in _dense_fit
    self._probB, self.fit_status_ = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 192, in sklearn.svm._libsvm.fit
ValueError: C <= 0

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "C:\ProgramData\Anaconda3\li

GridSearchCV(cv=5, estimator=SVC(random_state=2021),
             param_grid={'C': [0.01, 0, 1, 1, 10, 100]}, scoring='accuracy')

In [24]:
grid_sv.best_params_

{'C': 10}

In [29]:
params = {'C': [3,6,10,15,20]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 3}

In [30]:
params = {'C': [2,3,4,5]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 4}

In [31]:
best_svc = grid_sv.best_estimator_
best_svc.score(X_test, y_test)

1.0

# Wine 데이터 분류

### 1) 데이터 전처리

In [32]:
from sklearn.datasets import load_wine
wine = load_wine()

In [33]:
import pandas as pd
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [34]:
df.shape

(178, 14)

In [35]:
df.target.value_counts()

1    71
0    59
2    48
Name: target, dtype: int64

In [36]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

#### 2) 훈련/테스트 데이터셋 분리

In [57]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, stratify=wine.target, test_size=0.2, random_state=2021
)

In [59]:
import numpy as np
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([47, 57, 38], dtype=int64))

#### 3) GridSearchCV를 통해서
- DecisionTreeClassifier 또는 SVC를 선택해서
- 어떤 파라메터일 때 최선의 모델이 되는지 파악하고
- 그 때의 성능을 평가함

In [61]:
# 워닝 출력 되지 않게 함
# import warnings
# warnigs.filterwarnings('ignore')

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [42]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)
dtc.get_params() 

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2021,
 'splitter': 'best'}

In [63]:
params = {
    'max_depth': [2,3,4,5,6],
    'min_samples_split': [2,3,4]
}

In [65]:
grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=5)
grid_dt.fit(X_train, y_train)
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [66]:
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.9722222222222222

- Support Vector Machine

In [67]:
svc = SVC(random_state=2021)
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2021,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [50]:
params = {'C': [0.01, 0,1, 1, 10, 100]}

In [51]:
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 277, in _dense_fit
    self._probB, self.fit_status_ = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 192, in sklearn.svm._libsvm.fit
ValueError: C <= 0

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\_base.py", line 226, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "C:\ProgramData\Anaconda3\li

GridSearchCV(cv=5, estimator=SVC(random_state=2021),
             param_grid={'C': [0.01, 0, 1, 1, 10, 100]}, scoring='accuracy')

In [52]:
grid_sv.best_params_

{'C': 100}

In [74]:
params = {'C': [30,60,100,300,500]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 500}

In [75]:
best_sv = grid_sv.best_estimator_
best_sv.score(X_test, y_test)

0.8333333333333334

In [77]:
params = {'C': [400,500,700,1000]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 1000}

In [78]:
best_sv = grid_sv.best_estimator_
best_sv.score(X_test, y_test)

0.8888888888888888

In [79]:
params = {'C': [3000,5000,10000]}
grid_sv = GridSearchCV(svc, param_grid=params, scoring='accuracy', cv=5)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 10000}

In [80]:
best_sv = grid_sv.best_estimator_
best_sv.score(X_test, y_test)

1.0