In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Switch the working directory to the course folder on the mounted Drive.
%cd /content/drive/MyDrive/multi/0422

/content/drive/MyDrive/multi/0422


### Model Selection 모듈

- 훈련 데이터와 테스트 데이터로 분리하지 않고 수행

In [3]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Baseline without a train/test split: the tree is scored on the very samples
# it was fitted on, so the reported accuracy is training accuracy.
iris = load_iris()
dtc = DecisionTreeClassifier(random_state=2022).fit(iris.data, iris.target)
dtc.score(iris.data, iris.target)

1.0

- 교차 검증: cross_validate() + 평가
- 교차 검증 후 평가까지: cross_val_score()

In [6]:
from sklearn.model_selection import cross_val_score

# Mean accuracy over 5 cross-validation folds of the full iris data.
cross_val_score(
    estimator=dtc,
    X=iris.data,
    y=iris.target,
    scoring='accuracy',
    cv=5,
).mean()

0.9533333333333334

In [7]:
# List every hyperparameter of the decision tree together with its current value.
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2022,
 'splitter': 'best'}

- GridSearchCV 클래스: 하이퍼 파라미터 튜닝 + 교차 검증

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; stratifying on the target keeps
# the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    stratify=iris.target,
    random_state=2022,
)

In [9]:
# The classifier and its corresponding hyperparameters.

# Rebinding dtc creates a new, unfitted tree to hand to the grid search.
dtc = DecisionTreeClassifier(random_state=2022)
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2022,
 'splitter': 'best'}

In [11]:
# Search grid for the decision tree: 5 depths x 3 split thresholds = 15 candidates.
params = {
    'max_depth': list(range(2, 7)),
    'min_samples_split': [2, 3, 4],
}

In [12]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`, ranking candidates by 5-fold CV accuracy.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [13]:
# Run the search: every grid candidate is cross-validated on the training split only.
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2022),
             param_grid={'max_depth': [2, 3, 4, 5, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [14]:
# Hyperparameter combination with the best mean cross-validated accuracy.
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [15]:
# Mean cross-validated accuracy of the best candidate (computed on the training split, not the test set).
grid_dt.best_score_

0.9916666666666666

In [16]:
# Best classifier.
# Building it by hand with the winning parameters would give an unfitted tree:
# dtc_best = DecisionTreeClassifier(max_depth=3, min_samples_split=2, random_state=2022)
# best_estimator_ is available because GridSearchCV was created with the default refit setting.
dtc_best = grid_dt.best_estimator_

In [17]:
# Evaluate the best classifier on the held-out test split.

dtc_best.score(X_test, y_test)

0.9333333333333333

- SVM (Support Vector Machine)

In [18]:
from sklearn.svm import SVC

# Support vector classifier; only random_state is passed, everything else stays at its default.
# NOTE(review): per the scikit-learn docs, random_state is only used for probability
# estimates, so it has no effect while probability=False — confirm it is intended.
svc = SVC(random_state=2022)
# Inspect the hyperparameters and their current values.
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2022,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [19]:
# Coarse-to-fine search: start with a wide sweep over C, then narrow the
# candidate range step by step to home in on the best value.
params = {'C': [0.01, 0.1, 1.0, 10, 100]}

grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [20]:
# Cross-validate every candidate C on the training split and show the winner
# (fit() returns the fitted search object, so the attribute can be chained).
grid_sv.fit(X_train, y_train).best_params_

{'C': 1.0}

In [22]:
# Second pass: a finer set of C values between 0.2 and 8.
params = {'C': [0.2, 0.6, 1.0, 4, 8]}

grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 4}

In [23]:
# Third pass: integer C values 2 through 6.
params = {'C': list(range(2, 7))}

grid_sv = GridSearchCV(
    estimator=svc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_sv.fit(X_train, y_train)
grid_sv.best_params_

{'C': 4}

In [25]:
# Take the best SVC found by the most recent grid search ...
svc_best = grid_sv.best_estimator_
# ... and evaluate it on the held-out test split.
svc_best.score(X_test, y_test)

0.9666666666666667