**하이퍼 파라미터 탐색**
- 교차 검증 점수를 기반으로 최적의 하이퍼 파라미터를 탐색
- **GridSearchCV(estimator, param_grid...)**
  - ```python
    from sklearn.model_selection import GridSearchCV
    grid_model = GridSearchCV(estimator=estimator,
                              param_grid=param_grid,
                              cv=cv)
    grid_model.fit(X, y)

    print(f'best_score_: {grid_model.best_score_}')
    print(f'best_params: {grid_model.best_params_}')
    print(f'best_estimator_: {grid_model.best_estimator_}')
    ```
  - param_grid를 이용하여 모든 하이퍼 파라미터 조합에 대해 학습을 수행하고, 가장 좋은 성능을 내는 조합을 찾아냄 
  - best_score_ - 최적의 성능
  - best_params_ - best_score_를 만든 하이퍼 파라미터 조합
  - best_estimator_ - best_params_를 사용하여 생성된 모델 객체

## Lecture 01 - 하이퍼 파라미터 탐색



In [None]:
# imports used by this cell
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# load the iris features (X) and class labels (y)
X, y = load_iris(return_X_y=True)

# hold out 20% of the samples for testing; the split is stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y)

In [None]:
# imports used by this cell
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold

# 5-fold shuffled splitter used as the cross-validation strategy
cv = KFold(shuffle=True, n_splits=5)

# estimator whose hyper parameters are searched
base_model = GradientBoostingClassifier()

# candidate values for each hyper parameter
param_grid = dict(
    learning_rate=[0.1, 0.2, 0.3, 1.0, 0.01],
    max_depth=[1, 2, 3],
    n_estimators=[100, 200, 300, 10, 50],
)

# search the grid with cross validation on the training split
grid_model = GridSearchCV(base_model, param_grid, cv=cv)
grid_model.fit(X_train, y_train)

# report the outcome of the search
print(
    f'best_score_: {grid_model.best_score_}',
    f'best_params: {grid_model.best_params_}',
    f'best_estimator_: {grid_model.best_estimator_}',
    sep='\n',
)

# score the fitted search object on the train and test splits
print(
    f'SCORE(TRAIN): {grid_model.score(X_train, y_train)}',
    f' SCORE(TEST): {grid_model.score(X_test, y_test)}',
    sep='\n',
)

best_score_: 0.9666666666666668
best_params: {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 10}
best_estimator_: GradientBoostingClassifier(max_depth=1, n_estimators=10)
SCORE(TRAIN): 0.9833333333333333
 SCORE(TEST): 0.9


## Lecture 02 - 파이프라인

In [None]:
# load dataset
# NOTE: the rest of this lecture assumes a binary target: the grid search is
# scored with 'recall' and the class weights only name labels 0 and 1. The
# three-class iris data does not fit that setup, so the binary breast cancer
# dataset (569 samples) is loaded instead.
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)

# split data: hold out 20% for testing, stratified on the class labels
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    stratify=y)

In [None]:
# imports used by this cell
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# preprocessing step: feature scaling
scaler = StandardScaler()

# classifier to tune; l1_ratio=0 is fixed here, presumably so that the
# 'elasticnet' candidates below have a mixing ratio defined
base_model = LogisticRegression(l1_ratio=0)

# chain scaling and classification; step names are the prefixes used in
# the parameter grid ('base_model__<param>')
pipe = Pipeline(steps=[('scaler', scaler), ('base_model', base_model)])

# two sub-grids that differ only in the (penalty, solver) pair
param_grid = [
    {
        'base_model__penalty': [penalty],
        'base_model__solver': [solver],
        'base_model__C': [1.0, 0.1, 10, 0.01, 100],
        'base_model__class_weight': ['balanced', {0: 0.9, 1: 0.1}],
    }
    for penalty, solver in (('l2', 'lbfgs'), ('elasticnet', 'saga'))
]

# 5-fold shuffled splitter for cross validation
cv = KFold(shuffle=True, n_splits=5)

# search the grid over the whole pipeline, ranking candidates by recall
grid_model = GridSearchCV(estimator=pipe,
                          param_grid=param_grid,
                          cv=cv,
                          scoring='recall')

grid_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# outcome of the hyper parameter search
print(
    f'best_score_: {grid_model.best_score_}',
    f'best_params: {grid_model.best_params_}',
    f'best_estimator_: {grid_model.best_estimator_}',
    sep='\n',
)

print('\n')

# score of the fitted search object on each split
print(
    f'SCORE(TRAIN): {grid_model.score(X_train, y_train)}',
    f' SCORE(TEST): {grid_model.score(X_test, y_test)}',
    sep='\n',
)

print('\n')

# per-class precision / recall / f1 for the train and test predictions
pred_train = grid_model.predict(X_train)
pred_test = grid_model.predict(X_test)
print(
    classification_report(y_train, pred_train),
    classification_report(y_test, pred_test),
    sep='\n',
)

best_score_: 0.980020544427324
best_params: {'base_model__C': 0.01, 'base_model__class_weight': 'balanced', 'base_model__penalty': 'l2', 'base_model__solver': 'lbfgs'}
best_estimator_: Pipeline(steps=[('scaler', StandardScaler()),
                ('base_model',
                 LogisticRegression(C=0.01, class_weight='balanced',
                                    l1_ratio=0))])


SCORE(TRAIN): 0.9894736842105263
 SCORE(TEST): 1.0


              precision    recall  f1-score   support

           0       0.98      0.94      0.96       170
           1       0.96      0.99      0.98       285

    accuracy                           0.97       455
   macro avg       0.97      0.96      0.97       455
weighted avg       0.97      0.97      0.97       455

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        42
           1       1.00      1.00      1.00        72

    accuracy                           1.00       114
   macro avg      

## Lecture 03 - 응용 (데이터 전처리, 교차 검증, 하이퍼 파라미터 탐색...)

In [None]:
# pandas is used to hold the features and target in the cells that follow
import pandas as pd

# limit how many columns / rows are shown when a DataFrame is printed
pd.set_option('display.max_columns', 5)
pd.set_option('display.max_rows', 10)

In [None]:
# imports used by this cell
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# fetch the California housing dataset
data = fetch_california_housing()

# features as a DataFrame with named columns, target as a Series
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data.target)

print(X)

# hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

       MedInc  HouseAge  ...  Latitude  Longitude
0      8.3252      41.0  ...     37.88    -122.23
1      8.3014      21.0  ...     37.86    -122.22
2      7.2574      52.0  ...     37.85    -122.24
3      5.6431      52.0  ...     37.85    -122.25
4      3.8462      52.0  ...     37.85    -122.25
...       ...       ...  ...       ...        ...
20635  1.5603      25.0  ...     39.48    -121.09
20636  2.5568      18.0  ...     39.49    -121.21
20637  1.7000      17.0  ...     39.43    -121.22
20638  1.8672      18.0  ...     39.43    -121.32
20639  2.3886      16.0  ...     39.37    -121.24

[20640 rows x 8 columns]


In [None]:
# imports used by this cell
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# one scaler per column group: scalers[i] is applied to columns[i]
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]
columns = [
    ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms'],  # MinMaxScaler
    ['AveOccup', 'Latitude', 'Longitude'],            # StandardScaler
    ['Population'],                                   # RobustScaler
]

# pair each scaler with its columns; entries are named scaler0, scaler1, ...
transformer = ColumnTransformer([
    (f'scaler{idx}', scaler, cols)
    for idx, (scaler, cols) in enumerate(zip(scalers, columns))
])

# regressor to tune
model = RandomForestRegressor()

# preprocessing followed by the regressor; the step name 'model' is the
# prefix used in the parameter grid
pipe = Pipeline(steps=[('scaler', transformer), ('model', model)])

# 15-fold shuffled splitter for cross validation
cv = KFold(shuffle=True, n_splits=15)

# candidate values for the random forest hyper parameters
param_grid = {
    'model__n_estimators': [100, 50, 20, 10, 25],
    'model__max_depth': [None, 4, 7, 9, 13],
    'model__max_samples': [None, 0.5, 0.3, 0.7],
}

# search the grid over the whole pipeline, ranking candidates by R^2
grid_model = GridSearchCV(pipe, param_grid, scoring='r2', cv=cv)

grid_model.fit(X_train, y_train)

# score the fitted search object on the train and test splits
print(
    f'SCORE(TRAIN): {grid_model.score(X_train, y_train)}',
    f' SCORE(TEST): {grid_model.score(X_test, y_test)}',
    sep='\n',
)

SCORE(TRAIN): 0.973918801661261
 SCORE(TEST): 0.8163378130064459
