In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import optuna

In [3]:
from sklearn.metrics import roc_auc_score

# 데이터 로드

In [4]:
import os

colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')
if colab:
    if not mount:
        from google.colab import drive
        drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'
    data_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/'
else:
    base_path = ''
    data_path = '../data/'

Mounted at /content/drive


In [5]:
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

train = pd.read_csv(base_path + 'train.csv', index_col='id')
test = pd.read_csv(base_path + 'test.csv', index_col='id')

##### 변수 설정

In [6]:
X = train.drop(columns=['defects'])
y = train['defects']
X_test = test

# 모델 학습

### RandomForest

##### 기본 모델

In [7]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=61, shuffle=True, stratify=y)
# model = RandomForestClassifier(random_state=61)
# model.fit(X_train, y_train)

In [8]:
# y_proba_randomforest_basic = model.predict_proba(X_val)[:, 1]
# roc_auc_score(y_val, y_proba_randomforest_basic)

##### hyper-parameter tuning

In [9]:
import os
os.cpu_count()

8

In [10]:
def optimizer(trial):

    # n_estimators = trial.suggest_categorical('n_estimators', [200, 300, 400, 500, 600, 700])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 200)
    # max_depth = trial.suggest_int('max_depth', 11, 20)
    max_features = trial.suggest_float('max_features', 0.5, 0.8)
    # min_samples_split = trial.suggest_int('min_samples_split', 2, 50)
    # min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0.0001, 0.001)

    model = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=min_samples_leaf,
        max_depth=9,
        max_features=max_features,
        # min_samples_split=min_samples_split,
        # min_impurity_decrease=min_impurity_decrease,
        random_state=61,
        # n_jobs=-1,
        n_jobs=os.cpu_count()-1,
    )

    folds = StratifiedKFold() # RandomForest는 shuffle이 필요 없음.
    scores = []
    for train_idx, val_idx in folds.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        y_true = y.iloc[val_idx]
        y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        scores.append(roc_auc_score(y_true, y_proba))
    return np.mean(scores)

In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(optimizer, n_trials=100)

[I 2023-10-14 16:22:17,270] A new study created in memory with name: no-name-6f3611e9-eaa6-4dd0-91f6-e5e8da8eaafd
[I 2023-10-14 16:22:38,426] Trial 0 finished with value: 0.7914225992030428 and parameters: {'min_samples_leaf': 42, 'max_features': 0.5592527275789622}. Best is trial 0 with value: 0.7914225992030428.
[I 2023-10-14 16:22:59,790] Trial 1 finished with value: 0.79146437675895 and parameters: {'min_samples_leaf': 144, 'max_features': 0.5778010050875467}. Best is trial 1 with value: 0.79146437675895.
[I 2023-10-14 16:23:20,877] Trial 2 finished with value: 0.7913181873891523 and parameters: {'min_samples_leaf': 35, 'max_features': 0.5662944923890983}. Best is trial 1 with value: 0.79146437675895.
[I 2023-10-14 16:23:43,772] Trial 3 finished with value: 0.7915231141087453 and parameters: {'min_samples_leaf': 124, 'max_features': 0.6545655223817263}. Best is trial 3 with value: 0.7915231141087453.
[I 2023-10-14 16:24:08,607] Trial 4 finished with value: 0.7914294813256475 and pa

In [22]:
print("Best roc_auc_score: %.5f" % study.best_value)
print("Best params: ", study.best_trial.params) # best score일 때의 하이퍼파라미터들

Best roc_auc_score: 0.79166
Best params:  {'min_samples_leaf': 108, 'max_features': 0.5923668617600338}


##### best model

In [13]:
# model_best = RandomForestClassifier(
#     min_samples_leaf=78,
#     max_depth=9,
#     max_features=0.6222106369274514,
#     # min_samples_split=min_samples_split,
#     # min_impurity_decrease=min_impurity_decrease,
#     random_state=61,
#     # n_jobs=-1,
#     n_jobs=os.cpu_count()-1,)

In [14]:
def oof_predict(best_model):
    folds = StratifiedKFold(random_state=61, shuffle=True)
    final_preds = []

    for train_idx, val_idx in folds.split(X, y):
        best_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        final_preds.append(best_model.predict_proba(X_test)[:, 1])
    return np.mean(final_preds, axis=0)

In [15]:
model_best = RandomForestClassifier(**study.best_trial.params,
                                    random_state=61, n_jobs=os.cpu_count()-1)
model_best.fit(X, y)
y_proba = model_best.predict_proba(X_test)[:, 1]
y_proba_oof = oof_predict(model_best)

##### 분석

In [16]:
# optuna가 시도했던 모든 실험 관련 데이터
study.trials_dataframe().to_csv(base_path + 'RandomForest_param_analysis.csv')

In [17]:
study.trials_dataframe().sum()

  study.trials_dataframe().sum()


number                                                                  4950
value                                                              79.150488
duration                                              0 days 00:36:34.485479
params_max_features                                                60.421999
params_min_samples_leaf                                                10767
state                      COMPLETECOMPLETECOMPLETECOMPLETECOMPLETECOMPLE...
dtype: object

In [18]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study)

In [19]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study)

# 제출

In [20]:
submission_df['defects'] = y_proba
submission_df.to_csv(base_path + 'submission_RandomForest.csv')
submission_df['defects'] = y_proba_oof
submission_df.to_csv(base_path + 'submission_RandomForest_oof.csv')
submission_df

Unnamed: 0_level_0,defects
id,Unnamed: 1_level_1
101763,0.239495
101764,0.201059
101765,0.639335
101766,0.471821
101767,0.135716
...,...
169600,0.295977
169601,0.127975
169602,0.176625
169603,0.091786


In [21]:
base_path

'/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'