In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 colorlog-6.7.0 optuna-3.4.0


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import optuna, pickle

# 데이터 로드

In [3]:
import os

# The presence of ./sample_data is used as the signal that the notebook runs on
# Colab; an existing /content/drive directory means Drive is already mounted.
colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')

if not colab:
    # Local run: working directory for base files, sibling data directory for data.
    base_path = ''
    data_path = '../data/'
else:
    # Colab run: mount Google Drive once, then point both paths into the project.
    if not mount:
        from google.colab import drive
        drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'
    data_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/'

Mounted at /content/drive


In [4]:
# Every CSV is loaded with its 'id' column as the index.
# NOTE(review): the sample submission is read from data_path while train/test are
# read from base_path — presumably intentional (different copies); confirm.
submission_df = pd.read_csv(data_path + 'sample_submission.csv', index_col='id')

train, test = (
    pd.read_csv(base_path + file_name, index_col='id')
    for file_name in ('train.csv', 'test.csv')
)

##### 변수 설정

In [5]:
# 'defects' is the target column; all remaining columns of train are features.
y = train['defects']
X = train.drop(columns=['defects'])
X_test = test  # same object as `test`, not a copy

# Number of stratified cross-validation folds.
K = 15

# 모델 학습

### RandomForest

##### 기본 모델

In [6]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=61, shuffle=True, stratify=y)
# model = RandomForestClassifier(random_state=61)
# model.fit(X_train, y_train)

In [7]:
# y_proba_randomforest_basic = model.predict_proba(X_val)[:, 1]
# roc_auc_score(y_val, y_proba_randomforest_basic)

##### hyper-parameter tuning

In [8]:
import os
# Display the number of CPUs reported by the OS (the cell's last expression is shown).
os.cpu_count()

8

In [9]:
def optimizer(trial):
    """Optuna objective: mean stratified K-fold ROC AUC of a random forest.

    Two hyper-parameters are sampled per trial (``min_samples_leaf`` and
    ``max_features``); ``n_estimators``, ``max_depth`` and ``random_state`` are
    held fixed. Reads the notebook globals ``X``, ``y`` and ``K``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean validation ROC AUC over the ``K`` folds.
    """
    # Disabled search dimensions are kept as comments for reference.
    # n_estimators = trial.suggest_categorical('n_estimators', [200, 300, 400, 500, 600, 700])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 200)
    # max_depth = trial.suggest_int('max_depth', 11, 20)
    max_features = trial.suggest_float('max_features', 0.5, 0.8)
    # min_samples_split = trial.suggest_int('min_samples_split', 2, 50)
    # min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0.0001, 0.001)

    # Leave one core free, but never request fewer than one worker:
    # os.cpu_count() may return None, and on a single-core machine
    # cpu_count() - 1 would be 0, which is not a valid n_jobs value.
    n_workers = max(1, (os.cpu_count() or 1) - 1)

    model = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=min_samples_leaf,
        max_depth=9,
        max_features=max_features,
        # min_samples_split=min_samples_split,
        # min_impurity_decrease=min_impurity_decrease,
        random_state=61,
        n_jobs=n_workers,
    )

    # Folds are not shuffled (original author's note: RandomForest does not need shuffling).
    folds = StratifiedKFold(n_splits=K)
    scores = []
    for train_idx, val_idx in folds.split(X, y):
        # fit() rebuilds the forest from scratch, so one estimator is reused per fold.
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        y_true = y.iloc[val_idx]
        # Column 1 of predict_proba is the probability of the second class.
        y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        scores.append(roc_auc_score(y_true, y_proba))
    return np.mean(scores)

In [None]:
# Seed the sampler so the sequence of suggested hyper-parameters is the same on
# every rerun; without an explicit seed the search is not reproducible.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=61),
)
study.optimize(optimizer, n_trials=100)

[I 2023-10-17 13:38:04,798] A new study created in memory with name: no-name-dafd4efd-c5b3-459e-ac81-44730726dfdb
[I 2023-10-17 13:41:26,138] Trial 0 finished with value: 0.8989424871554429 and parameters: {'min_samples_leaf': 108, 'max_features': 0.6887567981095749}. Best is trial 0 with value: 0.8989424871554429.
[I 2023-10-17 13:44:58,039] Trial 1 finished with value: 0.8961650330523615 and parameters: {'min_samples_leaf': 163, 'max_features': 0.7264604814503048}. Best is trial 0 with value: 0.8989424871554429.
[I 2023-10-17 13:48:05,922] Trial 2 finished with value: 0.8940698623444255 and parameters: {'min_samples_leaf': 181, 'max_features': 0.6610019816497058}. Best is trial 0 with value: 0.8989424871554429.
[I 2023-10-17 13:51:43,218] Trial 3 finished with value: 0.9020302918926452 and parameters: {'min_samples_leaf': 62, 'max_features': 0.7543624946406768}. Best is trial 3 with value: 0.9020302918926452.
[I 2023-10-17 13:55:30,461] Trial 4 finished with value: 0.8997765113235225

In [None]:
# Summarise the finished study: best score, its parameters, and the summed trial durations.
best_score = study.best_value
best_params = study.best_trial.params
total_duration = study.trials_dataframe()['duration'].sum()

print("Best roc_auc_score: %.5f" % best_score)
print("Best params: ", best_params)
print(total_duration)

##### best model

In [None]:
# model_best = RandomForestClassifier(
#     min_samples_leaf=78,
#     max_depth=9,
#     max_features=0.6222106369274514,
#     # min_samples_split=min_samples_split,
#     # min_impurity_decrease=min_impurity_decrease,
#     random_state=61,
#     # n_jobs=-1,
#     n_jobs=os.cpu_count()-1,)

In [None]:
def oof_predict(best_model):
    """Average test-set probabilities over models refit on each CV training split.

    Despite the name, no out-of-fold predictions on the training data are
    produced: for each of the ``K`` shuffled stratified folds a model is fit on
    the training part of the split and used to score ``X_test``; the validation
    part of the split is not used. Reads the notebook globals ``X``, ``y``,
    ``X_test`` and ``K``.

    Parameters
    ----------
    best_model : scikit-learn classifier
        Template estimator. A clone of it is fitted, so the object passed in
        keeps whatever fitted state it had and is not overwritten by the
        per-fold refits.

    Returns
    -------
    numpy.ndarray
        Probability of the second class (column 1 of ``predict_proba``) for
        each row of ``X_test``, averaged over the folds.
    """
    from sklearn.base import clone

    folds = StratifiedKFold(n_splits=K, random_state=61, shuffle=True)
    # Unfitted copy with identical parameters; refit once per fold below.
    fold_model = clone(best_model)
    predicted_probas = []

    for train_idx, _ in folds.split(X, y):
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        predicted_probas.append(fold_model.predict_proba(X_test)[:, 1])
    return np.mean(predicted_probas, axis=0)

In [None]:
# The study only searched min_samples_leaf / max_features while n_estimators=100
# and max_depth=9 were held fixed in the objective, so the final model must use
# the same fixed values. Tuned values win if a key is ever present in both.
best_rf_params = {'n_estimators': 100, 'max_depth': 9, **study.best_trial.params}

model_best = RandomForestClassifier(**best_rf_params, random_state=61, n_jobs=-1)
model_best.fit(X, y)
y_proba = model_best.predict_proba(X_test)[:, 1]

# Hand the fold-averaging helper its own unfitted estimator so that model_best
# stays fitted on the full training set.
y_proba_oof = oof_predict(
    RandomForestClassifier(**best_rf_params, random_state=61, n_jobs=-1)
)

In [None]:
# Save the fitted model; the context manager closes the file handle once the
# dump has finished.
with open(base_path + "rf_best.pickle", "wb") as model_file:
    pickle.dump(model_best, model_file)

##### 분석

In [None]:
# Data for every trial Optuna ran, best score first.
param_analysis = study.trials_dataframe().sort_values(by=['value'], ascending=False)
param_analysis.to_csv(base_path + 'RF_param_analysis.csv')

# Select the hyper-parameter columns by their 'params_' prefix instead of by
# position, so the plot does not depend on the exact column layout.
param_cols = [col for col in param_analysis.columns if col.startswith('params_')]

fig, axes = plt.subplots(2, 3, figsize=(12, 8))
flat_axes = axes.ravel()
for col, ax in zip(param_cols, flat_axes):
    sns.scatterplot(data=param_analysis, x='value', y=col, ax=ax)
# Hide the panels that have no hyper-parameter to show.
for unused_ax in flat_axes[len(param_cols):]:
    unused_ax.set_visible(False)

# Write the figure to disk before displaying it.
fig.savefig(base_path + 'RF_param_analysis.png')
plt.show()

In [None]:
# Visualise the history of the trials in the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Importance of each hyper-parameter.
optuna.visualization.plot_param_importances(study)

# 제출

In [None]:
# Write one submission file per prediction vector. The 'defects' column is
# overwritten in place each time, so the frame displayed at the end holds the
# fold-averaged predictions.
for predictions, file_name in (
    (y_proba, 'submission_RandomForest.csv'),
    (y_proba_oof, 'submission_RandomForest_oof.csv'),
):
    submission_df['defects'] = predictions
    submission_df.to_csv(base_path + file_name)
submission_df