In [25]:
# !pip install optuna

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import optuna

In [27]:
from sklearn.metrics import roc_auc_score

# 데이터 로드

In [28]:
import os

colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')
if colab:
    if not mount:
        from google.colab import drive
        drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'
    data_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/'
else:
    base_path = ''
    data_path = '../data/'

In [29]:
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

train = pd.read_csv(base_path + 'train.csv', index_col='id')
test = pd.read_csv(base_path + 'test.csv', index_col='id')

##### 변수 설정

In [30]:
X = train.drop(columns=['defects'])
y = train['defects']
X_test = test

# 모델 학습

### RandomForest

##### 기본 모델

In [31]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=61, shuffle=True, stratify=y)
# model = RandomForestClassifier(random_state=61)
# model.fit(X_train, y_train)

In [32]:
# y_proba_randomforest_basic = model.predict_proba(X_val)[:, 1]
# roc_auc_score(y_val, y_proba_randomforest_basic)

##### hyper-parameter tuning

In [33]:
import os
os.cpu_count()

12

In [34]:
def optimizer(trial):

    # n_estimators = trial.suggest_categorical('n_estimators', [200, 300, 400, 500, 600, 700])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 200)
    # max_depth = trial.suggest_int('max_depth', 11, 20)
    max_features = trial.suggest_float('max_features', 0.5, 0.8)
    # min_samples_split = trial.suggest_int('min_samples_split', 2, 50)
    # min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0.0001, 0.001)

    model = RandomForestClassifier(
        n_estimators=100,
        min_samples_leaf=min_samples_leaf,
        max_depth=9,
        max_features=max_features,
        # min_samples_split=min_samples_split,
        # min_impurity_decrease=min_impurity_decrease,
        random_state=61,
        # n_jobs=-1,
        n_jobs=os.cpu_count()-1,
    )

    folds = StratifiedKFold() # RandomForest는 shuffle이 필요 없음.
    scores = []
    for train_idx, val_idx in folds.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        y_true = y.iloc[val_idx]
        y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        scores.append(roc_auc_score(y_true, y_proba))
    return np.mean(scores)

In [35]:
study = optuna.create_study(direction="maximize")
study.optimize(optimizer, n_trials=100)

[I 2023-10-14 10:12:26,205] A new study created in memory with name: no-name-3f76d21f-9462-4c3f-ba6e-fc3657be36e0
[I 2023-10-14 10:12:41,934] Trial 0 finished with value: 0.7915537506613177 and parameters: {'min_samples_leaf': 55, 'max_features': 0.6121218590124704}. Best is trial 0 with value: 0.7915537506613177.
[I 2023-10-14 10:12:58,950] Trial 1 finished with value: 0.7915371034725169 and parameters: {'min_samples_leaf': 43, 'max_features': 0.6440560776461831}. Best is trial 0 with value: 0.7915537506613177.
[I 2023-10-14 10:13:13,392] Trial 2 finished with value: 0.7913450153824725 and parameters: {'min_samples_leaf': 61, 'max_features': 0.5312052459915969}. Best is trial 0 with value: 0.7915537506613177.
[I 2023-10-14 10:13:32,703] Trial 3 finished with value: 0.7915218720846143 and parameters: {'min_samples_leaf': 137, 'max_features': 0.7948961469084206}. Best is trial 0 with value: 0.7915537506613177.
[I 2023-10-14 10:13:47,612] Trial 4 finished with value: 0.791376507732912 an

In [36]:
print("Best roc_auc_score: %.4f" % study.best_value)
print("Best params: ", study.best_trial.params) # best score일 때의 하이퍼파라미터들

Best roc_auc_score: 0.7917
Best params:  {'min_samples_leaf': 78, 'max_features': 0.6222106369274514}


##### best model

In [37]:
model_best = RandomForestClassifier(**study.best_trial.params, random_state=61)
model_best.fit(X, y)

In [38]:
y_proba = model_best.predict_proba(X_test)[:, 1]
y_proba

array([0.2230157 , 0.19362409, 0.6032844 , ..., 0.15667104, 0.08968615,
       0.84649445])

##### 분석

In [39]:
# optuna가 시도했던 모든 실험 관련 데이터
study.trials_dataframe().to_csv(base_path + 'RandomForest_param_analysis.csv')

In [40]:
study.trials_dataframe().sum()

  study.trials_dataframe().sum()


number                                                                  4950
value                                                              79.150312
duration                                              0 days 00:25:43.416854
params_max_features                                                59.794095
params_min_samples_leaf                                                 9271
state                      COMPLETECOMPLETECOMPLETECOMPLETECOMPLETECOMPLE...
dtype: object

In [41]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study)

In [42]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study)

# 제출

In [43]:
submission_df['defects'] = y_proba
submission_df.to_csv(base_path + 'submission_randomforest.csv')
submission_df

Unnamed: 0_level_0,defects
id,Unnamed: 1_level_1
101763,0.223016
101764,0.193624
101765,0.603284
101766,0.476255
101767,0.143227
...,...
169600,0.283164
169601,0.139627
169602,0.156671
169603,0.089686
