In [1]:
# !pip install optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

import optuna

In [None]:
from sklearn.metrics import roc_auc_score

# 데이터 로드

In [None]:
import os

colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')
if colab:
    if not mount:
        from google.colab import drive
        drive.mount('/content/drive')
    base_path = '/content/drive/Othercomputers/내 컴퓨터/5_ML_Project/dulee/'
    data_path = '/content/drive/Othercomputers/내 컴퓨터/5_ML_Project/data/'
else:
    base_path = ''
    data_path = '../data/'

In [None]:
submission_df = pd.read_csv(data_path+'sample_submission.csv', index_col='id')

train = pd.read_csv(base_path + 'train.csv', index_col='id')
test = pd.read_csv(base_path + 'test.csv', index_col='id')

##### 변수 설정

In [None]:
X = train.drop(columns=['defects'])
y = train['defects']
X_test = test

# 모델 학습

### RandomForest

##### 기본 모델

In [None]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=61, shuffle=True, stratify=y)
# model = RandomForestClassifier(random_state=61)
# model.fit(X_train, y_train)

In [None]:
# y_proba_randomforest_basic = model.predict_proba(X_val)[:, 1]
# roc_auc_score(y_val, y_proba_randomforest_basic)

##### hyper-parameter tuning

In [None]:
import os
os.cpu_count()

8

In [None]:
def optimizer(trial):

    n_estimators = trial.suggest_categorical('n_estimators', [100, 150, 200])  # 변경!! 500까지
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_depth = trial.suggest_int('max_depth', 11, 20)
    max_features = trial.suggest_float('max_features', 0.5, 0.8)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 50)
    # min_impurity_decrease = trial.suggest_float('min_impurity_decrease', 0.0001, 0.001)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_split=min_samples_split,
        # min_impurity_decrease=min_impurity_decrease,
        random_state=61,
        # n_jobs=-1,
        n_jobs=os.cpu_count()-1,
    )

    folds = StratifiedKFold() # RandomForest는 shuffle이 필요 없음.
    scores = []
    for train_idx, val_idx in folds.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])

        y_true = y.iloc[val_idx]
        y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
        scores.append(roc_auc_score(y_true, y_proba))
    return np.mean(scores)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(optimizer, n_trials=100)

[I 2023-10-13 08:27:00,965] A new study created in memory with name: no-name-eed49c60-f01a-47e5-a746-97b23abbbd36
[I 2023-10-13 08:27:50,319] Trial 0 finished with value: 0.7913958556736717 and parameters: {'n_estimators': 150, 'min_samples_leaf': 9, 'max_features': 0.623060346039235, 'min_samples_split': 32}. Best is trial 0 with value: 0.7913958556736717.
[I 2023-10-13 08:28:23,662] Trial 1 finished with value: 0.7910613231096709 and parameters: {'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 0.6322167212783414, 'min_samples_split': 37}. Best is trial 0 with value: 0.7913958556736717.
[I 2023-10-13 08:29:03,258] Trial 2 finished with value: 0.7912996432715177 and parameters: {'n_estimators': 100, 'min_samples_leaf': 15, 'max_features': 0.77146303593656, 'min_samples_split': 15}. Best is trial 0 with value: 0.7913958556736717.
[I 2023-10-13 08:29:45,067] Trial 3 finished with value: 0.7912818272155407 and parameters: {'n_estimators': 150, 'min_samples_leaf': 3, 'max_featu

In [None]:
print("Best roc_auc_score: %.4f" % study.best_value)
print("Best params: ", study.best_trial.params) # best score일 때의 하이퍼파라미터들

Best roc_auc_score: 0.7916
Best params:  {'n_estimators': 200, 'min_samples_leaf': 12, 'max_features': 0.6014686638877126, 'min_samples_split': 24}


##### best model

In [None]:
model_best = RandomForestClassifier(**study.best_trial.params, random_state=61)
model_best.fit(X, y)

In [None]:
y_proba = model_best.predict_proba(X_test)[:, 1]
y_proba

array([0.25851054, 0.14009606, 0.62245951, ..., 0.16399582, 0.10925127,
       0.90157781])

##### 분석

In [None]:
# optuna가 시도했던 모든 실험 관련 데이터
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.791396,2023-10-13 08:27:00.967904,2023-10-13 08:27:50.318899,0 days 00:00:49.350995,0.623060,9,32,150,COMPLETE
1,1,0.791061,2023-10-13 08:27:50.320856,2023-10-13 08:28:23.661848,0 days 00:00:33.340992,0.632217,1,37,100,COMPLETE
2,2,0.791300,2023-10-13 08:28:23.663686,2023-10-13 08:29:03.257667,0 days 00:00:39.593981,0.771463,15,15,100,COMPLETE
3,3,0.791282,2023-10-13 08:29:03.259769,2023-10-13 08:29:45.067203,0 days 00:00:41.807434,0.550841,3,6,150,COMPLETE
4,4,0.791148,2023-10-13 08:29:45.069178,2023-10-13 08:30:21.724783,0 days 00:00:36.655605,0.724363,8,41,100,COMPLETE
...,...,...,...,...,...,...,...,...,...,...
95,95,0.791586,2023-10-13 09:48:31.074832,2023-10-13 09:49:26.430232,0 days 00:00:55.355400,0.604497,12,22,200,COMPLETE
96,96,0.791586,2023-10-13 09:49:26.432461,2023-10-13 09:50:21.928075,0 days 00:00:55.495614,0.609237,12,23,200,COMPLETE
97,97,0.791586,2023-10-13 09:50:21.929925,2023-10-13 09:51:17.285942,0 days 00:00:55.356017,0.609631,12,23,200,COMPLETE
98,98,0.791586,2023-10-13 09:51:17.287735,2023-10-13 09:52:12.966721,0 days 00:00:55.678986,0.612970,12,22,200,COMPLETE


In [None]:
study.trials_dataframe().sum()





number                                                                   4950
value                                                               79.145498
duration                                               0 days 01:26:07.320176
params_max_features                                                 59.416866
params_min_samples_leaf                                                  1425
params_min_samples_split                                                 2276
params_n_estimators                                                     18400
state                       COMPLETECOMPLETECOMPLETECOMPLETECOMPLETECOMPLE...
dtype: object

In [None]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study)

In [None]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study)

# 제출

In [None]:
submission_df['defects'] = y_proba
submission_df.to_csv(base_path + 'submission_randomforest.csv')
submission_df

Unnamed: 0_level_0,defects
id,Unnamed: 1_level_1
101763,0.258511
101764,0.140096
101765,0.622460
101766,0.479302
101767,0.178362
...,...
169600,0.209872
169601,0.118576
169602,0.163996
169603,0.109251


In [None]:
base_path

'/content/drive/Othercomputers/내 컴퓨터/5_ML_Project/dulee/'