## Import

In [19]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

## Data Load

In [20]:
# Load the training table from the working directory.
data = pd.read_csv('./train.csv')

# 'login' is the target column. Both 'person_id' and 'login' are removed from
# the feature matrix, which keeps every remaining column.
y_train = data['login']
X_train = data.drop(columns=['person_id', 'login'])

## Hyperparameters Search

In [21]:


# # GridSearchCV를 위한 하이퍼파라미터 설정
# param_search_space = {
#     'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 3, 5, 7, 10, 15, 20, 30],
#     'min_samples_split': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
#     'min_samples_leaf': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
#     'min_weight_fraction_leaf': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
#     'max_features': ['auto', 'sqrt', 'log2'],
#     'max_leaf_nodes': [None, 3, 5, 7, 10, 15],
#     'min_impurity_decrease': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
#     'bootstrap': [True, False]
# }

# # RandomForestClassifier 객체 생성
# rf = RandomForestClassifier(random_state=42)

# # GridSearchCV 객체 생성
# grid_search = GridSearchCV(estimator=rf, param_grid=param_search_space, cv=3, n_jobs=-1, verbose=2, scoring='roc_auc')

# # GridSearchCV를 사용한 학습
# grid_search.fit(X_train, y_train)

# # 최적의 파라미터와 최고 점수 출력
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# best_params, best_score

In [22]:
# n_estimators,criterion,max_depth,min_samples_split,min_samples_leaf,min_weight_fraction_leaf,max_features,max_leaf_nodes,min_impurity_decrease,bootstrap
# n_estimators : 10 ~ 1000 사이의 양의 정수
# criterion :  'gini', 'entropy'. 'gini'는 지니 불순도를, 'entropy'는 정보 이득을 기준으로 사용
# max_depth : None 또는 양의 정수
# min_samples_split : 2 이상의 정수 또는 0과 1 사이의 실수
# min_samples_leaf : 1 이상의 정수 또는 0과 0.5 사이의 실수
# min_weight_fraction_leaf :  0.0에서 0.5 사이의 실수
# max_features :  'auto', 'sqrt', 'log2' 또는 양의 정수/실수
# max_leaf_nodes :  None 또는 양의 정수
# min_impurity_decrease : 0.0 이상의 실수
# bootstrap : True, False

In [23]:
import optuna
from sklearn.pipeline import make_pipeline

# Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 
#                        'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 
#                        'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].

def objective(trial):
    """Score one sampled RandomForestClassifier configuration for Optuna.

    Draws one value per hyperparameter from ``trial`` and evaluates the
    resulting forest with 3-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters. The suggestion names are the
        scikit-learn parameter names, so ``study.best_params`` uses those keys.

    Returns
    -------
    float
        Mean ROC AUC over the 3 cross-validation folds.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "max_depth": trial.suggest_categorical("max_depth", [None] + list(range(4, 31))),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 50),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 50),
        "min_weight_fraction_leaf": trial.suggest_float("min_weight_fraction_leaf", 0.0, 0.5),
        "max_features": trial.suggest_categorical("max_features", [None, "sqrt", "log2"]),
        "max_leaf_nodes": trial.suggest_categorical("max_leaf_nodes", [None] + list(range(2, 101))),
        # NOTE(review): the logged trials with a large min_impurity_decrease
        # score exactly 0.5; consider narrowing this range -- confirm first.
        "min_impurity_decrease": trial.suggest_float("min_impurity_decrease", 0.0, 1.0),
        "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
    }
    rf = RandomForestClassifier(random_state=42, verbose=0, **params)

    # Each trial evaluates exactly one configuration, so plain cross-validation
    # is sufficient. A one-candidate grid search would yield the same mean fold
    # score but would also refit a forest on the full data and throw it away.
    fold_scores = cross_val_score(
        rf, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1
    )
    return fold_scores.mean()

# Seed the sampler so that re-running the notebook proposes the same sequence
# of trials. TPESampler is Optuna's default sampler, so only the seed changes;
# reproducibility relies on the trials running sequentially, as they do here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=1000)

# Best hyperparameter set, keyed by the suggestion names used in objective().
best_params = study.best_params

[I 2024-03-22 15:56:59,967] A new study created in memory with name: no-name-7ee2a714-7dc0-4a7f-94dd-738839e770f9
[I 2024-03-22 15:57:02,893] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 964, 'criterion': 'gini', 'max_depth': 17, 'min_samples_split': 17, 'min_samples_leaf': 41, 'min_weight_fraction_leaf': 0.10291266165116075, 'max_features': 'sqrt', 'max_leaf_nodes': 65, 'min_impurity_decrease': 0.3530694527316477, 'bootstrap': False}. Best is trial 0 with value: 0.5.
[I 2024-03-22 15:57:06,016] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 886, 'criterion': 'entropy', 'max_depth': 13, 'min_samples_split': 33, 'min_samples_leaf': 15, 'min_weight_fraction_leaf': 0.4880630549972343, 'max_features': 'log2', 'max_leaf_nodes': 73, 'min_impurity_decrease': 0.937761581286794, 'bootstrap': True}. Best is trial 0 with value: 0.5.
[I 2024-03-22 15:57:07,069] Trial 2 finished with value: 0.6023301073006522 and parameters: {'n_estimators': 166, 'criterion':

In [None]:
# Best objective value found by the study (the objective returns the 3-fold mean ROC AUC).
study.best_value

0.7757632076812451

## Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')

# Copy each tuned hyperparameter into its matching column of the submission
# template; parameters without a matching column are skipped.
# NOTE(review): a None value (e.g. max_features) is presumably written by
# to_csv as an empty field -- confirm the submission format accepts that.
for name, chosen in best_params.items():
    if name not in submit.columns:
        continue
    submit[name] = chosen
    print("%s : %s" %(name,chosen))

submit.to_csv('./baseline_submit.csv', index=False)

n_estimators : 324
criterion : gini
max_depth : 7
min_samples_split : 28
min_samples_leaf : 10
min_weight_fraction_leaf : 0.15922281163856353
max_features : None
max_leaf_nodes : 39
min_impurity_decrease : 0.0006763579332763601
bootstrap : True
