In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from catboost import CatBoostClassifier

In [2]:
X = pd.read_csv('train.csv', index_col = 'id')
# test_df = pd.read_csv('test.csv', index_col = 'id')

In [3]:
feature_cols = [c for c in X.columns if c not in ('claim', 'fold')]
pipeline = Pipeline([('impute', SimpleImputer(strategy='mean')), ('scale', StandardScaler())])

y = X.claim
X = X[feature_cols]
X = pd.DataFrame(columns= feature_cols, data=pipeline.fit_transform(X))
# xtest = pd.DataFrame(columns= feature_cols, data=pipeline.transform(xtest))

In [4]:
# create trial function
OPTUNA_OPTIMIZATION = True

def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)
    
    params = {
        'iterations':trial.suggest_int("iterations", 1000, 20000),
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'od_wait':trial.suggest_int('od_wait', 500, 2000),
        'learning_rate' : trial.suggest_uniform('learning_rate',0.02,1),
        'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
        'random_strength': trial.suggest_uniform('random_strength',10,50),
        'depth': trial.suggest_int('depth',1,15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'verbose': False,
        'task_type' : 'GPU',
        'devices' : '0'
    }
    
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    
    model = CatBoostClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_test,y_test)],
        early_stopping_rounds=100,
        use_best_model=True
    )
    
    # validation prediction
    y_hat = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_hat)
    score = auc(fpr, tpr)
    
    return score

In [5]:
#create optuna study
study = optuna.create_study(direction='maximize', study_name='CatbClf')
study.optimize(objective, n_trials=200)

[32m[I 2021-09-13 12:27:17,070][0m A new study created in memory with name: CatbClf[0m
[32m[I 2021-09-13 12:27:57,436][0m Trial 0 finished with value: 0.5157398024784275 and parameters: {'iterations': 2819, 'objective': 'Logloss', 'bootstrap_type': 'Bayesian', 'od_wait': 1265, 'learning_rate': 0.7359773265891371, 'reg_lambda': 69.54556779363972, 'random_strength': 25.932680648080776, 'depth': 14, 'min_data_in_leaf': 23, 'leaf_estimation_iterations': 11, 'bagging_temperature': 5.356327714729467}. Best is trial 0 with value: 0.5157398024784275.[0m
[32m[I 2021-09-13 12:28:07,643][0m Trial 1 finished with value: 0.6977232674790982 and parameters: {'iterations': 6499, 'objective': 'Logloss', 'bootstrap_type': 'Bayesian', 'od_wait': 1233, 'learning_rate': 0.9497811295491526, 'reg_lambda': 69.89371413520327, 'random_strength': 43.96644038703017, 'depth': 9, 'min_data_in_leaf': 12, 'leaf_estimation_iterations': 4, 'bagging_temperature': 4.754457072189099}. Best is trial 1 with value: 0

KeyboardInterrupt: 

In [None]:
print(f"Best Trial: {study.best_trial.value}")
print(f"Best Params: {study.best_trial.params}")