In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from catboost import CatBoostClassifier

In [2]:
X = pd.read_csv('train.csv', index_col = 'id')
# test_df = pd.read_csv('test.csv', index_col = 'id')

In [3]:
feature_cols = [c for c in X.columns if c not in ('claim', 'fold')]
pipeline = Pipeline([('impute', SimpleImputer(strategy='mean')), ('scale', StandardScaler())])

y = X.claim
X = X[feature_cols]
X = pd.DataFrame(columns= feature_cols, data=pipeline.fit_transform(X))
# xtest = pd.DataFrame(columns= feature_cols, data=pipeline.transform(xtest))

In [4]:
# create trial function
OPTUNA_OPTIMIZATION = True

def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)
    
    params = {
        'iterations':trial.suggest_int("iterations", 1000, 20000),
        'objective': trial.suggest_categorical('objective', ['Logloss', 'CrossEntropy']),
        'bootstrap_type': trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']),
        'od_wait':trial.suggest_int('od_wait', 500, 2000),
        'learning_rate' : trial.suggest_uniform('learning_rate',0.02,1),
        'reg_lambda': trial.suggest_uniform('reg_lambda',1e-5,100),
        'random_strength': trial.suggest_uniform('random_strength',10,50),
        'depth': trial.suggest_int('depth',1,15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf',1,30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations',1,15),
        'verbose': False,
        'task_type' : 'GPU',
        'devices' : '0'
    }
    
    if params['bootstrap_type'] == 'Bayesian':
        params['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif params['bootstrap_type'] == 'Bernoulli':
        params['subsample'] = trial.suggest_float('subsample', 0.1, 1)
    
    model = CatBoostClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_test,y_test)],
        early_stopping_rounds=100,
        use_best_model=True
    )
    
    # validation prediction
    y_hat = model.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_hat)
    score = auc(fpr, tpr)
    
    return score

In [None]:
#create optuna study
study = optuna.create_study(direction='maximize', study_name='CatbClf')
study.optimize(objective, n_trials=200)

[32m[I 2021-12-29 17:04:03,264][0m A new study created in memory with name: CatbClf[0m
[32m[I 2021-12-29 17:04:15,782][0m Trial 0 finished with value: 0.5263278967217563 and parameters: {'iterations': 1701, 'objective': 'Logloss', 'bootstrap_type': 'Bayesian', 'od_wait': 1864, 'learning_rate': 0.4678946401545259, 'reg_lambda': 29.982880815593013, 'random_strength': 23.2854965835295, 'depth': 11, 'min_data_in_leaf': 27, 'leaf_estimation_iterations': 14, 'bagging_temperature': 9.713990390095578}. Best is trial 0 with value: 0.5263278967217563.[0m
[32m[I 2021-12-29 17:04:54,847][0m Trial 1 finished with value: 0.6094515282652326 and parameters: {'iterations': 9101, 'objective': 'CrossEntropy', 'bootstrap_type': 'MVS', 'od_wait': 1209, 'learning_rate': 0.26236758364063395, 'reg_lambda': 57.95832074784837, 'random_strength': 33.507633417531444, 'depth': 14, 'min_data_in_leaf': 18, 'leaf_estimation_iterations': 13}. Best is trial 1 with value: 0.6094515282652326.[0m
[32m[I 2021-12-

[32m[I 2021-12-29 17:15:57,256][0m Trial 18 finished with value: 0.796216300122328 and parameters: {'iterations': 3876, 'objective': 'Logloss', 'bootstrap_type': 'Bernoulli', 'od_wait': 1701, 'learning_rate': 0.23959104035201506, 'reg_lambda': 42.2923224674158, 'random_strength': 34.001712550700674, 'depth': 4, 'min_data_in_leaf': 30, 'leaf_estimation_iterations': 1, 'subsample': 0.34840371742934517}. Best is trial 12 with value: 0.7978938887584122.[0m
[32m[I 2021-12-29 17:16:06,360][0m Trial 19 finished with value: 0.7933828700578697 and parameters: {'iterations': 11815, 'objective': 'CrossEntropy', 'bootstrap_type': 'Bernoulli', 'od_wait': 1044, 'learning_rate': 0.3764604617890267, 'reg_lambda': 69.21657654254551, 'random_strength': 10.382386699462, 'depth': 7, 'min_data_in_leaf': 19, 'leaf_estimation_iterations': 8, 'subsample': 0.6848518384263221}. Best is trial 12 with value: 0.7978938887584122.[0m
[32m[I 2021-12-29 17:16:15,328][0m Trial 20 finished with value: 0.64854444

In [None]:
print(f"Best Trial: {study.best_trial.value}")
print(f"Best Params: {study.best_trial.params}")