In [None]:
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import pickle

In [None]:
# Training table; the `id` column becomes the row index.
data = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv', index_col='id')

# Load a previously pickled Optuna study so the optimisation below continues from it.
# NOTE(review): pickle.load can execute arbitrary code -- only load a study file you
# produced yourself.
# The context manager closes the handle even if unpickling raises.
with open('../input/catboost-tuning/cb_optimizing_study.pickle', 'rb') as study_file:
    study = pickle.load(study_file)

In [None]:
# Extra feature: how many cells are missing in each row. The count runs over every
# column currently in `data` (NOTE(review): that includes the `claim` target -- confirm
# it never contains NaN).
data['n_missing'] = data.isnull().sum(axis='columns')

In [None]:
# Separate the target column from the feature matrix.
y = data['claim']
X = data.drop(columns='claim')

In [None]:
def objective(trial, X, y, n_splits, random_state=42):
    """Optuna objective: out-of-fold ROC AUC of a CatBoost classifier.

    Samples ``l2_leaf_reg``, ``subsample``, ``max_depth``, ``min_child_samples``
    and ``colsample_bylevel`` from ``trial``, trains one model per stratified
    fold and scores the concatenated out-of-fold probabilities.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyper-parameters.
    X : pandas.DataFrame
        Feature matrix (any index; rows are selected by position).
    y : pandas.Series
        Binary target aligned row-for-row with ``X``.
    n_splits : int
        Number of stratified folds.
    random_state : int, default 42
        Seed for both the fold shuffling and CatBoost.

    Returns
    -------
    float
        ROC AUC computed over all out-of-fold predictions.
    """
    skf = StratifiedKFold(n_splits, shuffle=True, random_state=random_state)
    params = {
        'objective': 'Logloss',
        'eval_metric': 'AUC',
        'n_estimators': 20000,
        'learning_rate': 0.1,
        'random_state': random_state,
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 10),
        'bootstrap_type': 'Bernoulli',
        'subsample': trial.suggest_uniform('subsample', 0.2, 1),
        'sampling_frequency': 'PerTree',
        'use_best_model': True,
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 256),
        'colsample_bylevel': trial.suggest_uniform('colsample_bylevel', 0.1, 1),
        'od_type': 'Iter',
        'early_stopping_rounds': 150,
        'logging_level': 'Silent'
    }

    # Uninitialised buffer; the validation folds partition all rows, so every
    # slot is overwritten before scoring.
    oof = np.empty_like(y, dtype='float64')
    for train_idx, val_idx in skf.split(X, y):
        model = CatBoostClassifier(**params)
        # skf.split yields *positional* indices, so select with .iloc. Label-based
        # .loc / y[...] is only correct when the index is exactly 0..n-1.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        # The validation fold doubles as the early-stopping eval set
        # (use_best_model=True), so the resulting score is somewhat optimistic.
        model.fit(X_train, y_train, eval_set=(X_val, y_val))
        oof[val_idx] = model.predict_proba(X_val)[:, 1]
    return roc_auc_score(y, oof)

In [None]:
# Search budget and cross-validation settings.
n_trials = None        # no cap on the number of trials; only `timeout` bounds the run
timeout = 6 * 60 * 60  # seconds (6 hours)
n_splits = 5
random_state = 42

study.optimize(
    lambda trial: objective(trial, X, y, n_splits, random_state=random_state),
    timeout=timeout,
    n_trials=n_trials,
)

In [None]:
# Persist the study to the working directory. The context manager guarantees the
# file is flushed and closed, which the bare open(...) passed inline did not.
with open('cb_optimizing_study.pickle', 'wb') as study_file:
    pickle.dump(study, study_file)

In [None]:
def get_params(study, random_state=42):
    """Build the full CatBoost parameter dict from a study's best trial.

    The tuned values from ``study.best_params`` are combined with the fixed
    settings used during tuning (loss, metric, learning rate, early stopping,
    ...). Fixed settings win if a key appears in both.

    Parameters
    ----------
    study : object exposing a ``best_params`` mapping (e.g. an Optuna study).
    random_state : int, default 42
        Seed stored under ``'random_state'``. Passed explicitly instead of being
        read from a notebook-level global, so the function works on a fresh
        kernel regardless of cell order. Pass the tuning seed if it differs
        from the default.

    Returns
    -------
    dict
        A new dict; the mapping returned by ``study.best_params`` is not mutated.
    """
    fixed = {
        'objective': 'Logloss',
        'learning_rate': 0.1,
        'eval_metric': 'AUC',
        'n_estimators': 20000,
        'bootstrap_type': 'Bernoulli',
        'logging_level': 'Silent',
        'sampling_frequency': 'PerTree',
        'use_best_model': True,
        'od_type': 'Iter',
        'early_stopping_rounds': 150,
        'random_state': random_state,
    }
    # Tuned values first, fixed settings second, so the fixed ones take precedence.
    return {**study.best_params, **fixed}

In [None]:
# Final CatBoost configuration: best tuned values plus the fixed training settings.
params = get_params(study=study)

In [None]:
# Header line followed by the parameter dict on its own line.
print('CatBoost best params:', params, sep='\n')