# CatBoost Optuna

## Load data

In [None]:
import pandas as pd

In [None]:
# Read the train and test tables of the "tabular-playground-series-mar-2021"
# dataset from the relative `../input` directory.
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv')
# Show the first rows of the training table as the cell output.
train.head()

In [None]:
# Collect, in column order, the name of every column of `train` that starts
# with 'cat'; this list is later passed to CatBoost as `cat_features`.
cat_cols = [
    column_name
    for column_name in train.columns
    if column_name.startswith('cat')
]
cat_cols

In [None]:
from categorical_transform import CategoricalTransform,IntegerCategoricalTransform
# Fit the project-local transform on the training table, then apply the fitted
# transform to the test table (presumably integer-encodes the `cat_cols`
# columns -- TODO confirm against `categorical_transform`).
# NOTE(review): `x_train` is rebound from the raw `train` frame in the next
# cell, and `x_test` is not referenced again in the visible notebook, so these
# transformed outputs do not reach the model -- confirm this is intended.
ct = IntegerCategoricalTransform(cat_cols)
x_train, x_test = ct.fit_transform(train), ct.transform(test)

In [None]:
# Split the raw training table into the label and the feature matrix.
# The label is the 'target' column; the features are every remaining column
# except the 'id' column.
# NOTE(review): this rebinds `x_train` from the raw `train` frame, replacing
# the value assigned from `ct.fit_transform(train)` in the previous cell.
y_train = train['target']
x_train = train.drop(['id', 'target'], axis='columns')

In [None]:
from sklearn.model_selection import KFold
# Shuffled 5-fold splitter.
# NOTE(review): nothing in the visible notebook reads `kfold`; `objective`
# constructs its own KFold -- confirm whether this cell is still needed.
kfold = KFold(n_splits=5, shuffle=True)

## Optuna optimization

In [None]:
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

def objective(trial):
    """Score one Optuna trial: mean 5-fold ROC AUC of a CatBoost classifier.

    Samples ``depth``, ``l2_leaf_reg``, ``bagging_temperature`` and
    ``auto_class_weights`` from ``trial``; every other CatBoost setting is
    fixed below.  Reads the notebook-level ``x_train``, ``y_train`` and
    ``cat_cols``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean, over the five folds, of the ROC AUC measured on each
        held-out fold.
    """
    params = {
        'iterations': 10000,
        'depth': trial.suggest_int("depth", 4, 16),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 0.0001, 25, log=True),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0, 10),
        'auto_class_weights': trial.suggest_categorical(
            'auto_class_weights', [None, 'Balanced', 'SqrtBalanced']),
        'grow_policy': 'Lossguide',
        'early_stopping_rounds': 200,
        'eval_metric': 'AUC',
        'bootstrap_type': 'Bayesian',
        'use_best_model': True,
        'task_type': 'GPU',
        'cat_features': cat_cols,
        'verbose': False,
        'border_count': 254,
    }
    # Search-space extensions that are currently disabled (kept for reference):
    # 'grow_policy': trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide']),
    # if params['grow_policy'] in ['Depthwise', 'Lossguide']:
    #     params['min_data_in_leaf'] = trial.suggest_int("min_data_in_leaf", 1, 5000, log=True)
    # if params['grow_policy'] in ['Lossguide']:
    #     params['max_leaves'] = trial.suggest_int("max_leaves", 1, 64)

    cbc = CatBoostClassifier(**params)
    # A fixed seed gives every trial the same five folds, so the scores of
    # different trials differ because of the hyperparameters rather than
    # because of a different random partition of the data.
    kf = KFold(n_splits=5, shuffle=True, random_state=0)
    roc_test = []
    for train_index, test_index in kf.split(x_train):
        # KFold.split yields positional row numbers, so select with .iloc;
        # .loc would look them up as index labels and is only equivalent when
        # the frame carries a default 0..n-1 index.
        x_train_fold, x_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        # NOTE(review): the held-out fold is also the eval_set that drives
        # early stopping / use_best_model, so the AUC below is measured on
        # data the iteration count was chosen on -- likely slightly optimistic.
        cbc.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold, y_test_fold))
        proba = cbc.predict_proba(x_test_fold)[:, 1]
        roc_test.append(roc_auc_score(y_test_fold, proba))
    return np.mean(roc_test)

In [None]:
# Create a study that maximises the value returned by `objective`, keep
# running trials until the five-hour time budget (in seconds) is used up,
# then print the best trial found.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, timeout=5 * 3600)
print(study.best_trial)

In [None]:
# Display the sampled hyperparameter values of the best trial.
study.best_params

In [None]:
# Display the best objective value found (the study maximises the mean
# cross-validated ROC AUC returned by `objective`).
study.best_value

In [None]:
# Display how many trials the study recorded within the time budget.
len(study.trials)

In [None]:
# Import Optuna's plotting helpers and show the optimisation history of the
# study as the cell output.
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study)

In [None]:
# Show Optuna's hyperparameter-importance plot for the study
# (`plot_param_importances` is imported in the previous cell).
plot_param_importances(study)