In [10]:
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import KFold
from sklearn.utils.class_weight import compute_class_weight

In [11]:
with open('data/X_train.npy', 'rb') as f:
    X_train = np.load(f)

In [12]:
path_y_train = 'data/y_train.csv'

y_train = pd.read_csv(path_y_train, index_col=0)
y_train = y_train.values.flatten()

In [13]:
from sklearn.metrics import roc_auc_score


def gini(auc_score):
    return 2 * auc_score - 1

In [14]:
class_weights = compute_class_weight(class_weight="balanced",
                                     classes=np.unique(y_train),
                                     y=y_train)

In [15]:
# kf = KFold(n_splits=5, shuffle=True)
# for train_index, test_index in kf.split(X_train):
#     x_train_fold, x_test_fold = X_train[train_index], X_train[test_index]
    
#     print(x_train_fold.shape, x_test_fold.shape)

In [16]:
# from sklearn.model_selection import train_test_split


# X_train_preprocess2, X_valid_preprocess2, y_train_preprocess2, y_valid_preprocess2 =\
#       train_test_split(X_train, y_train.values, test_size=0.33, random_state=100)

In [17]:
from catboost import CatBoostClassifier
import optuna


def objective(trial):
    params = {'iterations': 10000,
              'loss_function': 'Logloss',
            #   'score_function': 'NewtonL2',
              'depth': trial.suggest_int("depth", 4, 16),
              'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 0.0001, 25, log=True),
              'bagging_temperature': trial.suggest_float("bagging_temperature", 0, 10),
              'auto_class_weights':trial.suggest_categorical('auto_class_weights', [None,'Balanced','SqrtBalanced']),
            #   'grow_policy': 'Lossguide',
              'grow_policy': trial.suggest_categorical('grow_policy',['SymmetricTree','Depthwise','Lossguide']), 
              'early_stopping_rounds':200,
            #   'eval_metric':'AUC',   # NormalizedGini
              'bootstrap_type':'Bayesian',
              'use_best_model':True,
              'task_type':'GPU', 
              'verbose':False,
              'border_count':254              
             }
                 
    if params['grow_policy'] in ['Depthwise','Lossguide']:
       params['min_data_in_leaf'] = trial.suggest_int("min_data_in_leaf", 1, 5000, log=True)
    if params['grow_policy'] in ['Lossguide']:
       params['max_leaves'] = trial.suggest_int("max_leaves", 1, 64)
    
    cbc = CatBoostClassifier(**params)
    kf = KFold(n_splits=5, shuffle=True)
    gini_test = []
    for train_index, test_index in kf.split(X_train):
        x_train_fold, x_test_fold = X_train[train_index], X_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        cbc.fit(x_train_fold, y_train_fold,
                eval_set=(x_test_fold, y_test_fold))

        # pred_proba = cbc.predict_proba(x_test_fold)[:,1]
        pred = cbc.predict(x_test_fold)
        gini_test.append(gini(roc_auc_score(y_test_fold, pred)))

    return np.mean(gini_test)

In [18]:
# params = {'iterations': 10000,
#           'learning_rate': 0.0034,
#           'depth': 12,
#           'loss_function': 'Logloss',
#           'eval_metric': 'NormalizedGini', # 'AUC'
#           'score_function': 'NewtonL2', # 'Cosine', 'L2', 'NewtonCosine', 'NewtonL2'
#           # 'bagging_temperature': 1,
#           # 'scale_pos_weight': 10,
#           'auto_class_weights': 'Balanced', # 'SqrtBalanced'
#           'grow_policy': 'SymmetricTree',   # 'Depthwise', 'Lossguide', 'SymmetricTree'
#           'bootstrap_type': 'Bernoulli', # 'Bernoulli', 'Bayesian'
#           'early_stopping_rounds': 200,
#         #   'cat_features': cat_features,
#           'task_type': 'GPU',
#           'border_count': 254,  # 254
#           'verbose': 200,
#           'random_seed': 0,
#         #   'ignored_features': ignored_features
#          }

In [19]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=2*60*60)  # 2h
print(study.best_trial)

[I 2023-08-16 15:09:23,133] A new study created in memory with name: no-name-3daddd9f-a33e-40d8-ac84-1f7eab0268c7
[I 2023-08-16 15:52:44,345] Trial 0 finished with value: 0.32923402496918686 and parameters: {'depth': 7, 'l2_leaf_reg': 0.011294834927782476, 'bagging_temperature': 9.297813770415363, 'auto_class_weights': 'Balanced', 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_leaves': 41}. Best is trial 0 with value: 0.32923402496918686.
[I 2023-08-16 16:22:10,097] Trial 1 finished with value: 0.07344483924391429 and parameters: {'depth': 12, 'l2_leaf_reg': 0.0784551864476511, 'bagging_temperature': 7.211219257274035, 'auto_class_weights': None, 'grow_policy': 'Depthwise', 'min_data_in_leaf': 184}. Best is trial 0 with value: 0.32923402496918686.
[I 2023-08-16 16:32:18,204] Trial 2 finished with value: 0.08031177223321614 and parameters: {'depth': 12, 'l2_leaf_reg': 9.892701729160555, 'bagging_temperature': 1.1715681227390884, 'auto_class_weights': None, 'grow_policy': 'Dept

FrozenTrial(number=0, state=TrialState.COMPLETE, values=[0.32923402496918686], datetime_start=datetime.datetime(2023, 8, 16, 15, 9, 23, 135515), datetime_complete=datetime.datetime(2023, 8, 16, 15, 52, 44, 345822), params={'depth': 7, 'l2_leaf_reg': 0.011294834927782476, 'bagging_temperature': 9.297813770415363, 'auto_class_weights': 'Balanced', 'grow_policy': 'Lossguide', 'min_data_in_leaf': 10, 'max_leaves': 41}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'depth': IntDistribution(high=16, log=False, low=4, step=1), 'l2_leaf_reg': FloatDistribution(high=25.0, log=True, low=0.0001, step=None), 'bagging_temperature': FloatDistribution(high=10.0, log=False, low=0.0, step=None), 'auto_class_weights': CategoricalDistribution(choices=(None, 'Balanced', 'SqrtBalanced')), 'grow_policy': CategoricalDistribution(choices=('SymmetricTree', 'Depthwise', 'Lossguide')), 'min_data_in_leaf': IntDistribution(high=5000, log=True, low=1, step=1), 'max_leaves': IntDistribution(

In [20]:
study.best_params

{'depth': 7,
 'l2_leaf_reg': 0.011294834927782476,
 'bagging_temperature': 9.297813770415363,
 'auto_class_weights': 'Balanced',
 'grow_policy': 'Lossguide',
 'min_data_in_leaf': 10,
 'max_leaves': 41}

In [21]:
study.best_value

0.32923402496918686

In [22]:
len(study.trials)

6

In [23]:
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study)

In [24]:
plot_param_importances(study)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import (accuracy_score , f1_score , precision_score , recall_score)


print(classification_report(y_valid_preprocess2.flatten() , pred))

In [None]:
print("Accuracy:", accuracy_score(y_valid_preprocess2.flatten() , pred))
print("Precision:", precision_score(y_valid_preprocess2.flatten(), pred))
print("Recall:", recall_score(y_valid_preprocess2.flatten(), pred))
print("F1-score:", f1_score(y_valid_preprocess2.flatten(), pred))