In [None]:
import pandas as pd
import numpy as np
import random
import time
import os

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from tqdm import tqdm

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import StackingRegressor, StackingClassifier

import optuna
from optuna.samplers import TPESampler

import gc

In [None]:
# --- Reproducibility / data ------------------------------------------------
# SEED feeds seed_everything, the KFold splitter and CatBoost's random_seed.
SEED = 2021
# Name of the label column in the training CSV.
TARGET = "target"

# --- Training budget -------------------------------------------------------
# Upper bound on boosting iterations (passed as `iterations` to CatBoost).
N_ESTIMATORS = 40_000
# NOTE(review): the cross-validation call in this notebook passes n_splits=8
# explicitly, so this constant is not what drives the fold count - confirm.
N_SPLITS = 10

# --- Objective / metric ----------------------------------------------------
# Forwarded to CatBoost as `loss_function` and `eval_metric` respectively.
LOSS = 'CrossEntropy'
EVAL_METRIC = "AUC"

In [None]:
def seed_everything(seed=2021):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int, default 2021
        Value handed to every seeder.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Fix all global RNG state up front so the rest of the notebook is repeatable.
seed_everything(seed=SEED)

In [None]:
# Read both competition files, using the first CSV column as the row index.
df_train = pd.read_csv(r"../input/tabular-playground-series-oct-2021/train.csv", index_col=0)
df_test = pd.read_csv(r"../input/tabular-playground-series-oct-2021/test.csv", index_col=0)

# Labels are kept as a one-column DataFrame; features are everything else.
y = df_train[[TARGET]]
X = df_train.drop(columns=TARGET)
X_test = df_test

# Keep the row indexes around under their own names.
X_index, X_test_index = X.index, X_test.index

# Drop the loader names (X_test still refers to the test frame) and ask the
# garbage collector to reclaim whatever is no longer referenced.
del df_test, df_train
gc.collect()

In [None]:
def run_kfold( model, n_splits=5, test_data=None):
    """Cross-validate ``model`` with shuffled K-fold and collect predictions.

    The function reads the module-level ``X`` (features), ``y`` (labels) and
    ``SEED`` (shuffle seed); they must be defined before it is called. The
    same ``model`` object is refit on every fold.

    Parameters
    ----------
    model : classifier
        Must accept ``fit(X, y, eval_set=..., verbose=..., early_stopping_rounds=...)``
        and expose ``predict_proba`` and ``get_best_iteration`` (as used below).
    n_splits : int, default 5
        Number of folds.
    test_data : DataFrame or None, default None
        Optional hold-out features. When given, each fold's fitted model
        predicts on it.

    Returns
    -------
    scores : ndarray of shape (n_splits, 1)
        ROC AUC of each fold on its validation part.
    y_preds : ndarray of shape (len(test_data), n_splits) or None
        Per-fold positive-class probabilities for ``test_data``; ``None``
        when no ``test_data`` was supplied.
    y_oof : ndarray of shape (len(X), 1)
        Out-of-fold positive-class probabilities for every row of ``X``.
    """
    kf = KFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    scores = np.empty((n_splits, 1))
    n_trees = np.empty((n_splits, 1))

    # Allocate the test buffer only when there is test data: the previous
    # unconditional len(test_data) raised TypeError for the default None.
    y_preds = None if test_data is None else np.empty((len(test_data), n_splits))
    y_oof = np.empty((len(X), 1))

    for i_fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(5*"=" + f" Fold {i_fold} " + 5*"=")
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]
        # The validation fold doubles as the early-stopping set.
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=1000, early_stopping_rounds = 1000)

        # Column 1 of predict_proba is taken as the positive-class probability.
        fold_oof = model.predict_proba(X_val)[:, 1].reshape((-1, 1))
        y_oof[val_idx, :] = fold_oof

        fold_score = roc_auc_score(y_val, fold_oof)
        scores[i_fold, 0] = fold_score
        print(f"*** Fold {i_fold} score :", fold_score, " ***")

        n_trees[i_fold, 0] = model.get_best_iteration()

        if y_preds is not None:
            y_preds[:, i_fold] = model.predict_proba(test_data)[:, 1]

        gc.collect()

    print('N trees : ', int(np.median(n_trees)))
    print('CV auc scores: ',scores.mean(), " +/- ",  scores.std())
    return scores, y_preds, y_oof

In [None]:
# CatBoost hyper-parameters that are unpacked into the classifier constructor.
best_param = dict(
    depth=3,
    l2_leaf_reg=58.74937218498706,
    leaf_estimation_iterations=2,
    learning_rate=0.050008320651555346,
    min_data_in_leaf=30,
    random_strength=10.217105702928265,
    subsample=0.3662243059636376,
)

In [None]:
# cat_param is another name for the same dict object as best_param (no copy).
cat_param = best_param

print('Best parameters :')
display(cat_param)

# Fixed settings are spelled out here; the hyper-parameters in cat_param are
# unpacked last.
model = CatBoostClassifier(
    # objective / metric
    loss_function=LOSS,
    eval_metric=EVAL_METRIC,
    # boosting budget and tree construction
    iterations=N_ESTIMATORS,
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    bootstrap_type='Bernoulli',
    # runtime
    task_type='GPU',
    random_seed=SEED,
    silent=True,
    **cat_param,
)

In [None]:
# Cross-validate the model and collect per-fold test predictions.
# NOTE(review): the fold count is hard-coded to 8 here rather than taken from
# the N_SPLITS constant (10) - confirm which one is intended.
scores, y_preds, y_oofs = run_kfold(
    model=model,
    n_splits=8,
    test_data=X_test,
)

In [None]:
## save out-of-fold predictions and the submission
# Out-of-fold probabilities, indexed like the training features.
oof_frame = pd.DataFrame(y_oofs, index=X.index, columns=[TARGET])
oof_frame.to_parquet("cat_oof.parquet")

# Submission: the per-fold test probabilities averaged across folds.
submission_frame = pd.DataFrame(y_preds.mean(axis=1), index=X_test.index, columns=[TARGET])
submission_frame.to_csv("cat_submission.csv")