In [None]:
import numpy as np
import pandas as pd
import catboost as ctb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

In [None]:
# Directory holding the competition CSV files. Keeping it in one constant
# means the notebook can be pointed at another data location with a single edit.
DATA_DIR = '../input/tabular-playground-series-mar-2021'

# Raw inputs: labelled training rows, unlabelled test rows, and the
# submission template whose 'target' column is overwritten later in the notebook.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Partition the training columns by name: any column whose name contains
# 'cat' is treated as categorical, any containing 'cont' as continuous.
cat_features = list(filter(lambda name: 'cat' in name, train.columns))
cont_features = list(filter(lambda name: 'cont' in name, train.columns))

# Model input columns: categorical columns first, then continuous ones.
all_features = [*cat_features, *cont_features]

# Label-encode the categorical features

In [None]:
# Integer-encode every categorical column using the categories found in
# train and test together, so both splits share one code per category.

# Capture everything needed to undo the concat's side effects BEFORE the
# names `train` / `test` are rebound below.
n_train = train.shape[0]
test_columns = list(test.columns)
# Columns present only in train are filled with NaN for the test rows during
# the concat, which can silently upcast them (e.g. an integer column to float).
train_only_dtypes = train[[c for c in train.columns if c not in test.columns]].dtypes.to_dict()

all_df = pd.concat([train , test]).reset_index(drop = True)

le = LabelEncoder()
for col in cat_features:
    # fit_transform refits the encoder on each column, so reusing `le` is safe.
    all_df[col] = le.fit_transform(all_df[col])

# Split back at the recorded boundary. `astype` returns a new frame, so `train`
# is no longer a view of `all_df`, and the train-only columns get their
# original dtypes back.
train = all_df.iloc[:n_train].astype(train_only_dtypes)
# Keep only the columns test originally had (drops the NaN-filled train-only ones).
test = all_df.iloc[n_train:][test_columns].reset_index(drop = True)

In [None]:
# ---- Run configuration -------------------------------------------------
SEED = 2021                    # shared by the fold splitter and the model
N_FOLDS = 10                   # number of stratified CV folds
EARLY_STOPPING_ROUNDS = 100    # passed to fit() as early_stopping_rounds
VERBOSE = 1000                 # passed to fit() as verbose

# ---- Keyword arguments forwarded to ctb.CatBoostClassifier --------------
# n_estimators is only an upper bound on the tree count: the fit call later
# in the notebook enables early stopping against a validation fold.
params = dict(
    bootstrap_type = 'Poisson',
    loss_function = 'Logloss',
    eval_metric = 'AUC',
    random_seed = SEED,
    task_type = 'GPU',
    max_depth = 8,
    learning_rate = 0.01,
    n_estimators = 20000,
    max_bin = 280,
    min_data_in_leaf = 64,
    l2_leaf_reg = 0.01,
    subsample = 0.2,
)

In [None]:
# Stratified K-fold training. Each fold trains a fresh CatBoost model on the
# training indices, writes its validation-fold probabilities into `oof`, and
# adds 1/N_FOLDS of its test probabilities to `pred`, so `pred` ends up as the
# mean test prediction across folds.
skf = StratifiedKFold(n_splits = N_FOLDS , shuffle = True , random_state = SEED)

data = train[all_features]
target = train['target']
# The test feature matrix is the same for every fold, so slice it once here
# instead of once per iteration.
test_x = test[all_features]

# One out-of-fold probability per training row; every row belongs to exactly
# one validation fold, so each slot is written once.
oof = np.zeros(train.shape[0])
# Pre-allocated so `pred` is an array from the start rather than a scalar 0
# that turns into an array on the first accumulation.
pred = np.zeros(test.shape[0])

for fold , (trn_idx , val_idx) in enumerate(skf.split(data , target)):
    print(f'========FOLD{fold}========')
    
    train_x = data.iloc[trn_idx]
    train_y = target.iloc[trn_idx]
    val_x = data.iloc[val_idx]
    val_y = target.iloc[val_idx]
    
    model = ctb.CatBoostClassifier(**params)
    # NOTE: the validation fold is also the early-stopping eval_set, so the
    # fold AUC below is measured on the same rows that selected the iteration.
    model.fit(train_x , train_y,
             eval_set = [(val_x,val_y)],
             use_best_model = True,
             early_stopping_rounds = EARLY_STOPPING_ROUNDS,
             verbose = VERBOSE
             )
    # Column 1 of predict_proba is taken as the positive-class probability.
    oof[val_idx] = model.predict_proba(val_x)[: , 1]
    pred += model.predict_proba(test_x)[: , 1]/N_FOLDS
    score = roc_auc_score(val_y , oof[val_idx] , average = 'micro')
    print(f'FOLD {fold} AUC {score}\n')
    
# Overall AUC computed over all out-of-fold predictions at once.
score = roc_auc_score(target , oof , average = 'micro')
print(f'AUC {score}\n')

In [None]:
# Put the fold-averaged test predictions into the submission template,
# save it without the index column, and display the resulting frame.
sub = sub.assign(target = pred)
sub.to_csv('ctbsubmission.csv' , index = False)
sub

In [None]:
# Pair each training row's id with its out-of-fold prediction, save the
# table without the index column, and display it.
oof_pred = train[['id']].assign(target = oof)
oof_pred.to_csv('ctboof_predict.csv' , index = False)
oof_pred