In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
import optuna
import gc

In [None]:
%%time
# Load the competition CSVs. The 'id' column is removed from the train and
# test frames; the sample submission is kept intact and is filled with
# predictions in the final cell.
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv')
train = train.drop(columns='id')

test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv')
test = test.drop(columns='id')

ss = pd.read_csv('../input/tabular-playground-series-nov-2021/sample_submission.csv')

In [None]:
# Preview the first five rows of the training frame (five is head()'s default).
train.head(n=5)

In [None]:
# Separate the feature matrix from the label, then release the raw frames.
#
# DataFrame.drop already returns a new DataFrame, so chaining .copy() onto it
# only duplicated a large table and raised peak memory in the very cell that
# is meant to free memory.
X = train.drop(columns='target')

# Keep the explicit copy for the label so `y` owns its data independently of
# `train`, which is deleted below.
y = train['target'].copy()

# `test` is deleted right after this assignment, so rebinding the existing
# frame to its new name is equivalent to copying it, minus the duplicate.
X_test = test

# Drop the old names and run a single collection pass. The trailing semicolon
# keeps the integer returned by gc.collect() out of the cell output.
del train, test
gc.collect();

In [None]:
# Standardise the features. The scaler is fitted once on the full training
# feature matrix (before any cross-validation split) and the same fitted
# scaler is applied to both the training and the test features.
scaler = StandardScaler().fit(X)

# The scaler output is wrapped back into DataFrames that reuse the original
# column labels of each frame.
X = pd.DataFrame(scaler.transform(X), columns=X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Keyword arguments forwarded to CatBoostClassifier in the training cell.
# NOTE(review): the high-precision values look like the output of a
# hyperparameter search (optuna is imported, but no study is visible in this
# notebook) -- confirm their provenance.
params = {
    'iterations': 3000,
    'max_depth': 7,
    'objective': 'Logloss',
    'bootstrap_type': 'Bernoulli',
    # NOTE(review): the training cell also passes early_stopping_rounds=100 to
    # fit(); per the CatBoost docs that sets the overfitting-detector wait, so
    # this value is presumably not the effective patience -- confirm which of
    # the two is intended.
    'od_wait': 842,
    'learning_rate': 0.052154791912163885,
    'reg_lambda': 5614.292946027834,
    'random_strength': 17.247582155465118,
    'min_data_in_leaf': 8,
    'leaf_estimation_iterations': 12,
    'subsample': 0.683983017651315,
}

In [None]:
%%time
# Stratified 5-fold cross-validation. For every fold a new CatBoost model is
# trained on the training part, scored with ROC AUC on the held-out part, and
# used to predict the test set. The per-fold test predictions are collected
# in `preds` and averaged in the next cell.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=786)

scores = []  # validation ROC AUC of each fold
preds = []   # per-fold predicted probabilities for the test set

for fold, (train_rows, valid_rows) in enumerate(kf.split(X, y)):
    # The splitter yields positional row indices, hence .iloc.
    X_train = X.iloc[train_rows]
    y_train = y.iloc[train_rows]
    X_valid = X.iloc[valid_rows]
    y_valid = y.iloc[valid_rows]

    # A fresh model per fold so that nothing carries over between folds.
    model = CatBoostClassifier(task_type='GPU', **params)

    # The held-out fold doubles as the evaluation set for early stopping.
    # NOTE(review): `params` also contains 'od_wait'; early_stopping_rounds
    # given here presumably takes precedence -- confirm against CatBoost docs.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    )

    # Column 1 of predict_proba is used as the score for the second class
    # (assumed to be target == 1 for a 0/1 label -- TODO confirm).
    valid_proba = model.predict_proba(X_valid)[:, 1]
    fpr, tpr, _ = roc_curve(y_valid, valid_proba)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}" "\n")
    print('||'*40, "\n")

    # Same column selection for the test set.
    preds.append(model.predict_proba(X_test)[:, 1])

# Unweighted mean of the per-fold AUC values.
print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Blend the fold models: stack the per-fold test predictions as columns
# (one row per test sample, one column per fold) and average across folds.
fold_matrix = np.column_stack(preds)
predictions = fold_matrix.mean(axis=1)

# Put the blended predictions into the sample-submission frame, write it to
# disk without the index column, and preview the result.
ss = ss.assign(target=predictions)
ss.to_csv('./catboost.csv', index=False)
ss.head()