The purpose of this study is to check how well CatBoost handles unprepared data. This can help in situations where you need to act quickly.
Note: the GPU accelerator must be turned ON, because the model is trained with `task_type: 'GPU'`.

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn import linear_model
from sklearn import decomposition
from sklearn import preprocessing
import optuna
import gc

In [None]:
%%time
train = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')

print('Training data: ')
train.head()

In [None]:
# Name of the label column.
TARGET = 'target'

# Model inputs: every training column except the row identifier and the label,
# kept in their original column order.
FEATURES = train.columns[~train.columns.isin(['id', TARGET])].tolist()

In [None]:
%%time
train["mean"] = train[FEATURES].mean(axis=1)
train["std"] = train[FEATURES].std(axis=1)
train["min"] = train[FEATURES].min(axis=1)
train["max"] = train[FEATURES].max(axis=1)

test["mean"] = test[FEATURES].mean(axis=1)
test["std"] = test[FEATURES].std(axis=1)
test["min"] = test[FEATURES].min(axis=1)
test["max"] = test[FEATURES].max(axis=1)

FEATURES.extend(['mean', 'std', 'min', 'max'])

In [None]:
%%time
scaler = preprocessing.StandardScaler()
for col in FEATURES:
    train[col] = scaler.fit_transform(train[col].to_numpy().reshape(-1,1))
    test[col] = scaler.transform(test[col].to_numpy().reshape(-1,1))
    
X = train[FEATURES].to_numpy().astype(np.float32)
y = train[TARGET].to_numpy().astype(np.float32)
X_test = test[FEATURES].to_numpy().astype(np.float32)

del train, test
gc.collect()

In [None]:
# Random state for the shuffled StratifiedKFold split in the training cell.
# It is also echoed in the per-fold log line. It is not part of `params`, so
# it is not passed to CatBoostClassifier.
SEED = 45

In [None]:
# CatBoost hyper-parameters.
# These values were not tuned in this notebook; they were borrowed from other
# people's work (thanks to those who did the tuning). The only addition made
# here is `early_stopping_rounds`, which is passed to `fit`, not set below.
params = {
    # Boosting budget and loss.
    'iterations': 10000,
    'objective': 'CrossEntropy',
    'bootstrap_type': 'Bernoulli',
    'learning_rate': 0.023575206684596582,  # alternative value: 0.05071417780137978

    # Regularisation and tree shape.
    'reg_lambda': 36.30433203563295,
    'random_strength': 43.75597655616195,
    'depth': 8,

    # Metric reported on the evaluation set.
    'eval_metric': 'AUC',

    'min_data_in_leaf': 12,
    'leaf_estimation_iterations': 1,
    'subsample': 0.8227911142845009,

    # Train on the GPU, device 0.
    'task_type': 'GPU',
    'devices': '0',
}

In [None]:
%%time

preds = []
scores = []

for fold, (idx_train, idx_valid) in enumerate(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(X, y)):
    X_train, y_train = X[idx_train], y[idx_train]
    X_valid, y_valid = X[idx_valid], y[idx_valid]

    model = CatBoostClassifier(**params)

    model.fit(X_train,y_train,
              eval_set=[(X_valid,y_valid)],
              early_stopping_rounds=2000,
              verbose=False)

    pred_valid = model.predict_proba(X_valid)[:,1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Seed: {SEED} Fold: {fold + 1} Score: {score}" "\n")

    test_preds = model.predict_proba(X_test)[:,1]
    preds.append(test_preds)
    
print(f"Overall Validation Score: {np.mean(scores)}")

In [None]:
# Blend the fold models: one column per fold, averaged across each row.
fold_matrix = np.column_stack(preds)
predictions = fold_matrix.mean(axis=1)

# Fill the provided submission template with the blended probabilities and
# save it next to the notebook.
submission_template_path = '../input/tabular-playground-series-oct-2021/sample_submission.csv'
sample_submission = pd.read_csv(submission_template_path)
sample_submission['target'] = predictions
sample_submission.to_csv('./ss_catboostkf.csv', index=False)

# Preview the first rows of what was written.
sample_submission.head()

The result looks good. CatBoost is a great time saver!