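A very simple baseline notebook: a CatBoost classifier trained with 5-fold stratified cross-validation on the Tabular Playground Series (March 2021) data.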

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.cluster import KMeans

import optuna.integration.lightgbm as lgb_opt
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import optuna

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

In [None]:
# Load data: read the competition's train and test tables in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-mar-2021/train.csv',
        '../input/tabular-playground-series-mar-2021/test.csv',
    )
)

# Shapes of both tables as (rows, columns).
print('Train size:', train.shape)
print('Test size:', test.shape)

# Preview the first training rows, then print the column summary.
print('Train data info: ')
display(train.head())
train.info()

In [None]:
# Number of training rows for each value of the target column.
train.target.value_counts()

### Feature Engineering

In [None]:
# Standardization
# Column-name lists: 19 categorical (cat0..cat18) and 11 continuous (cont0..cont10).
cat_feat = [f'cat{i}' for i in range(19)]
cont_feat = [f'cont{i}' for i in range(11)]
original_feat = cat_feat + cont_feat

# Fit the scaler on the training rows only, then apply the same
# transformation to both train and test continuous columns.
scaler = StandardScaler().fit(train[cont_feat])
train[cont_feat] = scaler.transform(train[cont_feat])
test[cont_feat] = scaler.transform(test[cont_feat])

# Stack train on top of test; train_size marks where the test rows begin.
train_size = len(train)
data = pd.concat([train, test])

In [None]:
# Label encoding: replace every object-dtype column of the stacked train+test
# frame with integer codes. Fitting on the stacked frame gives both parts the
# same value-to-code mapping.
for c in data.columns:
    if data[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(data[c])
        data[c] = lbl.transform(data[c].values)
        # cat_feat may already list this column; only add names that are
        # missing so the list never contains duplicate entries.
        if c not in cat_feat:
            cat_feat.append(c)

display(data.head())

### Splitting data

In [None]:
# Target labels are taken from the original training frame.
y_train = train['target'].copy()

# Keep only the columns listed in original_feat, then cut the stacked frame
# back into its train and test parts by row position.
use_features = original_feat
data = data[use_features]
X_train, X_test = data.iloc[:train_size], data.iloc[train_size:]

X_train.head()

### Modeling

In [None]:
def run_cb(params):
    """Train one CatBoost classifier per CV fold and collect its predictions.

    Reads the notebook-level globals ``cv`` (a splitter exposing
    ``split(X, y)``), ``X_train``, ``y_train`` and ``X_test``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``CatBoostClassifier``.

    Returns
    -------
    oof_train : numpy.ndarray
        Out-of-fold positive-class probabilities, one per row of ``X_train``.
    y_pred_mean : numpy.ndarray
        Positive-class probabilities for ``X_test``, averaged over the folds.
    models : list
        The fitted model of every fold, in fold order.
    """
    y_preds = []
    oof_train = np.zeros((len(X_train),))
    models = []

    for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
        print(f'Fold {fold_id+1}')
        # The splitter yields positional row numbers, so rows are selected
        # with .iloc; label-based .loc would only work for a 0..n-1 index.
        X_tr = X_train.iloc[train_index, :]
        X_val = X_train.iloc[valid_index, :]
        y_tr = y_train.iloc[train_index]
        y_val = y_train.iloc[valid_index]

        model = CatBoostClassifier(**params)

        # The held-out fold doubles as the evaluation set for early stopping.
        model.fit(X = X_tr, y = y_tr, eval_set = (X_val, y_val),
                  plot = False,
                  early_stopping_rounds=10)

        # Column 1 of predict_proba is the positive-class probability.
        oof_train[valid_index] = model.predict_proba(X_val)[:, 1]
        y_pred = model.predict_proba(X_test)[:, 1]

        y_preds.append(y_pred)

        models.append(model)

    # Average the per-fold test predictions.
    return oof_train, sum(y_preds) / len(y_preds), models

In [None]:
%%time

# Stratified 5-fold splitter, shuffled with a fixed seed; run_cb reads it as a global.
cv = StratifiedKFold(shuffle=True, random_state=1, n_splits=5)

# CatBoost settings, passed through run_cb to CatBoostClassifier.
params = {
    'iterations': 10000,          # upper bound; run_cb also applies early stopping
    'depth': 4,
    'learning_rate': 0.05,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'boosting_type': 'Plain',
    'random_state': 1,
    'use_best_model': True,
    'cat_features': cat_feat,     # names of the categorical columns
    'verbose': 100,
}

# Train across all folds, then score the out-of-fold predictions.
oof, y_preds, cb_models = run_cb(params)
print(f'CV = {roc_auc_score(y_train, oof)}')

### Cross-validation (CV) score

In [None]:
# ROC AUC of the out-of-fold predictions against the training labels.
print(f'CV = {roc_auc_score(y_train, oof)}')

In [None]:
# Start from the sample submission and overwrite its target column with the
# fold-averaged test predictions.
sub = pd.read_csv(
    '../input/tabular-playground-series-mar-2021/sample_submission.csv'
).assign(target=y_preds)

# Write the submission file without the row index, then preview it.
sub.to_csv('submission.csv', index=False)
sub.head()

### Future work
- Try new features
- Hyperparameter tuning