# CatBoost Tabular Playground Prediction (Sep 2021)
## Table of Contents
- Import Packages
- Import Datasets
- EDA & Preprocessing
- Model Development & Evaluation
- Submission

## Import Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

## Import datasets

In [None]:
# Load the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/tabular-playground-series-sep-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-sep-2021/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

## EDA & Data Preprocessing

### Drop ID columns

In [None]:
# The training ids are not needed again, so just delete the column in place.
del train["id"]
# The test ids are removed from the features but kept for the submission file.
test_ids = test.pop("id")

### Get Train data Targets

In [None]:
# Remove the "claim" column from the feature frame and keep it as the target series.
train_targets = train.pop("claim")

In [None]:
# Preview the first target values.
train_targets.head()

The labels look balanced.

In [None]:
# Bar chart of the class frequencies in the target.
# The vector is passed as `x=` explicitly: recent seaborn releases accept only
# `data` positionally, so a bare positional Series is no longer read as x.
# The trailing semicolon hides the Axes repr in the cell output.
sns.countplot(x=train_targets);

Let's look at the summary statistics of the train and test sets and compare them; the differences are very small except for the row counts. Some of the features have a very large spread between their minimum and maximum values, so I will apply a log transformation to reduce their skewness.

In [None]:
# Summary statistics of the training features, shown with one feature per row.
train_desc = train.describe()
train_desc.T

In [None]:
# Summary statistics of the test features, shown with one feature per row.
test_desc = test.describe()
test_desc.T

In [None]:
# Element-wise gap between the train and test summary statistics.
desc_delta = train_desc.sub(test_desc)
desc_delta.T

### Add extra features

In [None]:
# Row-wise summary features: number of missing values, standard deviation and
# variance of each row.
#
# All three are computed over the ORIGINAL feature columns only. Computing them
# on the growing frame would fold 'n_nans' into 'std', and both 'n_nans' and
# 'std' into 'var'. Excluding the engineered names also keeps the cell
# idempotent when it is re-run.
engineered_columns = ('n_nans', 'std', 'var')
feature_columns = [col for col in train.columns if col not in engineered_columns]
for data in [train, test]:
    base_features = data[feature_columns]
    data['n_nans'] = base_features.isnull().sum(axis=1)
    data['std'] = base_features.std(axis=1)
    data['var'] = base_features.var(axis=1)

In [None]:
# Check the training frame after the n_nans / std / var columns were added.
train.head()

In [None]:
# Check the test frame after the n_nans / std / var columns were added.
test.head()

## Feature Scaling

In [None]:
# Standardise every column with statistics taken from the training set only,
# so train and test go through exactly the same transformation.
columns = list(train.columns)
for item in columns:
    # Columns whose |max| dwarfs |min| (ratio above 20) get a signed log2
    # transform first; the small constant guards against a zero minimum.
    max_to_min = abs(train[item].max()) / (abs(train[item].min()) + 1e-9)
    if max_to_min > 20:
        for frame in (train, test):
            frame[item] = np.sign(frame[item]) * np.log2(np.abs(frame[item]) + 1)
    # z-score using the (possibly log-transformed) training column.
    train_mean, train_std = train[item].mean(), train[item].std()
    for frame in (train, test):
        frame[item] = (frame[item] - train_mean) / train_std
    # Missing values are intentionally left in place: replacing them with the
    # column mean seemed to have a bad effect on the final results.


## Model Development & Evaluation


### Evaluation Method

In [None]:
def evaluate(valid_targets, probs, name):
    """Print and return classification metrics for predicted probabilities.

    Parameters
    ----------
    valid_targets : array-like
        Ground-truth binary labels.
    probs : array-like
        Predicted probability of the positive class, one value per sample.
    name : str
        Label of the evaluated model; stored unchanged in the returned dict.

    Returns
    -------
    dict
        Keys "name", "accuracy_score", "log_loss" and "auc".
    """
    # Accept lists / Series as well as ndarrays for the comparison below.
    probs = np.asarray(probs)
    # Hard labels (0.5 threshold) are used only for accuracy and the report.
    y_pred = np.array(probs > 0.5, dtype=int)
    acc = accuracy_score(valid_targets, y_pred)
    # Log loss is a probabilistic metric: it must be fed the probabilities.
    # Passing thresholded 0/1 labels makes every mistake maximally penalised
    # and discards the model's confidence.
    loss = log_loss(valid_targets, probs)
    auc = roc_auc_score(valid_targets, probs)
    print("Accuracy score: %.2f"%(acc))
    print("Log loss: %.2f"%(loss))
    print("AUC score:", auc)
    print("Classification report:")
    print(classification_report(valid_targets, y_pred))
    return {
        "name": name, 
        "accuracy_score": acc, 
        "log_loss": loss, 
        "auc": auc
    }

## Using CatBoost

In [None]:
# 10-fold stratified cross-validation: one CatBoost model is trained per fold
# and kept in `cats` so the fold models can be averaged at prediction time.

# Fixing the shuffle seed makes the fold assignment reproducible between runs.
SEED = 42
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)

# The hyper-parameters are the same for every fold, so build the dict once.
cat_params = {
    'iterations': 15000, 
    'loss_function': 'Logloss', 
    'depth': 8, 
    'task_type' : 'GPU',
    'use_best_model': True,
    'eval_metric': 'AUC',
    'early_stopping_rounds': 1000,
    'learning_rate': 0.03,
    'border_count': 32,
    'l2_leaf_reg': 3,
    "verbose": 1000
}

cats = []
for index, (train_indices, valid_indices) in enumerate(kfold.split(train, train_targets), start=1):
    print("Training with Fold %d"%(index))
    train_features = train.iloc[train_indices]
    train_labels = train_targets.iloc[train_indices]
    valid_features = train.iloc[valid_indices]
    valid_labels = train_targets.iloc[valid_indices]
    cat = CatBoostClassifier(**cat_params)
    # The held-out fold serves as the eval set for early stopping and
    # best-iteration selection.
    cat.fit(train_features, train_labels, eval_set=[(valid_features, valid_labels)])
    cats.append(cat)
    probs = cat.predict_proba(valid_features)[:, 1]
    result_cat = evaluate(valid_labels, probs, "catboost")
    print(result_cat)

## Submission

In [None]:
# Positive-class probabilities on the test set from every fold model,
# stacked as (n_models, n_samples) and averaged across the models.
probs_list = [fold_model.predict_proba(test)[:, 1] for fold_model in cats]
probs_array = np.array(probs_list)
mean_probs = np.mean(probs_array, axis=0)

In [None]:
# Sanity check: display the shape of the averaged prediction array.
mean_probs.shape

In [None]:
# Pair every test id with its averaged probability and write the result
# to disk without the DataFrame index.
submission = pd.DataFrame({"id": test_ids.tolist(), "claim": mean_probs})
submission.to_csv("submission.csv", index=False)