# CatBoost Tabular Playground Prediction (Sep 2021)
## Table of Contents
- Import Packages
- Import Datasets
- EDA & Preprocessing
- Model Development & Evaluation
- Submission

## Import Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score

## Import datasets

In [2]:
# Read the raw train and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1
3,3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1
4,4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,,0.23049,1


In [4]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
0,957919,0.16585,0.48705,1295.0,0.0231,0.319,0.90188,573.29,3743.7,2705700000000.0,...,0.16253,-22.189,2.0655,0.43088,-10.741,81606.0,1.194,198040000000000.0,2017.1,0.46357
1,957920,0.12965,0.37348,1763.0,0.72884,0.33247,-1.2631,875.55,554370.0,595570000000000.0,...,0.81528,-1.6342,1.5736,-1.0712,11.832,90114.0,1.1507,4.388e+16,6638.9,0.28125
2,957921,0.12019,0.44521,736.26,0.04615,0.29605,0.31665,2659.5,317140.0,397780000000000.0,...,0.81831,-32.78,2.1364,-1.9312,-3.2804,37739.0,1.1548,171810000000000.0,5844.0,0.13797
3,957922,0.054008,0.39596,996.14,0.85934,0.36678,-0.1706,386.56,325680.0,-34322000000000.0,...,0.86559,-2.4162,1.5199,-0.011633,1.384,26849.0,1.149,2.1388e+17,6173.3,0.3291
4,957923,0.079947,-0.006919,10574.0,0.34845,0.45008,-1.842,3027.0,428150.0,929150000000.0,...,0.2519,-18.63,3.7387,0.75708,-4.9405,50336.0,1.2488,2.1513e+17,2250.1,0.33796


## EDA & Data Preprocessing

### Drop ID columns

In [None]:
# The train ids are not needed again, so the column is simply removed.
del train["id"]
# The test ids are kept aside for the submission file before being removed.
test_ids = test["id"]
del test["id"]

### Get Train data Targets

In [None]:
# Separate the "claim" label from the features: keep it, then drop it from train.
train_targets = train["claim"]
del train["claim"]

In [None]:
# Preview the first five labels.
train_targets.head(n=5)

The labels look balanced.

In [None]:
# Pass the labels explicitly as `x`: recent seaborn releases accept only `data`
# positionally, so a bare positional Series is no longer read as the x variable.
# The trailing semicolon hides the Axes repr in the cell output.
sns.countplot(x=train_targets);

Let's compare the summary statistics of the train and test sets. The differences are very small, except for the total row counts. Some of the features have a very large spread between their minimum and maximum values, so I will apply a log transformation to reduce their skewness.

In [None]:
# Summary statistics of the training features, shown with one row per feature.
train_desc = train.describe()
train_desc.T

In [None]:
# Summary statistics of the test features, shown with one row per feature.
test_desc = test.describe()
test_desc.T

In [None]:
# Element-wise gap between the train and test summary statistics.
desc_delta = train_desc.sub(test_desc)
desc_delta.T

## Feature Scaling

In [None]:
# Keeps the |max| / |min| ratio finite when a column's minimum is exactly 0.
RATIO_EPS = 10e-10
# Columns whose |max| is more than this many times their |min| get log-compressed.
LOG_RATIO_THRESHOLD = 10

for item in train.columns:
    col_abs_max = abs(train[item].max())
    col_abs_min = abs(train[item].min())
    if col_abs_max / (col_abs_min + RATIO_EPS) > LOG_RATIO_THRESHOLD:
        # Signed log2 transform: compresses the magnitude while keeping the sign.
        # The decision is made on train only and applied identically to test.
        train[item] = np.sign(train[item]) * np.log2(np.abs(train[item]) + 1)
        test[item] = np.sign(test[item]) * np.log2(np.abs(test[item]) + 1)

    # Standardise both frames with statistics taken from the training data only.
    train_mean = train[item].mean()
    train_std = train[item].std()
    if train_std == 0:
        # A constant column has zero spread; dividing by 0 would turn it into
        # NaN/inf, so leave it centred (all zeros) instead.
        train_std = 1.0
    train[item] = (train[item] - train_mean) / train_std
    test[item] = (test[item] - train_mean) / train_std
    # Missing values are deliberately left as NaN: replacing them with the
    # column mean was tried and seemed to hurt the final results.



### Train Validation Split

In [None]:
# Hold out 20% of the training rows for validation. A fixed seed makes the split
# (and therefore every metric reported below) reproducible across runs.
# NOTE: this cell rebinds `train_targets` to the training subset, so re-running it
# on its own (without re-running the cells above) will fail on mismatched lengths.
train_features, valid_features, train_targets, valid_targets = train_test_split(
    train, train_targets, test_size=0.2, random_state=42
)
train_features.shape, train_targets.shape, valid_features.shape, valid_targets.shape

## Model Development & Evaluation


### Evaluation Method

In [None]:
def evaluate(valid_targets, probs, name):
    """Score positive-class probabilities against the true labels.

    Prints accuracy, log loss, AUC and a classification report, then returns
    the headline metrics so that several models can be compared.

    Args:
        valid_targets: True labels of the validation samples.
        probs: Predicted probability of the positive class, one per sample.
        name: Label stored under "name" in the returned dict.

    Returns:
        dict with the keys "name", "accuracy_score", "log_loss" and "auc".
    """
    probs = np.asarray(probs)
    # Hard 0/1 labels (0.5 cut-off) are only used for accuracy and the report.
    y_pred = np.array(probs > 0.5, dtype=int)
    acc = accuracy_score(valid_targets, y_pred)
    # Log loss measures the quality of the probabilities themselves; feeding it
    # thresholded labels would score every mistake as a maximally confident one.
    loss = log_loss(valid_targets, probs)
    auc = roc_auc_score(valid_targets, probs)
    print("Accuracy score: %.2f"%(acc))
    print("Log loss: %.2f"%(loss))
    print("AUC score:", auc)
    print("Classification report:")
    print(classification_report(valid_targets, y_pred))
    return {
        "name": name, 
        "accuracy_score": acc, 
        "log_loss": loss, 
        "auc": auc
    }

In [None]:
# Sanity check: (rows, columns) of the training split that is fed to the model.
train_features.shape

## Using CatBoost

In [None]:
# CatBoost hyperparameters. Training is configured for a GPU device.
cat_params = {
    'iterations': 15000, 
    'loss_function': 'Logloss', 
    'depth': 7, 
    'task_type' : 'GPU',
    # Keep the iteration with the best eval-set score rather than the last one.
    'use_best_model': True,
    'eval_metric': 'AUC',
    # Stop when the eval metric has not improved for this many iterations.
    'early_stopping_rounds': 1000,
    'learning_rate': 0.03,
    'border_count': 32,
    'l2_leaf_reg': 3,
    # Fixed seed so that repeated runs start from the same random state.
    'random_seed': 42,
    # Print a progress line every 1000 iterations.
    "verbose": 1000
}
cat = CatBoostClassifier(
    **cat_params
)
# The validation split doubles as the eval set for early stopping / best model.
cat.fit(train_features, train_targets, eval_set=[(valid_features, valid_targets)])

In [None]:
# predict_proba yields one column per class; column 1 is taken as the
# positive-class probability for every validation row.
valid_proba = cat.predict_proba(valid_features)
probs = valid_proba[:, 1]
probs[:10]

In [None]:
# Score the validation predictions and keep the metrics under the "catboost" label.
result_cat = evaluate(valid_targets=valid_targets, probs=probs, name="catboost")
result_cat

## Submission

In [None]:
# Positive-class probability for every test row.
claim = cat.predict_proba(test)[:, 1]
# Pair each saved test id with its predicted probability.
submission = pd.DataFrame(
    {
        "id": test_ids.tolist(),
        "claim": claim.ravel(),
    }
)
submission.to_csv("submission.csv", index=False)