## Import Packages

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score


## Import datasets

In [None]:
# Kaggle input locations of the two competition CSV files.
train_csv = "/kaggle/input/tabular-playground-series-sep-2021/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-sep-2021/test.csv"

# Load both splits into DataFrames.
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Keep the test ids: they are needed to build the submission file later on.
test_ids = test.pop("id")

# The train ids are simply discarded.
del train["id"]

# Separate the label column from the training features and preview it.
train_targets = train.pop("claim")
train_targets.head()

### Introducing new features

In [None]:
# Names of the per-row statistic columns created by this cell.
engineered = ['n_nans', 'std', 'var', 'abs_sum', 'sem', 'avg', 'max', 'min']

for data in [train, test]:
    # Reduce over a snapshot that holds only the non-engineered columns.
    # Reducing the growing frame itself (as was done before) lets each new
    # statistic absorb the ones added just above it: e.g. 'max' ended up being
    # taken over 'abs_sum' and 'var' as well, and 'std' included 'n_nans'.
    # `errors='ignore'` makes the drop a no-op on the first run, and strips the
    # previously engineered columns if the cell is executed again.
    raw = data.drop(columns=engineered, errors='ignore')

    data['n_nans'] = raw.isnull().sum(axis=1)   # number of missing values per row
    data['std'] = raw.std(axis=1)
    data['var'] = raw.var(axis=1)
    data['abs_sum'] = raw.abs().sum(axis=1)
    data['sem'] = raw.sem(axis=1)               # standard error of the mean
    data['avg'] = raw.mean(axis=1)
    data['max'] = raw.max(axis=1)
    data['min'] = raw.min(axis=1)

In [None]:
# Snapshot of the training column names at this point in the notebook; the
# later imputation and scaling cells operate on exactly these columns.
features = list(train.columns)
print(features)

In [None]:
# Per-row count of NaN entries across the selected feature columns,
# stored as a new 'missing' column on both splits.
for frame in (train, test):
    frame['missing'] = frame[features].isna().sum(axis=1)

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaNs with per-column means. The means are learned from the training
# split only and then reused, unchanged, for the test split.
ss = SimpleImputer(strategy='mean', missing_values=np.nan)
ss.fit(train[features])
train[features] = ss.transform(train[features])
test[features] = ss.transform(test[features])

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split, then apply the same fitted
# transformation to both splits.
scale = StandardScaler().fit(train[features])
for frame in (train, test):
    frame[features] = scale.transform(frame[features])

## Feature Scaling

In [None]:
# Engineered row-statistic columns that this standardisation pass leaves untouched.
skip_columns = {"n_nans", "std", "var", "abs_sum", "sem", "avg", "min", "max"}

for column in train.columns:
    if column in skip_columns:
        continue
    # Disabled experiment: signed log2 compression,
    #   sign(x) * log2(|x| + 1),
    # applied to train and test for columns where
    #   abs(max) / (abs(min) + 10e-10) > 10.
    # Mean and standard deviation come from the training split only and are
    # reused for the test split.
    mu = train[column].mean()
    sigma = train[column].std()
    train[column] = train[column].sub(mu).div(sigma)
    test[column] = test[column].sub(mu).div(sigma)
    # Disabled experiment: replacing NaNs with the column mean at this point.
    # Original author's note: missing value imputation seemed to have a bad
    # effect on the final results.


### Train Validation Split

In [None]:
# Fixed seed so the 80/20 split, and every validation metric derived from it,
# is reproducible from run to run. Drawing the seed from np.random.randint
# produced a different split on every execution.
SPLIT_SEED = 42

# NOTE: `train_targets` is rebound to the training portion of the labels here,
# so this cell cannot be re-run on its own without re-creating `train_targets`
# from the earlier cells first.
train_features, valid_features, train_targets, valid_targets = train_test_split(
    train, train_targets, test_size=0.2, random_state=SPLIT_SEED
)
train_features.shape, train_targets.shape, valid_features.shape, valid_targets.shape

## Model Development & Evaluation


### Evaluation Method

In [None]:
def evaluate(valid_targets, probs, name):
    """Print and return validation metrics for one binary classifier.

    Args:
        valid_targets: True labels of the validation rows (binary; the
            thresholded predictions compared against them are 0/1).
        probs: Predicted probability of the positive class, one per row.
        name: Identifier stored in the returned record.

    Returns:
        dict with the keys ``name``, ``accuracy_score``, ``log_loss`` and
        ``auc``.
    """
    # Accept lists / Series as well as arrays for the comparisons below.
    probs = np.asarray(probs)

    # Hard class labels at the 0.5 threshold; used only by the metrics that
    # need discrete predictions (accuracy and the classification report).
    y_pred = (probs > 0.5).astype(int)

    acc = accuracy_score(valid_targets, y_pred)
    # Log loss is defined on probabilities. Feeding it the thresholded 0/1
    # labels (as was done before) only measures clipped misclassifications
    # and ignores how confident the model was.
    loss = log_loss(valid_targets, probs)
    auc = roc_auc_score(valid_targets, probs)

    print("Accuracy score: %.2f"%(acc))
    print("Log loss: %.2f"%(loss))
    print("AUC score:", auc)
    print("Classification report:")
    print(classification_report(valid_targets, y_pred))
    return {
        "name": name,
        "accuracy_score": acc,
        "log_loss": loss,
        "auc": auc,
    }

## Using CatBoost

In [None]:
# CatBoost hyper-parameters. The optimised objective is Logloss, while the
# metric reported on the eval set (and used with use_best_model / early
# stopping) is AUC.
cat_params = dict(
    iterations=15000,
    loss_function='Logloss',
    depth=3,
    task_type='GPU',
    use_best_model=True,
    eval_metric='AUC',
    early_stopping_rounds=1000,
    learning_rate=0.0337852,
    border_count=32,
    # An explicit `l2_leaf_reg: 3` entry is left disabled; `reg_lambda` below
    # is set instead.
    verbose=1000,
    bootstrap_type='Poisson',
    reg_lambda=0.32796141625302366,
)

cat = CatBoostClassifier(**cat_params)

# The held-out validation split serves as the evaluation set during training.
validation_set = [(valid_features, valid_targets)]
cat.fit(train_features, train_targets, eval_set=validation_set)

In [None]:
# Keep column 1 of the predicted class probabilities (used as the
# positive-class score by the evaluation below) and preview the first ten.
valid_proba = cat.predict_proba(valid_features)
probs = valid_proba[:, 1]
probs[:10]

In [None]:
# Score the validation predictions and keep the resulting metrics record.
result_cat = evaluate(valid_targets, probs, name="catboost")
result_cat

## Submission

In [None]:
# Column 1 of the predicted class probabilities for the test rows.
claim = cat.predict_proba(test)[:, 1]

# Pair each saved test id with its prediction and write the file without the
# DataFrame index.
submission = pd.DataFrame({"id": test_ids.tolist(), "claim": claim.ravel()})
submission.to_csv("submission.csv", index=False)