# Welcome to the September 2021 Tabular Playground Competition! #

In this competition, we predict whether a customer will make an insurance claim.

# Step1: Import Helpful Libraries #

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Step2: Load Data

In [None]:
# Read the competition CSVs; the `id` column becomes the row index of each frame.
df_train = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv",
    index_col='id',
)
test = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv",
    index_col='id',
)

# The last training column is treated as the label; every column before it
# is a feature. Starred unpacking yields FEATURES as a plain list.
*FEATURES, TARGET = df_train.columns

df_train.head()

The target `'claim'` has binary outcomes: `0` for no claim and `1` for claim.

In [None]:
# info() prints its report (dtypes, non-null counts) directly.
df_train.info()

# Summary statistics of the numeric columns, shown as the cell's output.
train_summary = df_train.describe()
train_summary

# Missing Values
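The row-wise features added below (number of missing values, standard deviation, and mean of each row) follow [TPS Sep 2021 single LGBM](https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm/notebook) by [@hiro5299834](https://www.kaggle.com/hiro5299834).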

In [None]:
# Row-wise summary features: number of missing values, standard deviation and
# mean across each row.
#
# The statistics are computed from the base feature columns only, and FEATURES
# is rebuilt rather than extended in place. Re-running this cell therefore
# neither duplicates entries in FEATURES nor feeds the engineered columns back
# into their own computation.
ROW_STATS = ['n_missing', 'std', 'mean']
base_features = [col for col in FEATURES if col not in ROW_STATS]

for frame in (df_train, test):
    # Take the base columns before adding anything to the frame.
    base_values = frame[base_features]
    frame['n_missing'] = base_values.isna().sum(axis=1)
    frame['std'] = base_values.std(axis=1)
    frame['mean'] = base_values.mean(axis=1)

FEATURES = base_features + ROW_STATS

# Step3: Train Model #

Let's try out a simple XGBoost model. This algorithm can handle missing values, but you could try imputing them instead.  We use `XGBClassifier` (instead of `XGBRegressor`, for instance), since this is a classification problem.

In [None]:
from xgboost import XGBClassifier

# 10-fold cross-validation: each fold fits a scaler and an XGBoost classifier
# on the training split, reports validation AUC, and stores test-set
# probabilities so they can be averaged afterwards.
X = df_train.loc[:, FEATURES]
y = df_train.loc[:, TARGET]

# The unscaled test matrix is the same for every fold. Select it once, using
# exactly the training columns in the training order.
X_test_raw = test.loc[:, FEATURES]

final_predictions = []
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_indices, valid_indices) in enumerate(kf.split(X=X)):
    # KFold.split yields *positional* row numbers, so rows must be selected
    # with .iloc. Using .loc would treat them as `id` labels, which is only
    # correct when the ids happen to be exactly 0..n-1 in order.
    y_train = y.iloc[train_indices]
    y_valid = y.iloc[valid_indices]

    # Fit the scaler on the training split only, then apply it to the
    # validation and test data. New DataFrames are built (instead of assigning
    # into slices of X) to avoid chained-assignment warnings while keeping the
    # column names and row indices.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X.iloc[train_indices]),
        index=y_train.index,
        columns=FEATURES,
    )
    X_valid = pd.DataFrame(
        scaler.transform(X.iloc[valid_indices]),
        index=y_valid.index,
        columns=FEATURES,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test_raw),
        index=test.index,
        columns=FEATURES,
    )

    model = XGBClassifier(
        max_depth=3,
        subsample=0.5,
        colsample_bytree=0.5,
        learning_rate=0.01187431306013263,
        n_estimators=10000,  # upper bound; early stopping below decides the actual count
        n_jobs=-1,
        use_label_encoder=False,
        objective='binary:logistic',
        tree_method='gpu_hist',  # Use GPU
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # NOTE(review): newer XGBoost releases expect `eval_metric` and
    # `early_stopping_rounds` on the constructor rather than on fit() --
    # confirm against the installed version before upgrading.
    model.fit(
        X_train,
        y_train,
        verbose=False,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric="auc",
        early_stopping_rounds=200,
    )

    # Column 1 of predict_proba is the probability of the positive class.
    preds_valid = model.predict_proba(X_valid)[:, 1]
    preds_test = model.predict_proba(X_test)[:, 1]
    final_predictions.append(preds_test)
    print(fold, roc_auc_score(y_valid, preds_valid))

# Make Submission #

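The target is binary (0 or 1), but you're allowed to submit probabilities instead. In scikit-learn-style models you get them from the `predict_proba` method instead of `predict`, which is what the training loop above does for every fold; here we average those per-fold probabilities into a single prediction per row.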

In [None]:
# Blend the folds: one column per fold, averaged across columns for each test row.
preds = np.mean(np.column_stack(final_predictions), axis=1)

# Make predictions
# Index by the test frame's own `id` index rather than a variable left over
# from the training loop body, so this cell depends only on `test`,
# `final_predictions` and TARGET.
y_pred = pd.Series(
    preds,
    index=test.index,
    name=TARGET,
)

# Create submission file
y_pred.to_csv("submission.csv")