# Welcome to the September 2021 Tabular Playground Competition! #

In this competition, we predict whether a customer will make an insurance claim.

# Step1: Import Helpful Libraries #

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Step2: Load Data

In [None]:
# Both CSVs carry an `id` column, which is used as the row index.
df_train = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/train.csv",
    index_col='id',
)
test = pd.read_csv(
    "../input/tabular-playground-series-sep-2021/test.csv",
    index_col='id',
)

# The label is the last training column; every column before it is a feature.
TARGET = df_train.columns[-1]
FEATURES = df_train.columns[:-1].tolist()

df_train.head()

The target `'claim'` has binary outcomes: `0` for no claim and `1` for claim.

# Missing Values
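Following [TPS Sep 2021 single LGBM](https://www.kaggle.com/hiro5299834/tps-sep-2021-single-lgbm/notebook) by [@hiro5299834](https://www.kaggle.com/hiro5299834), we add two row-level features: the number of missing values in each row (`n_missing`) and the standard deviation of each row's feature values (`std`).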

In [None]:
# Row-level summary features, computed from the original feature columns only.
# The engineered names are filtered out of FEATURES first so that re-running
# this cell is safe: FEATURES never accumulates duplicate entries, and `std`
# is never computed over the engineered columns themselves.
ENGINEERED = ['n_missing', 'std']
base_features = [col for col in FEATURES if col not in ENGINEERED]

for frame in (df_train, test):
    # Number of missing feature values in each row.
    frame['n_missing'] = frame[base_features].isna().sum(axis=1)
    # Per-row standard deviation of the feature values.
    frame['std'] = frame[base_features].std(axis=1)

FEATURES = base_features + ENGINEERED
n_missing = df_train['n_missing'].copy()

# Step3: Train Model: XGBoost #

Let's try out a simple XGBoost model. This algorithm can handle missing values, but you could try imputing them instead. We use `XGBClassifier` (instead of `XGBRegressor`, for instance), since this is a classification problem.

In [None]:
from xgboost import XGBClassifier

# Feature matrix and labels used for 10-fold out-of-fold training.
X = df_train.loc[:, FEATURES]
y = df_train.loc[:, TARGET]

final_test_predictions = []   # one array of test-set probabilities per fold
final_valid_predictions = {}  # row id -> out-of-fold probability
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=X)):
    # KFold.split yields positional row numbers, not index labels, so rows are
    # selected with .iloc. (.loc would only pick the same rows if the `id`
    # index happened to be exactly 0..n-1.) The explicit copies keep the
    # in-place scaling below from writing into slices of X / test.
    X_train = X.iloc[train_indicies].copy()
    X_valid = X.iloc[valid_indicies].copy()
    X_test = test[FEATURES].copy()

    # The index of the validation rows is their `id`.
    valid_ids = X_valid.index.values.tolist()
    y_train = y.iloc[train_indicies]
    y_valid = y.iloc[valid_indicies]

    # Fit the scaler on the training fold only, then apply the same
    # transformation to the validation fold and the test set.
    scaler = StandardScaler()
    X_train[FEATURES] = scaler.fit_transform(X_train[FEATURES])
    X_valid[FEATURES] = scaler.transform(X_valid[FEATURES])
    X_test[FEATURES] = scaler.transform(X_test[FEATURES])

    model = XGBClassifier(
        max_depth=3,
        subsample=0.5,
        colsample_bytree=0.5,
        learning_rate= 0.01187431306013263,
        n_estimators= 10000,
        n_jobs=-1,
        use_label_encoder=False,
        objective='binary:logistic',
        tree_method='gpu_hist',  # Use GPU
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # n_estimators is only an upper bound: training stops once the AUC on the
    # last eval_set entry (the validation fold) has not improved for 200 rounds.
    model.fit(X_train, y_train,
             verbose = False,
             eval_set = [(X_train, y_train), (X_valid, y_valid)],
             eval_metric = "auc",
             early_stopping_rounds = 200)

    # Column 1 of predict_proba is the probability of the positive class.
    preds_valid = model.predict_proba(X_valid)[:,1]
    preds_test = model.predict_proba(X_test)[:,1]
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    print(fold, roc_auc_score(y_valid, preds_valid))

In [None]:
# Out-of-fold predictions: turn the {id: probability} mapping into a
# two-column table and save it for the blending step.
final_valid_predictions = (
    pd.DataFrame.from_dict(final_valid_predictions, orient="index")
    .reset_index()
)
final_valid_predictions.columns = ["id", "pred_1"]
final_valid_predictions.to_csv("train_pred_1.csv", index=False)

# Test predictions: average the per-fold probabilities row by row and store
# them next to the ids taken from the sample submission.
sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
sub["claim"] = np.column_stack(final_test_predictions).mean(axis=1)
sub.columns = ["id", "pred_1"]
sub.to_csv("test_pred_1.csv", index=False)

# Train Model: LGBM

In [None]:
from lightgbm import LGBMClassifier

# Feature matrix and labels used for 10-fold out-of-fold training.
X = df_train.loc[:, FEATURES]
y = df_train.loc[:, TARGET]

final_test_predictions = []   # one array of test-set probabilities per fold
final_valid_predictions = {}  # row id -> out-of-fold probability
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=X)):
    # KFold.split yields positional row numbers, not index labels, so rows are
    # selected with .iloc. (.loc would only pick the same rows if the `id`
    # index happened to be exactly 0..n-1.) The explicit copies keep the
    # in-place scaling below from writing into slices of X / test.
    X_train = X.iloc[train_indicies].copy()
    X_valid = X.iloc[valid_indicies].copy()
    X_test = test[FEATURES].copy()

    # The index of the validation rows is their `id`.
    valid_ids = X_valid.index.values.tolist()
    y_train = y.iloc[train_indicies]
    y_valid = y.iloc[valid_indicies]

    # Fit the scaler on the training fold only, then apply the same
    # transformation to the validation fold and the test set.
    scaler = StandardScaler()
    X_train[FEATURES] = scaler.fit_transform(X_train[FEATURES])
    X_valid[FEATURES] = scaler.transform(X_valid[FEATURES])
    X_test[FEATURES] = scaler.transform(X_test[FEATURES])

    # NOTE(review): LightGBM only applies `subsample` (bagging_fraction) when
    # `subsample_freq` (bagging_freq) is > 0; with the default of 0 the
    # subsample=0.5 setting below has no effect. Left unchanged so the tuned
    # configuration is preserved — confirm whether bagging was intended.
    model = LGBMClassifier(
        max_depth = 3,
        num_leaves = 7,
        n_estimators = 10000,
        colsample_bytree = 0.3,
        subsample = 0.5,
        random_state = 42,
        reg_alpha=18,
        reg_lambda=17,
        learning_rate = 0.095,
        device = 'gpu',
        objective= 'binary'
    )

    # n_estimators is only an upper bound: training stops early once the
    # validation AUC has not improved for 200 rounds.
    model.fit(X_train, y_train,
             verbose = False,
             eval_set = [(X_train, y_train), (X_valid, y_valid)],
             eval_metric = "auc",
             early_stopping_rounds = 200)

    # Column 1 of predict_proba is the probability of the positive class.
    preds_valid = model.predict_proba(X_valid)[:,1]
    preds_test = model.predict_proba(X_test)[:,1]
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    print(fold, roc_auc_score(y_valid, preds_valid))

In [None]:
# Out-of-fold predictions: turn the {id: probability} mapping into a
# two-column table and save it for the blending step.
final_valid_predictions = (
    pd.DataFrame.from_dict(final_valid_predictions, orient="index")
    .reset_index()
)
final_valid_predictions.columns = ["id", "pred_2"]
final_valid_predictions.to_csv("train_pred_2.csv", index=False)

# Test predictions: average the per-fold probabilities row by row and store
# them next to the ids taken from the sample submission.
sub = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')
sub["claim"] = np.column_stack(final_test_predictions).mean(axis=1)
sub.columns = ["id", "pred_2"]
sub.to_csv("test_pred_2.csv", index=False)

# Step4: Blending

In [None]:
# Reload the raw CSVs; here `id` stays a regular column so it can be used as
# the merge key.
df = pd.read_csv("../input/tabular-playground-series-sep-2021/train.csv")
df_test = pd.read_csv("../input/tabular-playground-series-sep-2021/test.csv")

# Level-1 predictions saved by the two model sections.
df1, df2 = (pd.read_csv(path) for path in ("train_pred_1.csv", "train_pred_2.csv"))
df_test1, df_test2 = (pd.read_csv(path) for path in ("test_pred_1.csv", "test_pred_2.csv"))

# Attach them as `pred_1` / `pred_2` columns, matching rows on `id`.
df = df.merge(df1, on="id", how="left").merge(df2, on="id", how="left")
df_test = df_test.merge(df_test1, on="id", how="left").merge(df_test2, on="id", how="left")

df.head()

In [None]:
# Preview the first 100 test rows together with the merged prediction columns.
df_test.head(n=100)

In [None]:
from sklearn.linear_model import LogisticRegression

# From here on FEATURES refers to the two level-1 prediction columns.
FEATURES = ["pred_1", "pred_2"]
df_test = df_test[FEATURES]

final_predictions = []  # per-fold test-set probabilities from the blender
valid_predictions = []  # per-fold validation probabilities from the blender
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Split the blending frame itself rather than a variable left over from an
# earlier cell, so this cell does not depend on stale notebook state.
for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=df)):
    # KFold.split yields positional row numbers, hence .iloc. The validation
    # rows must come from valid_indicies: scoring the blender on its own
    # training rows would report an optimistic, non-out-of-fold AUC.
    X_train = df.iloc[train_indicies].copy()
    X_valid = df.iloc[valid_indicies].copy()
    X_test = df_test.copy()

    y_train = X_train.claim
    y_valid = X_valid.claim

    # Fit the scaler on the training fold only, then apply the same
    # transformation to the validation fold and the test set.
    scaler = StandardScaler()
    X_train[FEATURES] = scaler.fit_transform(X_train[FEATURES])
    X_valid[FEATURES] = scaler.transform(X_valid[FEATURES])
    X_test[FEATURES] = scaler.transform(X_test[FEATURES])

    # Keep only the blender inputs (drops id, raw features and the label).
    X_train = X_train[FEATURES]
    X_valid = X_valid[FEATURES]

    model = LogisticRegression(fit_intercept=False)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    preds_valid = model.predict_proba(X_valid)[:,1]
    preds_test = model.predict_proba(X_test)[:,1]
    final_predictions.append(preds_test)
    valid_predictions.append(preds_valid)
    print(fold, roc_auc_score(y_valid, preds_valid))

# Make Submission #

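The submission contains predicted probabilities rather than hard 0/1 labels, which the competition allows. Every model in this notebook is therefore queried with scikit-learn's `predict_proba` method instead of `predict`, and the test-set probabilities are averaged over the folds.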

In [None]:
# Average the blender's test-set probabilities over the folds.
preds = np.column_stack(final_predictions).mean(axis=1)

# Make predictions: one probability per test id, named after the target column
y_pred = pd.Series(data=preds, index=sub["id"], name=TARGET)

# Create submission file
y_pred.to_csv("submission.csv")

In [None]:
# Preview the first 100 submitted probabilities.
y_pred.head(n=100)

In [None]:
# Dump the per-fold validation probabilities appended to `valid_predictions`
# by the blending loop (one array per fold).
print(valid_predictions)