In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Load Data

In [None]:
# All competition files are read from one input directory. Building every path
# from the same absolute root keeps the three reads consistent and independent
# of the notebook's current working directory.
DATA_DIR = "/kaggle/input/tabular-playground-series-sep-2021"

# The first CSV column becomes the index of the train and test frames.
train = pd.read_csv(f"{DATA_DIR}/train.csv", index_col=0)
test = pd.read_csv(f"{DATA_DIR}/test.csv", index_col=0)
# The sample file is read without index_col, so its first column stays a regular column.
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_solution.csv")

# Plot Data

In [None]:
# One histogram per column of the training frame; small tick labels and no
# grid keep the dense panel layout readable.
hist_options = dict(figsize=(20, 15), grid=False, xlabelsize=5, ylabelsize=5)
train.hist(**hist_options)
plt.show()

# Correlation with Claim

In [None]:
# Pairwise correlation matrix of the training columns, with rows ordered from
# the highest to the lowest correlation with the 'claim' column.
corrs = train.corr().sort_values(by='claim', ascending=False)

In [None]:
fig, ax = plt.subplots(figsize=(15, 20))

# Row 0 of the sorted table is skipped: 'claim' correlates perfectly with
# itself and therefore sorts first.
corr_with_target = corrs['claim'].values[1:]
column_names = corrs.index[1:]

sns.barplot(x=corr_with_target, y=column_names, orient="h", ax=ax)
ax.set_title("Correlation Between Feature Columns and Target Column (Claim)")
ax.set_xlabel("Correlation with Target")
ax.set_ylabel("Feature Columns")
plt.show()

# Scale Data

In [None]:
# Every column of the test frame is treated as a model input.
feature_cols = test.columns.tolist()

scaler = StandardScaler()

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both frames.
scaler.fit(train[feature_cols])
train[feature_cols] = scaler.transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

# Add Features

In [None]:
def add_feature(df, cols=None):
    """Add row-wise summary statistics of the feature columns to ``df``.

    The columns 'n_nans', 'std', 'mean', 'max', 'skew', 'sum' and 'var' are
    written into ``df`` in place. Each is computed across ``cols`` for every
    row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns; it is modified in place.
    cols : list of str, optional
        Columns to aggregate over. Defaults to the module-level
        ``feature_cols`` list, which keeps the original one-argument call
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the seven summary columns added.
    """
    if cols is None:
        cols = feature_cols

    # Take one snapshot of the inputs up front. Every statistic is then
    # computed from the same data, and the frame is sliced once instead of
    # seven times.
    base = df[list(cols)]

    df['n_nans'] = base.isnull().sum(axis=1)
    df['std']    = base.std(axis=1)
    df['mean']   = base.mean(axis=1)
    df['max']    = base.max(axis=1)
    df['skew']   = base.skew(axis=1)
    df['sum']    = base.sum(axis=1)
    df['var']    = base.var(axis=1)
    return df
        
engineered_cols = ['n_nans', 'std', 'mean', 'max', 'skew', 'sum', 'var']

# Make the cell safe to re-run: drop any engineered names left over from a
# previous execution first. Otherwise the row statistics would be computed
# over the statistics themselves, and feature_cols would hold duplicates.
feature_cols = [col for col in feature_cols if col not in engineered_cols]

# add_feature writes the new columns into the frame it is given; the result is
# then copied and rebound.
train = add_feature(train).copy()
test = add_feature(test).copy()

# From here on the engineered columns are model inputs as well.
feature_cols = feature_cols + engineered_cols

In [None]:
# Recompute the correlation matrix on the current training frame and order its
# rows by descending correlation with 'claim'.
corrs = train.corr().sort_values(by='claim', ascending=False)

In [None]:
# Rows 1..10 of the sorted table (row 0 is skipped), reversed so the values are
# handed to the bar trace in ascending order.
top_corr = corrs['claim'].iloc[1:11].iloc[::-1]

bars = go.Bar(x=top_corr.values, y=top_corr.index, orientation='h')
fig = go.Figure(bars)
fig.update_layout(
    title="Top 10 Positive Correlation Between Feature Columns and Claim Column",
    xaxis_title="Correlation with Claim",
    yaxis_title="Feature Columns",
    colorway=["blue"],
)
fig.show()

# KFold Data

In [None]:
# Tag every training row with the fold in which it serves as validation data.
train["kfold"] = -1
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# kf.split yields *positional* row numbers, while train is indexed by the
# first CSV column. Writing through .iloc keeps the assignment correct even
# when the index labels are not exactly 0..N-1; label-based .loc would
# otherwise tag the wrong rows or create new ones.
kfold_pos = train.columns.get_loc("kfold")
for fold, (_, valid_positions) in enumerate(kf.split(train, train["claim"])):
    train.iloc[valid_positions, kfold_pos] = fold

# CatBoost

In [None]:
# Hyper-parameters are identical for every fold, so the dict is built once
# instead of on each loop iteration.
cat_params = {
    'iterations': 15000,
    'random_seed': 42,
    'loss_function': 'Logloss',
    'depth': 8,
    'task_type' : 'GPU',
    'use_best_model': True,
    'eval_metric': 'AUC',
    'early_stopping_rounds': 1000,
    'learning_rate': 0.03,
    'border_count': 32,
    'l2_leaf_reg': 3,
    'verbose': False,
}

final_test_predictions = []   # one array of test probabilities per fold
final_valid_predictions = {}  # row label -> out-of-fold probability
scores = []                   # validation AUC per fold
cats = []                     # fitted model per fold

# Select the model inputs explicitly so the test frame is always scored on the
# same columns, in the same order, as the training data. The selection does
# not depend on the fold, so it is done once.
x_test = test[feature_cols]

for fold in range(5):
    # Rows tagged with this fold are held out; all other rows are used to fit.
    is_valid = train.kfold == fold
    x_train = train.loc[~is_valid, feature_cols]
    y_train = train.loc[~is_valid, 'claim']
    x_valid = train.loc[is_valid, feature_cols]
    y_valid = train.loc[is_valid, 'claim']

    # Index labels of the held-out rows; written out below as the "id" column.
    valid_ids = x_valid.index

    # train model
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])
    cats.append(cat_model)
    # predict values (probability of the positive class)
    preds_valid = cat_model.predict_proba(x_valid)[:, 1]
    preds_test  = cat_model.predict_proba(x_test)[:, 1]
    # store predicted values
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # measure metrics
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold, ", AUC:", auc)
    scores.append(auc)

print("AVG AUC:",np.mean(scores))

# Persist the out-of-fold predictions, keyed by row label, for the blending step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_cb"]
final_valid_predictions.to_csv("train_pred_cb.csv", index=False)

# Average the per-fold test probabilities row-wise and persist them too.
# NOTE(review): assumes sample_submission has exactly two columns and lists
# its rows in the same order as test - confirm against the input files.
ss = sample_submission.copy()
ss['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
ss.columns = ["id", "pred_cb"]
ss.to_csv("test_pred_cb.csv", index=False)

# Keep the per-fold test predictions for the comparison plot further down.
cb_test_preds_folds = final_test_predictions

# XGBoost

In [None]:
# Hyper-parameters are identical for every fold, so the dict is built once
# instead of on each loop iteration.
xgb_params = {
      'max_depth': 2,
      'learning_rate': 0.021537077920105466,
      'n_estimators': 10606,
      'min_child_weight': 150,
      'gamma': 0.11611920725914951,
      'alpha': 0.0021839958087869794,
      'lambda': 0.0018567979557499344,
      'colsample_bytree': 0.7139742731494992,
      'subsample': 0.6258627743440968,
      'tree_method': 'gpu_hist',
      'booster': 'gbtree',
      'seed': 42,
      'use_label_encoder': False,
      'eval_metric': 'auc'
}

final_test_predictions = []   # one array of test probabilities per fold
final_valid_predictions = {}  # row label -> out-of-fold probability
scores = []                   # validation AUC per fold
xgbs = []                     # fitted model per fold

# Select the model inputs explicitly so the test frame is always scored on the
# same columns, in the same order, as the training data. The selection does
# not depend on the fold, so it is done once.
x_test = test[feature_cols]

for fold in range(5):
    # Rows tagged with this fold are held out; all other rows are used to fit.
    is_valid = train.kfold == fold
    x_train = train.loc[~is_valid, feature_cols]
    y_train = train.loc[~is_valid, 'claim']
    x_valid = train.loc[is_valid, feature_cols]
    y_valid = train.loc[is_valid, 'claim']

    # Index labels of the held-out rows; written out below as the "id" column.
    valid_ids = x_valid.index

    # train model
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)
    xgbs.append(xgb_model)
    # predict values (probability of the positive class)
    preds_valid = xgb_model.predict_proba(x_valid)[:,1]
    preds_test  = xgb_model.predict_proba(x_test)[:,1]
    # store predicted values
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    # measure metrics
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold, ", AUC:", auc)
    scores.append(auc)


print("AVG AUC:",np.mean(scores))

# Persist the out-of-fold predictions, keyed by row label, for the blending step.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id", "pred_xgb"]
final_valid_predictions.to_csv("train_pred_xgb.csv", index=False)

# Average the per-fold test probabilities row-wise and persist them too.
# NOTE(review): assumes sample_submission has exactly two columns and lists
# its rows in the same order as test - confirm against the input files.
ss = sample_submission.copy()
ss['claim'] = np.mean(np.column_stack(final_test_predictions), axis=1)
ss.columns = ["id", "pred_xgb"]
ss.to_csv("test_pred_xgb.csv", index=False)

# Keep the per-fold test predictions for the comparison plot further down.
xgb_test_preds_folds = final_test_predictions

# Plot Test Set Predictions

In [None]:
# Five CatBoost fold predictions followed by five XGBoost fold predictions;
# the labels are built in that same order.
test_preds_folds = cb_test_preds_folds + xgb_test_preds_folds
catboost_labels = [f'CatBoost fold {i}' for i in range(5)]
xgboost_labels = [f'XGBoost fold {i}' for i in range(5)]
labels = catboost_labels + xgboost_labels

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    test_preds_folds,
    labels,
    bin_size=.3,
    show_hist=False,
    show_rug=False,
)
fig.show()

# Blending

In [None]:
# Out-of-fold (train) and fold-averaged (test) predictions written to disk by
# the CatBoost and XGBoost cells.
df1 = pd.read_csv("train_pred_cb.csv")
df2 = pd.read_csv("train_pred_xgb.csv")

df_test1 = pd.read_csv("test_pred_cb.csv")
df_test2 = pd.read_csv("test_pred_xgb.csv")

# merge() returns a new frame and leaves train/test untouched, so no defensive
# copy of the large frames is needed beforehand.
# NOTE(review): on="id" relies on the train/test index being named "id"
# (presumably the name of the first CSV column) - confirm.
df = train.merge(df1, on="id", how="left").merge(df2, on="id", how="left")
df_test = test.merge(df_test1, on="id", how="left").merge(df_test2, on="id", how="left")

# A left join fills unmatched ids with NaN without complaint; fail loudly
# rather than train the blender on missing base-model predictions.
pred_cols = ["pred_cb", "pred_xgb"]
assert df[pred_cols].notna().all().all(), "train rows without base-model predictions after merge"
assert df_test[pred_cols].notna().all().all(), "test rows without base-model predictions after merge"

df.head()

# Correlation with Predicted Values

In [None]:
# Correlation matrix of the blended frame without the fold-assignment column,
# with rows ordered by descending correlation with 'claim'.
corrs = (
    df.drop(columns="kfold")
      .corr()
      .sort_values(by='claim', ascending=False)
)

In [None]:
# Rows 1..10 of the sorted table (row 0 is skipped), reversed so the values are
# handed to the bar trace in ascending order.
top_corr = corrs['claim'].iloc[1:11].iloc[::-1]

bars = go.Bar(x=top_corr.values, y=top_corr.index, orientation='h')
fig = go.Figure(bars)
fig.update_layout(
    title="Top 10 Positive Correlation Between Feature Columns and Claim Column",
    xaxis_title="Correlation with Claim",
    yaxis_title="Feature Columns",
    colorway=["blue"],
)
fig.show()

In [None]:
# Inputs of the second-level model: the missing-value count plus the two
# base-model predictions.
useful_features = ["n_nans", "pred_cb", "pred_xgb"]
df_test = df_test[useful_features]

# Settings of the second-level XGBoost model (the same for every fold).
params = dict(
    max_depth=2,
    learning_rate=0.021537077920105466,
    n_estimators=10606,
    min_child_weight=150,
    gamma=0.11611920725914951,
    alpha=0.0021839958087869794,
    colsample_bytree=0.7139742731494992,
    subsample=0.6258627743440968,
    tree_method='gpu_hist',
    booster='gbtree',
    seed=42,
    use_label_encoder=False,
    eval_metric='auc',
)
# 'lambda' is a Python keyword, so it cannot be written as a keyword argument.
params['lambda'] = 0.0018567979557499344

final_predictions = []
scores = []
x_test = df_test.copy()

for fold in range(5):
    # Split on the fold tag: the tagged rows validate, the rest train.
    held_out = df.kfold == fold
    x_train = df.loc[~held_out, useful_features]
    y_train = df.loc[~held_out, 'claim']
    x_valid = df.loc[held_out, useful_features]
    y_valid = df.loc[held_out, 'claim']

    model = XGBClassifier(**params)
    model.fit(x_train, y_train)

    # Positive-class probabilities for the held-out rows and for the test set.
    preds_valid = model.predict_proba(x_valid)[:,1]
    test_preds = model.predict_proba(x_test)[:,1]
    final_predictions.append(test_preds)

    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold, ", AUC:", auc)
    scores.append(auc)

print("AVG AUC:",np.mean(scores))

# Row-wise mean of the five fold predictions becomes the submitted 'claim'.
fold_matrix = np.column_stack(final_predictions)
sample_submission['claim'] = np.mean(fold_matrix, axis=1)

# Plot Test Set Predictions

In [None]:
# One density curve per model: the two base models and the blended result.
labels = ['CatBoost', 'XGBoost', 'Blending']
data = [
    df_test1.pred_cb,
    df_test2.pred_xgb,
    sample_submission.claim,
]

# Density curves only: histogram bars and rug marks are switched off.
fig = ff.create_distplot(
    data,
    labels,
    bin_size=.3,
    show_hist=False,
    show_rug=False,
)
fig.show()

# Submission

In [None]:
# Write the blended predictions to disk; the frame's row index is left out of the file.
sample_submission.to_csv(path_or_buf="submission.csv", index=False)