In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Data Preprocessing

## Load Data

In [None]:
# All three competition files are read from the same Kaggle input directory;
# keeping the location in one constant stops the paths from drifting apart
# (the original mixed an absolute and a relative spelling of this directory).
DATA_DIR = "/kaggle/input/tabular-playground-series-sep-2021"

# The first CSV column is used as the index of train/test (presumably the row
# id), so it never ends up among the feature columns.
train = pd.read_csv(f"{DATA_DIR}/train.csv", index_col=0)
test = pd.read_csv(f"{DATA_DIR}/test.csv", index_col=0)

# The submission template is read without index_col, so all of its columns
# are written back out when the submission file is saved.
sample_submission = pd.read_csv(f"{DATA_DIR}/sample_solution.csv")

## Plot Data

In [None]:
# Draw one histogram per column of the training frame; tick labels are shrunk
# so the many small panels stay readable.
train.hist(
    figsize=(20, 15),
    xlabelsize=5,
    ylabelsize=5,
    grid=False,
)
plt.show()

## Scale Data

In [None]:
# Every column of the test frame is treated as a model input.
feature_cols = list(test.columns)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both frames.
scaler = StandardScaler()
scaler.fit(train[feature_cols])

train[feature_cols] = scaler.transform(train[feature_cols])
test[feature_cols] = scaler.transform(test[feature_cols])

## Add Some Features

In [None]:
# Names of the row-wise statistic columns created by add_feature.
STAT_COLS = ['n_nans', 'std', 'mean', 'max', 'skew', 'sum', 'var']


def add_feature(df, cols=None):
    """Append row-wise summary statistics of the feature columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place and also returned.
    cols : list of str, optional
        Columns to summarise. When omitted, the notebook-level
        ``feature_cols`` list is used, minus any statistic columns that a
        previous run of this cell may already have appended to it.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns listed in ``STAT_COLS`` added or overwritten.
    """
    if cols is None:
        cols = [col for col in feature_cols if col not in STAT_COLS]

    # Select the feature subset once instead of once per statistic.
    feats = df[cols]

    df['n_nans'] = feats.isnull().sum(axis=1)
    df['std']    = feats.std(axis=1)
    df['mean']   = feats.mean(axis=1)
    df['max']    = feats.max(axis=1)
    df['skew']   = feats.skew(axis=1)
    df['sum']    = feats.sum(axis=1)
    df['var']    = feats.var(axis=1)
    return df


train = add_feature(train).copy()
test = add_feature(test).copy()

# Register the new columns as features; the guard keeps a re-run of this cell
# from appending the same names twice.
feature_cols += [col for col in STAT_COLS if col not in feature_cols]

## Simple Imputation

In [None]:
# Fill missing values with per-column medians learned from the training frame;
# the same fitted medians are then applied to the test frame.
imp = SimpleImputer(strategy="median")
imp.fit(train[feature_cols])

train[feature_cols] = imp.transform(train[feature_cols])
test[feature_cols] = imp.transform(test[feature_cols])

# Correlation
Correlation can only detect **linear relationships** between columns.

In [None]:
# Pairwise correlation of all training columns, with rows ordered from the
# highest to the lowest correlation with the `claim` column.
corrs = train.corr().sort_values(by="claim", ascending=False)

In [None]:
# Plot the `claim` column of the sorted correlation table, skipping its first
# row (position 0 is sliced off).
target_corr = corrs["claim"].iloc[1:]

fig = plt.figure(figsize=(15, 20))
ax = sns.barplot(x=target_corr.values, y=target_corr.index, orient="h")
ax.set_title("Correlation Between Feature Columns and Target Column (Claim)")
ax.set_xlabel("Correlation with Target")
ax.set_ylabel("Feature Columns")
plt.show()

# Mutual Information
Unlike correlation, **mutual information** is not limited to linear relationships. It can detect any type of relationship between columns.

To avoid an out-of-memory (OOM) error, I compute it on a subset of the data (the first 5000 rows) only.

In [None]:
# Score only the first 5000 training rows to keep the computation small.
mi_sample = train.iloc[:5000]
x = mi_sample[feature_cols].copy()
y = mi_sample['claim'].copy()

# `claim` is handled as a class label elsewhere in this notebook
# (StratifiedKFold / XGBClassifier), so the classification variant of the
# estimator is the matching one. A fixed random_state makes the scores
# reproducible between runs.
mi_scores = mutual_info_classif(x, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
# Show only the `top` highest-scoring features (mi_scores is already sorted
# in descending order).
top = 10
top_scores = mi_scores.iloc[:top]

fig = px.bar(mi_scores, x=top_scores.values, y=top_scores.index)
fig.update_layout(
    colorway=["blue"],
    yaxis={'categoryorder':'total ascending'},
    yaxis_title="Feature Columns",
    xaxis_title="Relationship with Claim",
    title=f"Top {top} Strong Relationships Between Feature Columns and Claim Column",
)
fig.show()

# K-Means Clustering

Let's add clusters generated by K-Means (unsupervised algorithm) to see the effect.

In [None]:
n_clusters = 8

# Fit K-Means on the training features and store each row's cluster id.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_ids = kmeans.fit_predict(train[feature_cols])
train["cluster"] = cluster_ids

train.head()

In [None]:
# Assign test rows to the clusters learned on train. This must happen before
# "cluster" joins feature_cols, because the model was fitted without it.
test["cluster"] = kmeans.predict(test[feature_cols])

feature_cols.append("cluster")

In [None]:
# One bar per cluster, summarising `claim` within that cluster using seaborn's
# default estimator.
fig = plt.figure(figsize=(10, 5))
sns.barplot(x='cluster', y='claim', data=train)
plt.show()

## One-Hot Encode Cluster Column

In [None]:
# The encoder expects a 2-D input, so each cluster column is reshaped to
# (n_rows, 1) before encoding.
train_clusters = train["cluster"].to_numpy().reshape(-1, 1)
test_clusters = test["cluster"].to_numpy().reshape(-1, 1)

# Fit the encoder on the training clusters and reuse it for test.
ohe = OneHotEncoder()
X_ohe = ohe.fit_transform(train_clusters).toarray()
T_ohe = ohe.transform(test_clusters).toarray()

# Column names cluster1 .. cluster<n_clusters>.
ohe_cols = [f"cluster{idx}" for idx in range(1, n_clusters + 1)]

# Wrap the dense arrays in frames aligned to the original row indices so the
# concatenation below lines up row by row.
X_ohe = pd.DataFrame(X_ohe, index=train.index, columns=ohe_cols)
T_ohe = pd.DataFrame(T_ohe, index=test.index, columns=ohe_cols)

train = pd.concat([train, X_ohe], axis=1)
test = pd.concat([test, T_ohe], axis=1)

train.head()

In [None]:
# Replace the raw cluster id with its one-hot columns in the feature list.
# The entry is removed by name rather than by position, so the result does
# not depend on "cluster" happening to be the last element. The list is
# updated in place so any other reference to it sees the change.
feature_cols[:] = [col for col in feature_cols if col != "cluster"]
feature_cols.extend(col for col in ohe_cols if col not in feature_cols)

# Drop the raw id column from both frames; errors="ignore" keeps a re-run of
# this cell from failing once the column is already gone.
train = train.drop(columns="cluster", errors="ignore")
test = test.drop(columns="cluster", errors="ignore")

# PCA

In [None]:
# Project the current feature set onto its first two principal components.
# random_state is fixed, like the other seeded steps in this notebook, so the
# components are reproducible if a randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(train[feature_cols])
T_pca = pca.transform(test[feature_cols])

pca_cols = ["PC1","PC2"]

# Re-attach the original row indices so the concatenation aligns row by row.
X_pca = pd.DataFrame(X_pca, columns=pca_cols, index=train.index)
T_pca = pd.DataFrame(T_pca, columns=pca_cols, index=test.index)

train = pd.concat([train, X_pca], axis=1)
test = pd.concat([test, T_pca], axis=1)
train.head()

In [None]:
# Scatter of the two principal components, coloured by the `claim` column.
fig = plt.figure(figsize=(15, 10))
sns.scatterplot(x="PC1", y="PC2", hue="claim", data=train)
plt.show()

In [None]:
# Make the principal-component columns part of the model's feature list.
feature_cols.extend(pca_cols)

In [None]:
# pca_features = ["f43", "f80", "f27", "f20"]

# pca = PCA()
# X_pca = pca.fit_transform(train[pca_features])
# T_pca = pca.transform(test[pca_features])

# pca_cols = [f"PC{i+1}" for i in range(X_pca.shape[1])]

# X_pca = pd.DataFrame(X_pca, columns=pca_cols, index=train.index)
# T_pca = pd.DataFrame(T_pca, columns=pca_cols, index=test.index)

# train = pd.concat([train, X_pca], axis=1)
# test = pd.concat([test, T_pca],axis=1)

# feature_cols += pca_cols

# train.head()

## MI Score for New Features?

In [None]:
# Re-score the first 5000 training rows, now including the engineered columns
# that have been added to feature_cols.
mi_sample = train.iloc[:5000]
x = mi_sample[feature_cols].copy()
y = mi_sample['claim'].copy()

# `claim` is handled as a class label elsewhere in this notebook
# (StratifiedKFold / XGBClassifier), so the classification variant of the
# estimator is the matching one. A fixed random_state makes the scores
# reproducible between runs.
mi_scores = mutual_info_classif(x, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
# Show only the `top` highest-scoring features (mi_scores is already sorted
# in descending order).
top = 10
top_scores = mi_scores.iloc[:top]

fig = px.bar(mi_scores, x=top_scores.values, y=top_scores.index)
fig.update_layout(
    colorway=["blue"],
    yaxis={'categoryorder':'total ascending'},
    yaxis_title="Feature Columns",
    xaxis_title="Relationship with Claim",
    title=f"Top {top} Strong Relationships Between Feature Columns and Claim Column",
)
fig.show()

# KFold Data

In [None]:
# Tag every training row with the fold in which it serves as validation data.
train["kfold"] = -1
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# kf.split yields *positional* row numbers, while `train` is indexed by the
# first CSV column. Writing through .iloc keeps the assignment correct even
# when the index labels are not exactly 0..n-1 (label-based .loc would
# silently hit the wrong rows or add new ones in that case).
kfold_pos = train.columns.get_loc("kfold")

for fold, (_, valid_indices) in enumerate(kf.split(train, train["claim"])):
    train.iloc[valid_indices, kfold_pos] = fold

# XGBoost

In [None]:
# Hyper-parameters are identical for every fold, so they are defined once
# outside the loop.
# NOTE(review): 'gpu_hist' needs a GPU-enabled XGBoost runtime — confirm the
# execution environment provides one.
xgb_params = {
    'max_depth': 2,
    'learning_rate': 0.021537077920105466,
    'n_estimators': 10606,
    'min_child_weight': 150,
    'gamma': 0.11611920725914951,
    'alpha': 0.0021839958087869794,
    'lambda': 0.0018567979557499344,
    'colsample_bytree': 0.7139742731494992,
    'subsample': 0.6258627743440968,
    'tree_method': 'gpu_hist',
    'booster': 'gbtree',
    'seed': 42,
    'use_label_encoder': False,
    'eval_metric': 'auc'
}

final_test_predictions = []  # one array of test probabilities per fold
scores = []                  # validation AUC per fold

# Predict on exactly the columns (and column order) the model is trained on,
# instead of relying on `test` happening to contain only those columns.
x_test = test[feature_cols]

for fold in range(5):
    # Rows tagged with the current fold are held out for validation.
    is_valid = train.kfold == fold

    x_train = train.loc[~is_valid, feature_cols]
    y_train = train.loc[~is_valid, 'claim']
    x_valid = train.loc[is_valid, feature_cols]
    y_valid = train.loc[is_valid, 'claim']

    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    # Column 1 of predict_proba is the probability of the positive class.
    preds_valid = xgb_model.predict_proba(x_valid)[:,1]
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold, ", AUC:", auc)
    scores.append(auc)

    preds_test = xgb_model.predict_proba(x_test)[:,1]
    final_test_predictions.append(preds_test)


print("AVG AUC:",np.mean(scores))

# Plot Test Predictions

In [None]:
# One density curve per fold model, drawn over that model's test predictions.
labels = [f'fold {fold_no}' for fold_no in range(5)]

fig = ff.create_distplot(
    final_test_predictions,
    labels,
    bin_size=.3,
    show_hist=False,
    show_rug=False,
)
fig.show()

# Submission

In [None]:
# Stack the per-fold prediction vectors as columns and average across folds
# to get one blended probability per test row.
fold_matrix = np.column_stack(final_test_predictions)
sample_submission['claim'] = np.mean(fold_matrix, axis=1)

sample_submission.to_csv("submission.csv", index=False)