In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd

from xgboost import XGBClassifier

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import shap

# Load Data

In [None]:
%%time
# Train and test tables take their first CSV column as the row index.
train = pd.read_csv(
    '../input/tabular-playground-series-nov-2021/train.csv',
    index_col=0,
)
test = pd.read_csv(
    '../input/tabular-playground-series-nov-2021/test.csv',
    index_col=0,
)
# The submission template is read with pandas' default integer index.
sample_submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

# Every column of the test table is a model input; later cells extend this
# list in place as engineered features are added.
feature_cols = list(test.columns)

# Plot Data

In [None]:
# One histogram per column of the training table, drawn on a single large
# figure with small tick labels so the grid of panels stays readable.
hist_options = dict(figsize=(20, 15), grid=False, xlabelsize=5, ylabelsize=5, bins=30)
train.hist(**hist_options)
plt.show()

# Scale Data

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both tables so the test set never influences the fit.
sc = StandardScaler().fit(train[feature_cols])
for frame in (train, test):
    frame[feature_cols] = sc.transform(frame[feature_cols])

# KMeans

In [None]:
# Hand-picked subset of raw features that the KMeans step clusters on.
useful_features = [
    "f34", "f55", "f43", "f8", "f91",
    "f80", "f71", "f27", "f50", "f97",
    "f41", "f25", "f57", "f66", "f22",
    "f96", "f82", "f26", "f81", "f40",
]

In [None]:
# %%time

## You can uncomment and run the following lines to find the elbow point
# inertia = {}
# for i in range(2,18):
#     kmeans = KMeans(n_clusters=i, random_state=42)
#     kmeans.fit_predict(train[useful_features])
#     inertia.update({i:kmeans.inertia_})

# inertia_df = pd.Series(inertia)
# plt.plot(inertia_df,marker="o")
# plt.xticks(inertia_df.index)
# plt.xlabel("Number of clusters")
# plt.ylabel("Inertia")
# plt.show()

In [None]:
%%time
n_clusters = 5
cd_feature = True  # cluster distance instead of cluster number

kmeans = KMeans(n_clusters=n_clusters, random_state=42)

if not cd_feature:
    # Single categorical-style feature: the id of the nearest centroid.
    cluster_cols = ["cluster"]
    train["cluster"] = kmeans.fit_predict(train[useful_features])
    test["cluster"] = kmeans.predict(test[useful_features])
else:
    # One feature per centroid: the distance from each row to that centroid.
    # The model is fitted on train and only applied to test.
    cluster_cols = [f"cluster{k}" for k in range(1, n_clusters + 1)]

    train_distances = pd.DataFrame(
        kmeans.fit_transform(train[useful_features]),
        columns=cluster_cols,
        index=train.index,
    )
    test_distances = pd.DataFrame(
        kmeans.transform(test[useful_features]),
        columns=cluster_cols,
        index=test.index,
    )

    train = train.join(train_distances)
    test = test.join(test_distances)

# Register the new columns as model inputs (mutates the shared list in place).
feature_cols += cluster_cols

train.head()

# Adding New Features?

In [None]:
%%time
# Pair plot of the cluster features against the target, drawn on a fixed
# 1000-row sample so the figure renders in reasonable time.
pairplot_sample = train[cluster_cols + ["target"]].sample(1000, random_state=0)
sns.pairplot(pairplot_sample, hue="target", diag_kind='kde')
plt.show()

In [None]:
# Differences between cluster-distance columns, built identically for both
# tables. Requires the cluster1/cluster3/cluster4 columns to exist, i.e. the
# distance-feature variant of the KMeans step with at least four clusters.
for frame in (train, test):
    frame["new_f1"] = frame["cluster1"] - frame["cluster4"]
    frame["new_f2"] = frame["cluster3"] - frame["cluster4"]

In [None]:
# Visual check of how the two difference features separate the target classes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=train, x="new_f1", y="new_f2", hue="target", alpha=0.8, ax=ax)
plt.show()

In [None]:
# Difference and sum of the two engineered features, built identically for
# both tables.
for frame in (train, test):
    frame["new_f3"] = frame["new_f1"] - frame["new_f2"]
    frame["new_f4"] = frame["new_f1"] + frame["new_f2"]

In [None]:
# Visual check of the difference/sum features against the target classes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=train, x="new_f3", y="new_f4", hue="target", alpha=0.8, ax=ax)
plt.show()

In [None]:
# new_f3 is discarded since it doesn't look promising based on above plots;
# the remaining engineered columns become model inputs.
train = train.drop(columns="new_f3")
test = test.drop(columns="new_f3")

feature_cols.extend(["new_f1", "new_f2", "new_f4"])

# Logistic Regression

In [None]:
# Assign every training row to one of `folds` stratified validation folds.
# The fold id is stored in a "kfold" column that the modelling cells filter on.
folds = 5
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

train["kfold"] = -1  # -1 marks "not yet assigned"
kfold_position = train.columns.get_loc("kfold")

# `kf.split` yields *positional* row numbers, while `train` is indexed by the
# id column read from the CSV. Writing through `.iloc` keeps the assignment
# correct whatever the index labels are; label-based `.loc` would only be right
# if the ids happened to be exactly 0..n-1.
for fold, (_, valid_positions) in enumerate(kf.split(train, train["target"])):
    train.iloc[valid_positions, kfold_position] = fold

In [None]:
%%time
# Out-of-fold logistic regression.
# For each fold a model is fitted on the other folds; its validation
# predictions are written to train["lr"] and its test predictions are averaged
# over folds into test["lr"].
scores = []

# Float initialisation: the columns receive probabilities, so starting from an
# integer 0 would force a dtype change on first assignment.
train["lr"] = 0.0
test["lr"] = 0.0

# "lr" is not part of feature_cols at this point, so the test design matrix is
# the same for every fold and can be built once.
x_test = test[feature_cols].copy()

for fold in range(folds):
    is_valid = train["kfold"] == fold

    x_train = train.loc[~is_valid, feature_cols]
    y_train = train.loc[~is_valid, "target"]
    x_valid = train.loc[is_valid, feature_cols]
    y_valid = train.loc[is_valid, "target"]

    lr_model = LogisticRegression()
    lr_model.fit(x_train, y_train)

    preds_train = lr_model.predict_proba(x_train)[:,1]
    preds_valid = lr_model.predict_proba(x_valid)[:,1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold,", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
    scores.append(auc)

    preds_test = lr_model.predict_proba(x_test)[:,1]
    # Single-step label-based assignment. The chained form
    # train["lr"].loc[...] = ... writes to a temporary object and is silently
    # lost when pandas copy-on-write is active.
    train.loc[x_valid.index, "lr"] = preds_valid
    test["lr"] += preds_test

# Turn the accumulated sum of per-fold test predictions into their mean.
test["lr"] /= folds
print("AVG AUC:",np.mean(scores))

In [None]:
# Out-of-fold logistic-regression score against new_f4, coloured by target.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=train, x="lr", y="new_f4", hue="target", alpha=0.8, ax=ax)
plt.show()

In [None]:
# From here on the logistic-regression prediction column is a model input.
feature_cols += ["lr"]

# PCA

In [None]:
# Project the current feature set onto its first two principal components.
# The projection is fitted on train and only applied to test.
pca = PCA(n_components=2, random_state=42)
train_components = pca.fit_transform(train[feature_cols])
test_components = pca.transform(test[feature_cols])

pca_cols = [f"PC{k}" for k in range(1, train_components.shape[1] + 1)]

# Wrap the arrays with the owning table's index so the column-wise concat
# below aligns row by row.
X_pca = pd.DataFrame(train_components, index=train.index, columns=pca_cols)
T_pca = pd.DataFrame(test_components, index=test.index, columns=pca_cols)

train = pd.concat([train, X_pca], axis=1)
test = pd.concat([test, T_pca], axis=1)
train.head()

In [None]:
# Table of component loadings: one row per principal component, one column
# per input feature, rendered with in-cell bars centred on zero.
loadings = pd.DataFrame(data=pca.components_, columns=feature_cols, index=pca_cols)
loadings.style.bar(color=['#d65f5f', '#5fba7d'], align='mid')

In [None]:
# Training rows in the plane of the first two principal components.
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=train, x="PC1", y="PC2", hue="target", alpha=0.8, ax=ax)
plt.show()

In [None]:
# The two principal-component columns become model inputs as well.
feature_cols.extend(["PC1", "PC2"])

# Mutual Information

In [None]:
%%time
# Mutual information between each feature and the target, estimated on the
# first MI_SAMPLE_ROWS training rows to keep the runtime short.
MI_SAMPLE_ROWS = 5000

x = train[feature_cols].head(MI_SAMPLE_ROWS).copy()
y = train["target"].head(MI_SAMPLE_ROWS).copy()

# The target is a class label, so the classification estimator is used; the
# regression variant treats the target as a continuous quantity. The estimator
# is randomised, so the seed is fixed to make the ranking reproducible.
mi_scores = mutual_info_classif(x, y, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
# Horizontal bar chart of the `top` highest-scoring features; mi_scores is
# already sorted in descending order.
top = 20
top_scores = mi_scores.iloc[:top]

fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x=top_scores.values, y=top_scores.index, palette="summer", ax=ax)
ax.set_title(f"Top {top} Strong Relationships Between Feature Columns and Target Column")
ax.set_xlabel("Relationship with Target")
ax.set_ylabel("Feature Columns")
plt.show()

# XGBoost

In [None]:
%%time
# Cross-validated XGBoost.
# One model per fold; each fold's test predictions are collected in
# final_test_predictions and each fold's validation AUC in scores. After the
# loop, xgb_model / x_valid hold the objects of the LAST fold, which the SHAP
# cells below rely on.
final_test_predictions = []
scores = []

# The hyper-parameters are the same for every fold, so the dict is built once.
# NOTE(review): 'gpu_hist', 'gpu_id', 'predictor' and 'use_label_encoder' may be
# deprecated or rejected by newer XGBoost releases - confirm against the
# installed version.
xgb_params = {
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'n_estimators': 10000,
    'learning_rate': 0.01063045229441343,
    'gamma': 0.24652519525750877,
    'max_depth': 4,
    'seed': 42,
    'min_child_weight': 366,
    'subsample': 0.6423040816299684,
    'colsample_bytree': 0.7751264493218339,
    'colsample_bylevel': 0.8675692743597421,
    'use_label_encoder': False,
    'lambda': 0,
    'alpha': 10
}

# The test design matrix does not change between folds either.
x_test = test[feature_cols].copy()

for fold in range(folds):
    in_valid_fold = train["kfold"] == fold

    x_train = train.loc[~in_valid_fold, feature_cols]
    y_train = train.loc[~in_valid_fold, "target"]
    x_valid = train.loc[in_valid_fold, feature_cols]
    y_valid = train.loc[in_valid_fold, "target"]

    xgb_model = XGBClassifier(**xgb_params)
    # The validation fold is passed as an evaluation set; no early-stopping
    # argument is given, so all n_estimators rounds are trained.
    xgb_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    preds_train = xgb_model.predict_proba(x_train)[:,1]
    preds_valid = xgb_model.predict_proba(x_valid)[:,1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold,", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
    scores.append(auc)

    final_test_predictions.append(xgb_model.predict_proba(x_test)[:,1])


print("AVG AUC:",np.mean(scores))

# SHAP Values

In [None]:
# SHAP values of the last fold's model on the last fold's validation rows
# (both are left over from the cross-validation loop above).
tree_explainer = shap.TreeExplainer(xgb_model)
shap_values = tree_explainer.shap_values(x_valid)
shap.summary_plot(shap_values, features=x_valid)

In [None]:
# How the SHAP contribution of the "lr" feature varies with its value.
shap.dependence_plot(ind="lr", shap_values=shap_values, features=x_valid)

In [None]:
# Explain a single validation row: print the model's class probabilities for
# it, then show the per-feature SHAP contributions as a force plot.
idx = 5
data_for_prediction = x_valid.iloc[idx]
# The row is reshaped to a 1 x n_features array for the model and explainer.
data_for_prediction_array = data_for_prediction.to_numpy().reshape(1, -1)


print(xgb_model.predict_proba(data_for_prediction_array))

shap.initjs()
explainer = shap.TreeExplainer(xgb_model)
# Overwrites the full-matrix shap_values from the summary cell with the values
# for this single row; the decision-plot cell below reads this version.
shap_values = explainer.shap_values(data_for_prediction_array)
shap.force_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=data_for_prediction,
)

In [None]:
# Decision plot for the same single row explained in the force plot.
shap.decision_plot(
    base_value=explainer.expected_value,
    shap_values=shap_values,
    features=data_for_prediction,
)

# Plot Prediction

In [None]:
# Distribution of the final test prediction, i.e. the per-row mean of the
# fold models' probabilities.
mean_test_prediction = np.column_stack(final_test_predictions).mean(axis=1)

fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(x=mean_test_prediction, kde=True, color="blue", ax=ax)
ax.set_title("Predictions Distribution")
ax.set_xlabel("Prediction")
plt.show()

# Submission

In [None]:
# Average the fold models' test probabilities row-wise and write them into the
# submission template. The values are assigned by position, so the template
# rows are assumed to be in the same order as the test table.
fold_averaged_prediction = np.column_stack(final_test_predictions).mean(axis=1)
sample_submission['target'] = fold_averaged_prediction
sample_submission.to_csv("submission.csv", index=False)