In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import cudf

from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import shap

In [None]:
%%time
# Parse the train and test tables with cuDF; index_col=0 makes the first
# CSV column the frame index instead of a regular column.
train, test = (
    cudf.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
    )
)

# The submission template is only written back at the very end, so it is
# converted to a pandas frame straight away.
sample_submission = cudf.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv").to_pandas()

# Footprint of the freshly loaded train table (bytes / 1024**2), kept so the
# effect of the later dtype downcast can be reported.
memory_usage = train.memory_usage(deep=True) / 1024 ** 2
start_mem = memory_usage.sum()

In [None]:
# Model inputs are taken to be exactly the columns of the test table.
feature_cols = test.columns.tolist()

# Split the inputs by the dtype inferred for the train table: float64 columns
# are treated as continuous, every other column as categorical.
is_float64 = {name: train[name].dtype == 'float64' for name in feature_cols}
cnt_features = [name for name in feature_cols if is_float64[name]]
cat_features = [name for name in feature_cols if not is_float64[name]]

# Downcast both tables the same way to shrink their memory footprint.
# NOTE(review): the uint8 cast assumes every non-float column only holds
# values in 0..255 -- confirm against the data.
for frame in (train, test):
    frame[cnt_features] = frame[cnt_features].astype('float32')
    frame[cat_features] = frame[cat_features].astype('uint8')

# Footprint of the train table after the downcast, same unit as start_mem.
memory_usage = train.memory_usage(deep=True) / 1024 ** 2
end_mem = memory_usage.sum()

# Everything from here on works with pandas frames.
train, test = train.to_pandas(), test.to_pandas()

In [None]:
# Relative saving of the dtype downcast, in percent of the original footprint.
reduction_pct = 100 * (start_mem - end_mem) / start_mem
print("Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.2f}% reduction)".format(start_mem, end_mem, reduction_pct))

# KMeans

In [None]:
%%time
# Columns that feed both the clustering here and the PCA further down.
useful_features = ["f22", "f179", "f69", "f58", "f214", "f78", "f136", "f156", "f8", "f3", "f77", "f200", "f92", "f185", "f142", "f115", "f284"]
n_clusters = 6
cd_feature = True  # True: one distance column per centroid; False: one-hot cluster id
cluster_cols = [f"cluster{i+1}" for i in range(n_clusters)]
kmeans = KMeans(n_clusters=n_clusters, n_init=50, max_iter=500, random_state=42)

# In both branches the model is fitted on train only and then applied to test.
if not cd_feature:
    # Hard assignment: store the cluster id of every row ...
    train["cluster"] = kmeans.fit_predict(train[useful_features])
    test["cluster"] = kmeans.predict(test[useful_features])

    # ... and expand it into one indicator column per cluster.
    ohe = OneHotEncoder()
    X_ohe = ohe.fit_transform(train["cluster"].to_numpy().reshape(-1, 1)).toarray()
    T_ohe = ohe.transform(test["cluster"].to_numpy().reshape(-1, 1)).toarray()

    X_ohe = pd.DataFrame(X_ohe, columns=cluster_cols, index=train.index)
    T_ohe = pd.DataFrame(T_ohe, columns=cluster_cols, index=test.index)

    train = pd.concat([train, X_ohe], axis=1)
    test = pd.concat([test, T_ohe], axis=1)

else:
    # Soft assignment: one column per centroid in cluster-distance space.
    X_cd = pd.DataFrame(
        kmeans.fit_transform(train[useful_features]),
        columns=cluster_cols,
        index=train.index,
    )
    train = train.join(X_cd)

    X_cd = pd.DataFrame(
        kmeans.transform(test[useful_features]),
        columns=cluster_cols,
        index=test.index,
    )
    test = test.join(X_cd)

# Either way the cluster columns become model inputs.
feature_cols.extend(cluster_cols)
train.head()

In [None]:
fig = plt.figure(figsize=(10, 5))

if not cd_feature:
    # Hard cluster ids: rows per cluster, split by target, with each bar
    # annotated with its height.
    ax = sns.countplot(data=train, x='cluster', hue="target")
    for p in ax.patches:
        anchor = (p.get_x() + 0.2, p.get_height())
        ax.annotate(f'\n{p.get_height()}', anchor, ha='center', va='top', color='white', size=5)
else:
    # Distance features: one density curve per cluster column.
    sns.kdeplot(data=train[cluster_cols])

plt.show()

# PCA

In [None]:
# Fit the decomposition on train only, then project test with the same
# components.
pca = PCA()
X_pca = pca.fit_transform(train[useful_features])
T_pca = pca.transform(test[useful_features])

# One label per projected column: PC1, PC2, ...
pca_cols = [f"PC{i+1}" for i in range(X_pca.shape[1])]

# Wrap the raw arrays in frames aligned on the source index so the
# column-wise concat below lines rows up correctly.
X_pca, T_pca = (
    pd.DataFrame(projected, columns=pca_cols, index=source.index)
    for projected, source in ((X_pca, train), (T_pca, test))
)

train = pd.concat([train, X_pca], axis=1)
test = pd.concat([test, T_pca], axis=1)
train.head()

In [None]:
# Rows are principal components, columns are the original features that went
# into the PCA; each cell is that feature's weight in the component.
loadings = pd.DataFrame(pca.components_, index=pca_cols, columns=useful_features)
# Render the table with in-cell bars centred on zero (two colours for the
# two signs).
loadings.style.bar(align='mid', color=['#d65f5f', '#5fba7d'])

In [None]:
# Only these two principal components are promoted to model inputs.
# NOTE(review): presumably picked from the loadings table above -- confirm.
feature_cols.extend(["PC11", "PC12"])

# Add new features

In [None]:
def add_feature(df):
    """Add the two hand-crafted columns ``new_f1`` and ``new_f2`` to ``df``.

    * ``new_f1`` is the product ``f255 * f249``.
    * ``new_f2`` is ``(cluster1 + cluster3) / (cluster2 + cluster4)``.

    A zero denominator in ``new_f2`` would produce ``+/-inf``; those values
    are replaced by ``NaN`` so the column never carries infinities into the
    model (this can happen when the cluster columns are 0/1 indicators).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``f255``, ``f249`` and ``cluster1`` .. ``cluster4``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    df["new_f1"] = df["f255"] * df["f249"]

    ratio = (df["cluster1"] + df["cluster3"]) / (df["cluster2"] + df["cluster4"])
    # x / 0 gives +/-inf in pandas rather than raising; turn it into NaN.
    df["new_f2"] = ratio.replace([np.inf, -np.inf], np.nan)
    return df

# Names of the columns add_feature creates; they are appended to the model
# inputs once both tables carry them.
new_features = ["new_f1", "new_f2"]
train, test = add_feature(train), add_feature(test)
feature_cols.extend(new_features)
train.head()

# Mutual Information

In [None]:
%%time
# Score the features on the first 5000 training rows only.
# NOTE(review): presumably a subsample to keep the run time down -- confirm.
mi_sample = train.iloc[:5000]
x = mi_sample[feature_cols].copy()
y = mi_sample['target'].copy()

# Integer-typed columns (the uint8 categoricals created during the dtype
# downcast) are flagged as discrete; float columns are treated as continuous.
discrete_mask = np.array([pd.api.types.is_integer_dtype(dtype) for dtype in x.dtypes])

# 'target' is a class label (it is modelled with a classifier and stratified
# on), so the classification estimator is the appropriate one here. The
# estimator is randomised, so it is seeded to make the ranking reproducible.
mi_scores = mutual_info_classif(x, y, discrete_features=discrete_mask, random_state=42)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
# mi_scores is sorted in descending order, so its first `top` entries are the
# strongest ones.
top = 20
strongest = mi_scores.iloc[:top]

fig = px.bar(mi_scores, x=strongest.values, y=strongest.index)
fig.update_layout(
    title=f"Top {top} Strong Relationships Between Feature Columns and Target Column",
    xaxis_title="Relationship with Target",
    yaxis_title="Feature Columns",
    # Order the bars by their value rather than by input order.
    yaxis={'categoryorder':'total ascending'},
    colorway=["blue"],
)
fig.show()

# KFold

In [None]:
folds = 5
# -1 marks "not yet assigned"; every row receives a fold id in the loop below.
train["kfold"] = -1
kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

# StratifiedKFold.split yields *positional* row indices, while the frame is
# indexed by the id column read from the CSV. Writing through .iloc keeps the
# assignment correct whatever the index labels are; label-based .loc only
# works when the labels happen to be exactly 0..n-1.
kfold_pos = train.columns.get_loc("kfold")
for fold, (_, valid_indices) in enumerate(kf.split(train, train["target"])):
    train.iloc[valid_indices, kfold_pos] = fold

# XGBoost

In [None]:
%%time
final_test_predictions = []  # one array of test probabilities per fold
scores = []                  # validation AUC per fold

# The hyper-parameters are identical for every fold, so the dict is built once
# instead of once per iteration.
xgb_params = {
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
    'n_estimators': 10000,
    'learning_rate': 0.01063045229441343,
    'gamma': 0.24652519525750877,
    'max_depth': 4,
    'seed': 42,
    'min_child_weight': 366,
    'subsample': 0.6423040816299684,
    'colsample_bytree': 0.7751264493218339,
    'colsample_bylevel': 0.8675692743597421,
    'use_label_encoder': False,
    'lambda': 0,
    'alpha': 10
}

# The test matrix does not depend on the fold either; copy it a single time.
x_test = test[feature_cols].copy()

for fold in range(folds):
    # Rows tagged with the current fold id are held out, the rest are fitted on.
    # Selecting rows and columns in one step avoids copying the full frame.
    is_valid = train["kfold"] == fold

    x_train = train.loc[~is_valid, feature_cols]
    y_train = train.loc[~is_valid, 'target']
    x_valid = train.loc[is_valid, feature_cols]
    y_valid = train.loc[is_valid, 'target']

    # NOTE(review): eval_set is supplied but no early-stopping argument is, so
    # presumably all n_estimators rounds are fitted -- confirm this is intended.
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)

    # AUC on the fitted rows next to AUC on the held-out rows shows how far
    # the two diverge.
    preds_train = xgb_model.predict_proba(x_train)[:,1]
    preds_valid = xgb_model.predict_proba(x_valid)[:,1]
    auc_train = roc_auc_score(y_train, preds_train)
    auc = roc_auc_score(y_valid, preds_valid)
    print("Fold",fold,", train:", f"{auc_train:.6f}", ", valid:", f"{auc:.6f}")
    scores.append(auc)

    # Positive-class probabilities for the test set from this fold's model.
    preds_test = xgb_model.predict_proba(x_test)[:,1]
    final_test_predictions.append(preds_test)


# After the loop, xgb_model and x_valid stay bound to the last fold.
print("AVG AUC:",np.mean(scores))

# SHAP Values

In [None]:
# xgb_model and x_valid are whatever the cross-validation loop left bound,
# i.e. the model and hold-out rows of its final iteration.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(x_valid)
shap.summary_plot(shap_values, x_valid)

In [None]:
# Explain a single row: the one at position `idx` of x_valid (x_valid and
# xgb_model are whatever the cross-validation loop left bound).
idx = 5
data_for_prediction = x_valid.iloc[idx]
# Turn the row into a 2-D array with a single row, the shape used for the
# predict_proba and shap_values calls below.
data_for_prediction_array = data_for_prediction.values.reshape(1, -1)


# Class probabilities the model assigns to this row.
print(xgb_model.predict_proba(data_for_prediction_array))

# initjs is called before the force plot is built.
# NOTE(review): presumably required so the interactive plot can render -- confirm.
shap.initjs()
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(data_for_prediction_array)
# Final expression of the cell, so the notebook displays the returned plot.
shap.force_plot(explainer.expected_value, shap_values, data_for_prediction)

# Plot Prediction

In [None]:
# One density curve per fold model; histogram bars and rug marks are
# switched off so only the curves remain.
labels = [f'fold {i}' for i in range(folds)]

fig = ff.create_distplot(
    hist_data=final_test_predictions,
    group_labels=labels,
    bin_size=0.3,
    show_hist=False,
    show_rug=False,
)
fig.show()

In [None]:
# Stack the per-fold prediction vectors as columns and average across them,
# giving one blended probability per test row.
fold_matrix = np.column_stack(final_test_predictions)
sample_submission['target'] = fold_matrix.mean(axis=1)
sample_submission.to_csv("submission.csv", index=False)