In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
from xgboost import XGBClassifier
import datatable as dt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_regression

%matplotlib inline
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import tensorflow as tf
from tensorflow.keras import layers

import shap
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))
import gc

In [None]:
%%time
# Read the three competition CSVs in order: train, test, sample submission.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-oct-2021/train.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/test.csv',
        '/kaggle/input/tabular-playground-series-oct-2021/sample_submission.csv',
    )
)

# Per-column footprint of the training frame in MB; the total is kept as the
# baseline for the memory-reduction reports printed further down.
memory_usage = train.memory_usage(deep=True) / 1024 ** 2
start_mem = memory_usage.sum()

In [None]:
%%time
# Model inputs: every test column except the row identifier. The identifier
# carries no predictive signal, and non-float feature columns are cast to uint8
# later in the notebook, which would wrap it modulo 256 and feed it to the model.
# NOTE(review): assumes the identifier column is named "id" — confirm against the CSV.
feature_cols = [col for col in test.columns if col != "id"]
useful_features = ["f22","f179","f69","f156","f58","f136","f214"]
n_clusters_1 = 12
cd_feature = True # cluster distance instead of cluster number
# New columns continue the f<N> naming scheme starting at f285.
cluster_cols = [f"f{i+285}" for i in range(n_clusters_1)]
kmeans = KMeans(n_clusters=n_clusters_1, init="k-means++", max_iter=500, random_state=42)

if cd_feature:
    # train: fit the clustering and keep one column per cluster from fit_transform
    X_cd = kmeans.fit_transform(train[useful_features])
    X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=train.index)
    train = train.join(X_cd)
    # test: reuse the clustering fitted on train
    X_cd = kmeans.transform(test[useful_features])
    X_cd = pd.DataFrame(X_cd, columns=cluster_cols, index=test.index)
    test = test.join(X_cd)

else:
    # train: fit the clustering and store the assigned cluster number
    train["cluster"] = kmeans.fit_predict(train[useful_features])
    # test: assign clusters with the model fitted on train
    test["cluster"] = kmeans.predict(test[useful_features])

    # one-hot encode the cluster number (encoder fitted on train only)
    ohe = OneHotEncoder()
    X_ohe = ohe.fit_transform(np.array(train["cluster"]).reshape(-1,1)).toarray()
    T_ohe = ohe.transform(np.array(test["cluster"]).reshape(-1,1)).toarray()

    X_ohe = pd.DataFrame(X_ohe, columns=cluster_cols, index=train.index)
    T_ohe = pd.DataFrame(T_ohe, columns=cluster_cols, index=test.index)

    train = pd.concat([train, X_ohe],axis=1)
    test = pd.concat([test, T_ohe],axis=1)

# Either branch adds the same column names, so register them once here.
feature_cols += cluster_cols
train.head()

In [None]:
#fig = plt.figure(figsize = (10,5))

#if cd_feature:
#    sns.kdeplot(data=train[cluster_cols])
#else:
#    ax = sns.countplot(data=train, x='cluster', hue="target")
#    for p in ax.patches:
#        ax.annotate(f'\n{p.get_height()}', (p.get_x()+0.2, p.get_height()), ha='center', va='top', color='white', size=5)

#plt.show()

In [None]:
def add_feature(df):
    """Append seven ratio columns (f297 through f303) to ``df`` and return it.

    Each new column is the element-wise quotient of two existing columns in
    the f285-f294 range. The frame is modified in place and the same object
    is returned.
    """
    # (new column, numerator column, denominator column), in creation order.
    ratio_specs = (
        ("f297", "f289", "f294"),
        ("f298", "f285", "f289"),
        ("f299", "f289", "f290"),
        ("f300", "f290", "f291"),
        ("f301", "f285", "f287"),
        ("f302", "f292", "f293"),
        ("f303", "f285", "f291"),
    )
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / df[denominator]
    return df

# Names of the ratio columns produced by add_feature (f297 through f303).
new_features = [f"f{idx}" for idx in range(297, 304)]
# Add the ratio columns to both frames, then register them as model inputs.
train, test = add_feature(train), add_feature(test)
feature_cols.extend(new_features)
train.head()

In [None]:
#%%time
#x = train.iloc[:5000,:][feature_cols].copy()
#y = train.iloc[:5000,:]['target'].copy()
#mi_scores = mutual_info_regression(x, y)
#mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
#mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
#top = 20
#fig = px.bar(mi_scores, x=mi_scores.values[:top], y=mi_scores.index[:top])
#fig.update_layout(
#    title=f"Top {top} Strong Relationships Between Feature Columns and Target Column",
#    xaxis_title="Relationship with Target",
#    yaxis_title="Feature Columns",
#    yaxis={'categoryorder':'total ascending'},
#    colorway=["blue"]
#)
#fig.show()

In [None]:
#from sklearn.preprocessing import RobustScaler
#scaler=RobustScaler()
#scaler.fit(train.iloc[:,train.columns!='target'])
#scaler.fit(test)
#print(train)
#print(test)

In [None]:
# Split the model inputs by storage dtype: float64 columns are treated as
# continuous, every other column as categorical.
cnt_features = [col for col in feature_cols if train[col].dtype == 'float64']
cat_features = [col for col in feature_cols if train[col].dtype != 'float64']

# Downcast both frames: continuous columns to float32, the rest to uint8.
# NOTE(review): a non-float column holding values above 255 would wrap under
# uint8 — confirm feature_cols contains only small-integer flags here.
for frame in (train, test):
    frame[cnt_features] = frame[cnt_features].astype('float32')
    frame[cat_features] = frame[cat_features].astype('uint8')

# Training-frame footprint in MB after the downcast.
memory_usage = train.memory_usage(deep=True) / 1024 ** 2
end_mem = memory_usage.sum()

In [None]:
# Report how much the dtype downcast shrank the training frame.
reduction_pct = 100 * (start_mem - end_mem) / start_mem
print("Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.2f}% reduction)".format(start_mem, end_mem, reduction_pct))

In [None]:
%%time
# Discretise every continuous column into `bins` intervals: equal-width steps of
# 1/bins between 0 and 1, with open-ended outer bins catching anything outside.
bins = 128
# Interior edges 1/128 ... 127/128; exact in binary floating point because 128
# is a power of two, so they match edges built by repeated addition of 1/128.
bins_list = [-np.inf] + [edge / bins for edge in range(1, bins)] + [np.inf]
labels = list(range(bins))


def _bin_column(series):
    """Map a numeric column to its bin label (0 .. bins-1) using the shared edges."""
    return pd.cut(series, bins=bins_list, labels=labels)


for col in cnt_features:
    train[col] = _bin_column(train[col])
    test[col] = _bin_column(test[col])

train.head()

In [None]:
# Report the change in the training frame's footprint after binning,
# relative to the post-downcast size stored in end_mem.
memory_usage = train.memory_usage(deep=True) / 1024 ** 2
binned_mem = memory_usage.sum()
print("Mem. usage decreased from {:.2f} MB to {:.2f} MB ({:.2f}% reduction)".format(end_mem, binned_mem, 100 * (end_mem - binned_mem) / end_mem))

In [None]:
# Store every model input column as uint8 in both frames (the binned columns
# carry integer labels 0..127 at this point).
for frame in (train, test):
    frame[feature_cols] = frame[feature_cols].astype('uint8')

In [None]:
# NumPy views of the training data: one array per input branch, plus the labels.
x1, x2 = train[cnt_features].values, train[cat_features].values
y = train['target'].values

In [None]:
def get_model(n_continuous=None, n_categories=None):
    """Build the two-branch dense binary classifier (uncompiled).

    The "continuous" and "categories" inputs each pass through three
    128-unit ReLU layers. The branches are concatenated, passed through
    64/128/256-unit ReLU layers, and finish in a single sigmoid unit.

    Args:
        n_continuous: Width of the "continuous" input. Defaults to the
            column count of the notebook-level array ``x1``.
        n_categories: Width of the "categories" input. Defaults to the
            column count of the notebook-level array ``x2``.

    Returns:
        A ``tf.keras.Model`` taking ``[continuous, categories]`` inputs.
    """
    AF = "relu"
    if n_continuous is None:
        n_continuous = x1.shape[-1]
    if n_categories is None:
        n_categories = x2.shape[-1]

    # Keras documents `shape` as a tuple. `(n)` without a trailing comma is a
    # bare int, so both inputs are given an explicit one-element tuple.
    input_1 = layers.Input(shape=(n_continuous,), name="continuous")
    #x_1 = layers.Embedding(input_dim=bins, output_dim=4)(input_1)
    #x_1 = layers.TimeDistributed(layers.Dense(64, activation=AF))(x_1)
    #x_1 = layers.TimeDistributed(layers.Dense(64, activation=AF))(x_1)
    #x_1 = layers.Flatten()(x_1)
    x_1 = input_1
    for _ in range(3):
        x_1 = layers.Dense(128, activation=AF)(x_1)

    input_2 = layers.Input(shape=(n_categories,), name="categories")
    x_2 = input_2
    for _ in range(3):
        x_2 = layers.Dense(128, activation=AF)(x_2)

    # Merge the two branches and widen towards the output.
    x = layers.Concatenate()([x_1, x_2])
    for units in (64, 128, 256):
        x = layers.Dense(units, activation=AF)(x)
    output = layers.Dense(1, activation="sigmoid", name="output")(x)

    model = tf.keras.Model([input_1, input_2], output)
    return model


# Build the network and compile it for binary classification. The "AUC" metric
# is what the training callbacks monitor as "val_auc".
model = get_model()
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["AUC"])

model.summary()

In [None]:
# Render the layer graph with tensor shapes annotated on each node.
tf.keras.utils.plot_model(model=model, show_shapes=True)

In [None]:
# Stop once validation AUC has failed to improve for 3 epochs and roll the
# model back to its best weights.
cb_es = tf.keras.callbacks.EarlyStopping(monitor="val_auc", patience=3, mode="max", restore_best_weights=True, verbose=1)
# Halve the learning rate after 2 epochs without a validation-AUC improvement.
# AUC is higher-is-better, so the direction is stated explicitly: the default
# mode="auto" does not reliably infer "max" from the name "val_auc", and would
# otherwise cut the learning rate while the metric is still improving.
cb_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=2, min_lr=0.000001, verbose=1)

# validation_split holds out a fraction of the supplied samples for validation.
history = model.fit((x1,x2), 
                    y, 
                    epochs=20, 
                    validation_split=0.2, 
                    batch_size=512, 
                    validation_batch_size=512,
                    callbacks=[cb_es, cb_lr])

In [None]:
# Score the test set, feeding the two input branches in the same order used for training.
test_inputs = (test[cnt_features].values, test[cat_features].values)
preds = model.predict(test_inputs)

In [None]:
# Histogram (with KDE overlay) of the predicted probabilities for the test set.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(x=preds.reshape(-1), kde=True, color="blue", ax=ax)
ax.set_title("Predictions Distribution")
ax.set_xlabel("Prediction")
plt.show()

In [None]:
# Flatten the (n, 1) prediction array into the submission's target column and
# write the file without the DataFrame index.
flat_preds = np.squeeze(preds)
sample_submission['target'] = flat_preds
sample_submission.to_csv("submission.csv", index=False)