In [None]:
import os
import io
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import cudf

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

import tensorflow as tf
from tensorflow.keras import layers

from sklearn.preprocessing import MinMaxScaler

# Load Data

In [None]:
%%time
train = pd.read_csv('../input/tabular-playground-series-nov-2021/train.csv').set_index("id")
test = pd.read_csv('../input/tabular-playground-series-nov-2021/test.csv').set_index("id")
sample_submission = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")

feature_cols = test.columns.tolist()

# Scale Data

In [None]:
# Learn the per-feature min/max from the training frame only, then apply the
# same fitted scaler to both the training and the test features.
sc = MinMaxScaler()
sc.fit(train[feature_cols])
train[feature_cols] = sc.transform(train[feature_cols])
test[feature_cols] = sc.transform(test[feature_cols])

# Plot Features

In [None]:
# Grid of 30-bin histograms for the columns of `train`, with small tick
# labels so the many subplots stay readable.
train.hist(
    bins=30,
    figsize=(20, 15),
    grid=False,
    xlabelsize=5,
    ylabelsize=5,
)
plt.show()

In [None]:
# Hand-picked feature names that make up the first group; these are the
# columns that get binned and fed to the embedding branch of the model.
dist1 = [
    'f1', 'f3', 'f5', 'f6', 'f7', 'f8', 'f10', 'f11', 'f13', 'f14',
    'f15', 'f17', 'f18', 'f21', 'f22', 'f25', 'f26', 'f29', 'f34', 'f36',
    'f37', 'f38', 'f40', 'f41', 'f43', 'f45', 'f46', 'f50', 'f54', 'f55',
    'f57', 'f75', 'f76', 'f77', 'f80', 'f82', 'f85', 'f86', 'f91', 'f96',
    'f97',
]

# Second group: every remaining feature column, in its original order.
dist2 = [name for name in feature_cols if name not in dist1]

# Categorize Data

In [None]:
%%time
bins = 128
n = 0
bins_list = []

bins_list.append(-np.inf)
for i in range(1,bins):
    n += 1./bins
    bins_list.append(n)
bins_list.append(np.inf)

labels = [i for i in range(bins)]
for col in dist1:
    train[col] = pd.cut(train[col], bins=bins_list, labels=labels)
    test[col] = pd.cut(test[col], bins=bins_list, labels=labels)
    
train.head()

In [None]:
# Convert the binned `dist1` columns of both frames to the 'uint8' dtype.
for frame in (train, test):
    frame[dist1] = frame[dist1].astype('uint8')

In [None]:
# Training arrays: the two feature groups (one per model input) and the target.
x1 = train[dist1].to_numpy()
x2 = train[dist2].to_numpy()
y = train['target'].to_numpy()

# Neural Network Model

In [None]:
def get_model():
    """Build the two-input binary classification network.

    The input widths are read from the notebook-level arrays ``x1`` and
    ``x2`` and the embedding vocabulary size from ``bins``.

    Returns:
        An uncompiled ``tf.keras.Model`` that takes ``[input_1, input_2]``
        and produces a single sigmoid output named ``"output"``.
    """
    AF = "relu"

    # NOTE(review): the input named "continuous" feeds the Embedding layer
    # (integer bin ids from x1) while "categories" receives x2 directly.
    # The names are kept unchanged; confirm they are intended.

    # Branch 1: bin ids -> 4-dimensional embedding per feature -> dense layers
    # applied per feature -> flattened -> dense layers.
    # `shape` is passed as a 1-tuple (note the trailing comma): a bare int is
    # not a valid shape in every Keras version.
    input_1 = layers.Input(shape=(x1.shape[-1],), name="continuous")
    x_1 = layers.Embedding(input_dim=bins, output_dim=4)(input_1)
    for _ in range(2):
        x_1 = layers.TimeDistributed(layers.Dense(64, activation=AF))(x_1)
    x_1 = layers.Flatten()(x_1)
    for _ in range(2):
        x_1 = layers.Dense(128, activation=AF)(x_1)

    # Branch 2: the x2 features go straight through two dense layers.
    input_2 = layers.Input(shape=(x2.shape[-1],), name="categories")
    x_2 = input_2
    for _ in range(2):
        x_2 = layers.Dense(128, activation=AF)(x_2)

    # Head: concatenate both branches and reduce to one probability.
    x = layers.Concatenate()([x_1, x_2])
    x = layers.Dense(64, activation=AF)(x)
    x = layers.Dense(128, activation=AF)(x)
    output = layers.Dense(1, activation="sigmoid", name="output")(x)

    return tf.keras.Model([input_1, input_2], output)


model = get_model()

# The AUC metric is named explicitly so its history keys are always "auc" and
# "val_auc". With the bare string "AUC" Keras picks the name itself and may
# add a numeric suffix (e.g. "auc_1") when the model is rebuilt in the same
# session, which would break callbacks that monitor "val_auc".
model.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=[tf.keras.metrics.AUC(name="auc")],
)

tf.keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Stop once validation AUC has not improved for 4 epochs and roll back to the
# best weights seen.
cb_es = tf.keras.callbacks.EarlyStopping(
    monitor="val_auc",
    mode="max",
    patience=4,
    restore_best_weights=True,
    verbose=1,
)
# Halve the learning rate after 2 epochs without validation-AUC improvement,
# never going below 0.0001.
cb_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc",
    mode="max",
    factor=0.5,
    patience=2,
    min_lr=0.0001,
    verbose=1,
)

# Train for up to 40 epochs, with 20% of the data held out for validation.
history = model.fit(
    (x1, x2),
    y,
    epochs=40,
    batch_size=512,
    validation_split=0.2,
    validation_batch_size=512,
    callbacks=[cb_es, cb_lr],
)

# Embeddings Projection

 You can upload these two files (`vecs.tsv` and `meta.tsv`) to http://projector.tensorflow.org/ to visualize the embedding layer.

In [None]:
# Find the embedding layer by type instead of relying on its position in
# model.layers, which changes if the architecture is edited.
e = next(layer for layer in model.layers if isinstance(layer, layers.Embedding))
weights = e.get_weights()[0]
print(weights.shape)

# One human-readable label per bin: "<bin id> (<lower edge>-<upper edge>)".
words = [f"{i} ({np.round(bins_list[i],3)}-{np.round(bins_list[i+1],3)})" for i in labels]

# vecs.tsv receives one tab-separated embedding vector per line and meta.tsv
# the matching label on the same line number. Previously the two were
# written the other way round. The `with` block closes both files even if a
# write fails.
with open('vecs.tsv', 'w', encoding='utf-8') as vecs, \
        open('meta.tsv', 'w', encoding='utf-8') as meta:
    for label, vector in zip(words, weights):
        vecs.write('\t'.join(str(x) for x in vector) + "\n")
        meta.write(label + "\n")

# Predict

In [None]:
# Score the test set, passing the two feature groups in the same order as
# the model's inputs.
preds = model.predict(
    (
        test[dist1].to_numpy(),
        test[dist2].to_numpy(),
    )
)

# Plot Predictions

In [None]:
# Histogram (with KDE overlay) of the predicted values on the test set.
fig, ax = plt.subplots(figsize=(15, 8))
sns.histplot(x=preds.ravel(), kde=True, color="blue", ax=ax)
ax.set_title("Predictions Distribution")
ax.set_xlabel("Prediction")
plt.show()

# Submission

In [None]:
# Drop the singleton dimension of the predictions, store them as the
# "target" column and write the submission file without the index.
sample_submission['target'] = preds.squeeze()
sample_submission.to_csv("submission.csv", index=False)