In [None]:
import os
import gc
import random

# TensorFlow's native (C++) logger picks up TF_CPP_MIN_LOG_LEVEL when it first
# emits a message, which normally happens while `import tensorflow` runs.
# Setting the variable before that import is the only reliable way to silence
# the start-up log output ('3' keeps only fatal messages).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'

from IPython.display import display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow_addons.optimizers import AdamW
import tensorflow.keras.layers as L
import tensorflow.keras.backend as K
from sklearn.svm import SVC

In [None]:
# Run-wide configuration constants.
# SEED: value handed to seedAll() to seed the random number generators.
SEED = 42
# BATCH_SIZE: mini-batch size passed to both Keras fit() calls in the CV loop.
BATCH_SIZE = 128
# EPOCHS: upper bound on training epochs for each fit() call (early stopping
# callbacks may end training sooner).
EPOCHS = 100
# N_FOLDS: number of StratifiedKFold splits used for cross-validation.
N_FOLDS = 5

In [None]:
def seedAll(seed):
    """Seed the random number generators used by this notebook.

    The same value is written to the ``PYTHONHASHSEED`` environment variable
    and used to seed Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator.

    Args:
        seed: integer seed value.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


seedAll(SEED)

In [None]:
# Load the competition's training and test tables from the Kaggle input directory.
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv")

In [None]:
# Report the number of rows in each split.
print("Train Size: ", len(train))
print("Test Size: ", len(test))

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
def buildEncoder():
    """Create the encoder half of the autoencoder.

    Maps a 107-dimensional input to a 16-dimensional code through ReLU Dense
    layers of width 64, 48 and 32 (each followed by batch normalization) and
    a final 16-unit ReLU Dense layer.

    Returns:
        tf.keras.Model: uncompiled encoder model.
    """
    features_in = L.Input((107,))

    x = features_in
    for width in (64, 48, 32):
        x = L.Dense(width, activation="relu")(x)
        x = L.BatchNormalization()(x)
    code = L.Dense(16, activation="relu")(x)

    return tf.keras.Model(inputs=features_in, outputs=code)

def buildDecoder():
    """Create the decoder half of the autoencoder.

    Expands a 16-dimensional code back to 107 values through ReLU Dense
    layers of width 32, 48 and 64, ending in a 107-unit Dense layer with no
    activation.

    Returns:
        tf.keras.Model: uncompiled decoder model.
    """
    code_in = L.Input((16,))

    x = code_in
    for width in (32, 48, 64):
        x = L.Dense(width, activation="relu")(x)
    reconstruction = L.Dense(107)(x)

    return tf.keras.Model(inputs=code_in, outputs=reconstruction)

def buildAutoencoder():
    """Assemble and compile the encoder -> decoder reconstruction model.

    A fresh encoder and decoder are built and chained, so the resulting model
    maps an input vector to its reconstruction. It is compiled with mean
    squared error loss and the Adam optimizer at a learning rate of 0.001.

    Returns:
        tf.keras.Model: compiled autoencoder. The encoder sub-model is the
        first layer after the input layer, which is where buildModel() looks
        for it (``layers[1]``).
    """
    encoder = buildEncoder()
    decoder = buildDecoder()

    # Take the feature dimension from the encoder instead of repeating the
    # literal, so the two definitions cannot drift apart.
    inp = L.Input(encoder.input_shape[1:])
    enc = encoder(inp)
    dec = decoder(enc)

    model = tf.keras.Model(inputs = inp,outputs = dec)
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras releases no longer accept.
    model.compile(loss = "mse",optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001))
    return model
    
def buildModel(autoencoder):
    """Build the binary classifier on top of a (pre-trained) autoencoder's encoder.

    Args:
        autoencoder: model whose second layer (``layers[1]``) is the encoder
            sub-model, as produced by buildAutoencoder().

    Returns:
        tf.keras.Model: compiled classifier that feeds the encoder output
        through batch normalization, an 8-unit ReLU Dense layer and a single
        sigmoid unit. Compiled with binary cross-entropy, the "adam"
        optimizer and an accuracy metric.
    """
    # The encoder object itself is reused (not copied), so its weights are
    # shared with `autoencoder`; nothing here freezes them.
    encoder = autoencoder.layers[1]

    # Match the autoencoder's feature dimension rather than hard-coding it.
    inp = L.Input(autoencoder.input_shape[1:])
    enc = encoder(inp)
    h = L.BatchNormalization()(enc)
    h = L.Dense(8,activation="relu")(h)
    h = L.Dense(1,activation="sigmoid")(h)

    model = tf.keras.Model(inputs = inp,outputs = h)
    model.compile(loss="binary_crossentropy",optimizer = "adam",metrics = ["accuracy"])
    return model

In [None]:
# Row-wise summary statistics, computed separately over the highly skewed
# feature columns (skew >= 2) and the remaining ones (skew < 2).
ENGINEERED_COLS = ["median_h", "median_l", "std_h", "std_l", "mean_l", "max_l", "var_l"]

# Leave out the engineered columns as well as id/target: on a second run of
# this cell they already exist in `train` and would otherwise leak into the
# skew-based column split and change the result.
non_feature_cols = {"id", "target", *ENGINEERED_COLS}
raw_cols = [col for col in train.columns if col not in non_feature_cols]

# Skewness is measured once, on the training data only, and the resulting
# column groups are applied to both splits.
skewness = train[raw_cols].skew()
h_skew = skewness[skewness >= 2].index
l_skew = skewness[skewness < 2].index

# Columns are added in the same order to both frames so that train and test
# keep identical feature layouts.
for frame in (train, test):
    frame['median_h'] = frame[h_skew].median(axis=1)
    frame['median_l'] = frame[l_skew].median(axis=1)
    frame['std_h'] = frame[h_skew].std(axis=1)
    frame['std_l'] = frame[l_skew].std(axis=1)
    frame['mean_l'] = frame[l_skew].mean(axis=1)
    frame['max_l'] = frame[l_skew].max(axis=1)
    frame['var_l'] = frame[l_skew].var(axis=1)

In [None]:
# Separate the label from the predictors, then standardize both splits using
# statistics fitted on the training predictors.
Y = train["target"]
sc = StandardScaler()
X = sc.fit_transform(train.drop(columns=["id", "target"]))
X_test = sc.transform(test.drop(columns=["id"]))

In [None]:
# Stratified K-fold cross-validation. For every fold:
#   1. train an autoencoder to reconstruct the scaled features,
#   2. build a classifier on its encoder and train it on the labels,
#   3. score the validation split (ROC AUC) and predict the test set.
# `scores` collects one AUC per fold; `preds` collects one test prediction
# array per fold (kept exactly as returned by predict()).
skf = StratifiedKFold(n_splits = N_FOLDS)
scores = []
preds = []
for fold,(trn,val) in enumerate(skf.split(X,Y)):
    print("[INFO] Training Fold ",fold)

    # ---- Stage 1: autoencoder (input is also the target) ----
    model =  buildAutoencoder()
    if fold==0:
        display(tf.keras.utils.plot_model(model,show_shapes=True))
    rlr = tf.keras.callbacks.ReduceLROnPlateau(patience = 6,monitor="val_loss",mode="min",factor = 0.75)
    es = tf.keras.callbacks.EarlyStopping(monitor="val_loss",mode="min",patience = 10,restore_best_weights=True)

    print("\n[INFO] Training Autoencoder")
    history = model.fit(X[trn],X[trn],
                        epochs = EPOCHS,
                        validation_data = (X[val],X[val]),
                        batch_size = BATCH_SIZE,
                        callbacks = [rlr,es],
                        verbose = 0
                       )
    print("[INFO] Autoencoder best val_loss in epoch {}: ".format(np.argmin(history.history["val_loss"])),min(history.history["val_loss"]))

    # ---- Stage 2: classifier on top of the trained encoder ----
    print("[INFO] Training Model")
    ckpt_path = f"model_{fold}.hdf5"
    rlr = tf.keras.callbacks.ReduceLROnPlateau(patience = 5,monitor="val_loss",mode="min",factor = 0.9,verbose=1)
    es = tf.keras.callbacks.EarlyStopping(monitor="val_loss",mode="min",patience = 20,restore_best_weights=True)
    ckpt = tf.keras.callbacks.ModelCheckpoint(ckpt_path,monitor="val_loss",mode="min",save_best_only=True,save_weights_only=True)

    finModel = buildModel(model)
    if fold==0:
        finModel.summary()
    history = finModel.fit(X[trn],Y[trn],
                           epochs = EPOCHS,
                           validation_data = (X[val],Y[val]),
                           batch_size = BATCH_SIZE,
                           callbacks = [rlr,es,ckpt]
                          )

    # Make sure scoring and test predictions use the best-val_loss weights.
    # NOTE(review): depending on the Keras version, EarlyStopping may restore
    # the best weights only when it actually halts training, not when all
    # EPOCHS are consumed. The checkpoint tracks the same criterion, so
    # reloading it is equivalent when early stopping fired and corrective
    # when it did not.
    if os.path.exists(ckpt_path):
        finModel.load_weights(ckpt_path)

    # The model ends in a single sigmoid unit; flatten its output so the
    # metric receives a plain 1-D score vector.
    val_pred = finModel.predict(X[val])
    score = roc_auc_score(Y[val],np.ravel(val_pred))
    scores.append(score)
    preds.append(finModel.predict(X_test))
    print(f"[INFO] Fold {fold} roc auc: ",score)

    # Training curve of the classifier for this fold.
    fig, ax = plt.subplots()
    sns.lineplot(x=range(len(history.history["loss"])),y = history.history["loss"], color='green',label="train",ax=ax)
    sns.lineplot(x=range(len(history.history["val_loss"])),y = history.history["val_loss"], color='red',label="val",ax=ax)
    ax.set_title("Training Curve")
    ax.set_xlabel("epoch")
    ax.set_ylabel("loss")
    plt.show()
    plt.close(fig)  # avoid accumulating open figures across folds
    print("")

    # Release this fold's models and graph state before the next fold.
    del model,finModel,history,val_pred
    _ = gc.collect()
    K.clear_session()

In [None]:
# Mean of the per-fold validation ROC AUC values collected in the CV loop.
print("CV Score: ",np.mean(scores))

In [None]:
# Sanity check: shape of one fold's test predictions, and shape after
# stacking all folds along a new axis 1 and averaging over that axis.
print(preds[0].shape)
print(np.stack(preds, axis=1).mean(axis=1).shape)

In [None]:
# Average the per-fold test predictions and write the submission file.
# NOTE: `preds` is rebound from the list of fold predictions to the averaged
# array, so this cell should not be re-run without re-running the CV loop.
preds = np.stack(preds, axis=1).mean(axis=1)

ss = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv")
# Column 0 of the averaged array is used as the submitted target value.
ss["target"] = preds[:, 0]
ss.to_csv("submission.csv", index=False)
ss