# 1. Imports
Here we import the required packages.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, QuantileTransformer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from keras.backend import sigmoid
from numpy.random import seed

# 2. Data Preparation
Import the data and prepare it for use in the models.

In [None]:
# Load the training and test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-nov-2021/train.csv",
        "/kaggle/input/tabular-playground-series-nov-2021/test.csv",
    )
)

In [None]:
# Every column except the row identifier and the label is a model feature.
non_feature_columns = ['id', 'target']
features = train.columns.drop(non_feature_columns)

In [None]:
# Split the training table into the feature matrix and the label column.
X, y = train[features], train['target']

In [None]:
# Apply a StandardScaler to every feature column.
# Alternative kept for reference: MinMaxScaler((0,1)) over the same columns.
feature_scaler = StandardScaler()
preprocessor = make_column_transformer((feature_scaler, features))

In [None]:
# Fit the preprocessor on the training features, then apply the already
# fitted transform to the test features.
test_features = test[features]
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(test_features)

# 3. Keras Tuning
Set up the KerasTuner for a random search. Some notes:
* We will use EarlyStopping and ReduceLROnPlateau to reduce overfitting, and we will use these same callbacks later when we submit the final model.
* We will use the Swish activation function in all models. We define it below with the tf.function decorator to make it a tensorflow function.
* I've included a crude skip-to-the-end parameter called "skips". This makes the labelled layer also connect to the final layer. This hasn't provided much value but could be explored further.
* I've used a constant dropout rate for each layer, differing dropout rates would be better (but take longer to search).
* I've tuned on only 20% of the total data. This is an attempt to avoid effectively training the parameters to be a good fit for the validation data, and poor on test data. It also reduces the time spent on tuning.

In [None]:
# Stop once val_loss has not improved for 20 epochs and roll back to the
# best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement.
plateau = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.2,
    patience=5,
    verbose=0,
)

@tf.function
def swish(x):
    """Swish activation: the input multiplied element-wise by its sigmoid."""
    activated = x * sigmoid(x)
    return activated

In [None]:
def base_model(hp):
    """Build and compile a tunable dense binary classifier for KerasTuner.

    Hyperparameters registered on ``hp`` (names and ranges unchanged):

    * ``layer_1_size`` .. ``layer_4_size``: units of each hidden layer
      (32 to 512 in steps of 32).
    * ``dropout_size``: a single dropout rate shared by all hidden blocks.
    * ``Skips``: a string of 1-based hidden-layer numbers whose outputs are
      additionally concatenated into the input of the prediction layer
      (``''`` means no skip connections).
    * ``learning_rate``: learning rate of the Adam optimizer.

    Every hidden block is Dense(swish) -> BatchNormalization -> Dropout.
    The input width is taken from the module-level ``X_test``.

    Args:
        hp: the KerasTuner hyperparameter container passed in by the tuner.

    Returns:
        A compiled ``keras.Model`` with one sigmoid output, binary
        cross-entropy loss and an AUC metric named ``"auc"``.
    """
    num_input = keras.Input(shape=(X_test.shape[1],), name='num_data')  # input layer

    # Register the hyperparameters in the same order and under the same names
    # as before so the search space is unchanged.
    layer_sizes = [
        hp.Int("layer_%d_size" % layer_no, min_value=32, max_value=512, step=32)
        for layer_no in range(1, 5)
    ]
    do_size = hp.Float("dropout_size", min_value=0.1, max_value=0.5, step=0.01)

    # Stack the hidden blocks. Each block's *final* output (after batch
    # normalisation and dropout) feeds the next block and is remembered so it
    # can be used as a skip connection. Previously the BatchNormalization and
    # Dropout outputs of layers 2-4 were assigned to a throwaway variable and
    # therefore never became part of the model.
    layer_list = []
    l = num_input
    for size in layer_sizes:
        l = layers.Dense(units=size, activation=swish)(l)
        l = layers.BatchNormalization()(l)
        l = layers.Dropout(do_size)(l)
        layer_list.append(l)

    skips = hp.Choice("Skips", values=['', '1', '2', '3', '4', '12', '13', '14', '23', '24', '34'])

    # The choice labels number the hidden layers from 1, while layer_list is
    # 0-based; convert explicitly so that '4' addresses the last layer instead
    # of raising an IndexError. (Choosing '4' concatenates the last block with
    # itself, which is redundant but valid.)
    for skip in skips:
        l = layers.Concatenate()([l, layer_list[int(skip) - 1]])

    # A single output: our predicted target value probability
    out = keras.layers.Dense(1, activation='sigmoid', name='prediction')(l)

    model = keras.Model(
        inputs=[num_input],
        outputs=out,
    )

    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=hp.Float("learning_rate", min_value=0.0001, max_value=0.001, step=0.0001)
        ),
        loss='binary_crossentropy',
        # Name the metric explicitly so the tuner objective "val_auc" does not
        # depend on Keras' automatic metric naming ("auc", "auc_1", ...).
        metrics=[keras.metrics.AUC(name="auc")],
    )

    return model

In [None]:
#Tuning
import keras_tuner
from keras_tuner import RandomSearch, BayesianOptimization

# Tune on a 20% subsample of the training data, then split that subsample
# into tuning-train / tuning-validation parts. Both splits are stratified on
# the target and seeded so that tuning runs are reproducible.
X_tune, _, y_tune, _ = train_test_split(
    X, y, train_size=0.2, stratify=y, random_state=42
)
X_tune_train, X_tune_test, y_tune_train, y_tune_test = train_test_split(
    X_tune, y_tune, stratify=y_tune, random_state=42
)

# Random search over the base_model search space, maximising validation AUC.
tuner = RandomSearch(
    base_model,
    keras_tuner.Objective("val_auc", direction="max"),
    max_trials=100,
    seed=42,
    overwrite=True,
)

Uncomment the below two cells to carry out the tuning.

In [None]:
#tuner.search(X_tune_train,y_tune_train, epochs=100, batch_size=2048, validation_data = (X_tune_test,y_tune_test),callbacks=[early_stopping,plateau])

In [None]:
#tuner.results_summary()

# 4. Final model preparation and training
From the above, I found that a model with layers of size 32,320,384,288 with learning rate = 0.0008 and dropout rate 0.38 skipping the first and second layers worked well. Note the unconventional shape, but that's what the tuner found.

We'll train using a 5-fold cross validation method.

In [None]:
# Seed both NumPy's and TensorFlow's global random generators.
RANDOM_SEED = 42
seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

def new_model(shape, dropout, skips=None):
    '''Build an (uncompiled) dense binary classifier.

    Every hidden block is Dense(swish) -> BatchNormalization -> Dropout.
    The input width is taken from the module-level ``X``.

    Args:
        shape: sizes of the hidden layers, in order. The sequence is only
            read, never modified.
        dropout: dropout rate applied after every hidden layer.
        skips: 0-based indices of hidden blocks whose outputs are also
            concatenated into the input of the output layer. ``None``
            (the default) means no skip connections.

    Returns:
        A ``keras.Model`` with a single sigmoid output unit.
    '''
    # Avoid a shared mutable default argument.
    skips = [] if skips is None else skips

    inputs = keras.Input(shape=(X.shape[1],), name='Inputs')

    # Add the hidden layers. Iterating (rather than popping the first entry)
    # leaves the caller's list untouched. No input_shape is passed to Dense:
    # the layer infers it from the Input tensor it is called on.
    layerlist = []
    l = inputs
    for size in shape:
        l = layers.Dense(size, activation=swish)(l)
        l = layers.BatchNormalization()(l)
        l = layers.Dropout(dropout)(l)
        layerlist.append(l)

    # Route the requested hidden blocks straight to the output layer as well.
    for skip in skips:
        l = layers.Concatenate()([l, layerlist[skip]])

    # Add output layer
    out = layers.Dense(1, activation='sigmoid')(l)
    model = keras.Model(inputs, out)

    return model

# Stop once val_loss has not improved for 20 epochs and roll back to the
# best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
    restore_best_weights=True,
)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement.
plateau = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.2,
    patience=5,
    verbose=0,
)

def new_model_train(shape,X_train,y_train,X_valid,y_valid,batch_size=2000,epochs=100,learning_rate=0.01,dropout=0.2,skips=None,verbose=1):
    """Build, compile and fit one model; return the fitted model.

    Args:
        shape: sizes of the hidden layers, in order (not modified).
        X_train, y_train: training features and labels.
        X_valid, y_valid: validation data monitored by the callbacks.
        batch_size: mini-batch size used for fitting.
        epochs: maximum number of epochs (early stopping may end sooner).
        learning_rate: learning rate of the Adam optimizer.
        dropout: dropout rate applied after every hidden layer.
        skips: 0-based indices of hidden blocks that also feed the output
            layer; ``None`` (the default) means none.
        verbose: verbosity level forwarded to ``model.fit``.

    Returns:
        The fitted ``keras.Model``.
    """
    # Avoid a shared mutable default argument.
    skips = [] if skips is None else skips

    # Hand new_model its own copy of the layer sizes so the caller's list
    # is never altered by model construction.
    model = new_model(list(shape), dropout, skips)

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=[keras.metrics.AUC()]
    )

    # Uses the module-level early_stopping and plateau callbacks.
    model.fit(
        X_train, y_train, validation_data=(X_valid, y_valid),
        batch_size=batch_size,
        epochs=epochs,
        callbacks=[early_stopping, plateau],
        verbose=verbose
    )

    return model

In [None]:
# Overall out-of-fold AUC of every kfold_train run, in call order.
auc_scores = []

def kfold_train(n_splits,shape,batch_size=2000,epochs=1000,learning_rate=0.01,dropout=0.2,skips=None,verbose=1):
    """Train one model per stratified fold and collect the predictions.

    Side effects (module-level globals):
        * ``train_oof``: out-of-fold predictions for every row of ``X``.
        * ``test_pred``: mean of the per-fold predictions on ``X_test``.
        * ``auc_scores``: the overall out-of-fold AUC is appended.

    Args:
        n_splits: number of stratified folds.
        shape: sizes of the hidden layers, in order (not modified).
        batch_size: mini-batch size used for fitting.
        epochs: maximum number of epochs per fold.
        learning_rate: learning rate of the Adam optimizer.
        dropout: dropout rate applied after every hidden layer.
        skips: 0-based indices of hidden blocks that also feed the output
            layer; ``None`` (the default) means none.
        verbose: verbosity level forwarded to ``model.fit``.
    """
    global test_pred
    global train_oof

    # Avoid a shared mutable default argument.
    skips = [] if skips is None else list(skips)
    layer_sizes = list(shape)
    # Index the labels positionally: StratifiedKFold yields positions, while
    # indexing a pandas Series with [] is label-based.
    y_values = np.asarray(y)

    test_pred = np.zeros(len(X_test))
    train_oof = np.zeros(len(X))
    kf = StratifiedKFold(n_splits=n_splits,random_state=2020,shuffle=True)
    for i, (train_index, valid_index) in enumerate(kf.split(X, y_values)):
        print("Fitting Fold %2i/%2i" %(i+1,n_splits))
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y_values[train_index], y_values[valid_index]
        # Pass fresh copies so one fold can never alter the configuration of
        # the next, and forward the requested skips (they were previously
        # replaced by an empty list, silently disabling skip connections).
        model = new_model_train(
            list(layer_sizes), X_train, y_train, X_valid, y_valid,
            batch_size=batch_size, epochs=epochs, learning_rate=learning_rate,
            dropout=dropout, skips=list(skips), verbose=verbose,
        )
        valid_pred = model.predict(X_valid)[:,0]
        score = roc_auc_score(y_valid,valid_pred)
        print("Fold %2i score: %1.6f" %((i+1),score))

        train_oof[valid_index] = valid_pred
        # Average the test predictions over the folds.
        test_pred += model.predict(X_test)[:,0]/n_splits

    score = roc_auc_score(y_values,train_oof)
    auc_scores.append(score)
    print("----------------------------")
    print("Final score is: %1.6f" %score)
    print("----------------------------")

In [None]:
# Run 5-fold training with the configuration selected by the tuner.
kfold_train(
    n_splits=5,
    shape=[32, 320, 384, 288],
    batch_size=2048,
    learning_rate=0.0008,
    dropout=0.38,
    skips=[0, 1],
    verbose=1,
)

In [None]:
# Pair each test id with its fold-averaged prediction and write the
# submission file.
subm = test[['id']].assign(target=test_pred)
subm.to_csv('submission.csv', index=False)
subm