### This notebook uses the approach described in https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers

It builds a set of specialized layers to transform the categorical features and to normalize the numerical features,
and concatenates their outputs to create the input to a fully connected network.

The notebook shows the following **techniques:**
* how to put all the preprocessing inside the network
* how to use a TF dataset (`tf.data.Dataset`)
* how to use K-fold cross-validation to improve accuracy in validation
* how to save the best model (lowest val_loss) for each fold

The notebook has achieved **AUC = 0.871**.
The notebook is intended to explore and show the technique.
It is not a record (and not my best result in this competition). Better results with a NN require extensive hyper-parameter optimization, and I think it is easier to get a better score
with GBM.

In [None]:
import numpy as np 
import pandas as pd 
import random as rn
import tensorflow as tf

from tensorflow.keras.backend import clear_session
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

import os

import logging
# Keep the notebook output readable: TensorFlow's own logger is restricted
# to messages of level ERROR and above, which hides its warnings.
logger = tf.get_logger()
logger.setLevel(level=logging.ERROR)

In [None]:
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
# Print the version of the TensorFlow package this notebook is running on.
print(tf.__version__)

In [None]:
# Target device for this run; only the value "GPU" triggers the GPU probing below.
DEVICE = 'GPU'

if DEVICE == "GPU":
    # Number of GPUs visible to TensorFlow.
    n_gpu = len(tf.config.experimental.list_physical_devices('GPU'))
    print("Num GPUs Available: ", n_gpu)

    if n_gpu > 1:
        # More than one GPU: use a MirroredStrategy to train on all of them.
        print("Using strategy for multiple GPU")
        strategy = tf.distribute.MirroredStrategy()
    else:
        print('Standard strategy for GPU...')
        strategy = tf.distribute.get_strategy()
else:
    # Any other DEVICE value: fall back to the current default strategy so that
    # `strategy` is always defined for the statements below (and for the
    # `strategy.scope()` used during training) instead of raising NameError.
    print(f'DEVICE is {DEVICE!r}: using the default strategy')
    strategy = tf.distribute.get_strategy()

AUTO = tf.data.experimental.AUTOTUNE
# Number of replicas the selected strategy keeps in sync.
REPLICAS = strategy.num_replicas_in_sync

print(f'REPLICAS: {REPLICAS}')

In [None]:
def enable_reproducibility(seed):
    """Put every random number generator used here in a well-defined state.

    Sets the ``PYTHONHASHSEED`` environment variable to ``'0'`` and seeds,
    in this order, NumPy, Python's built-in ``random`` module and TensorFlow
    with ``seed``.

    NOTE(review): PYTHONHASHSEED is assigned after the interpreter has
    started; presumably it only influences child processes -- confirm.
    """
    os.environ['PYTHONHASHSEED'] = '0'
    for seed_fn in (np.random.seed, rn.seed, tf.random.set_seed):
        seed_fn(seed)


enable_reproducibility(1234)

In [None]:
# Directory holding the competition files, and the files read from it.
BASE_DIR = '/kaggle/input/tabular-playground-series-mar-2021'

FILE_TRAIN = f'{BASE_DIR}/train.csv'
FILE_TEST = f'{BASE_DIR}/test.csv'
FILE_SAMPLE = f'{BASE_DIR}/sample_submission.csv'

In [None]:
# globals
# number of cross-validation folds (used as n_splits of the KFold splitter)
FOLDS = 5

# batch size of the per-fold training and validation datasets
BATCH_SIZE = 128
# number of epochs each fold's model is trained for
EPOCHS = 15
# NOTE(review): not referenced anywhere in the visible code (the categorical
# features cell passes max_tokens=5 directly) -- confirm whether it is still needed
MAX_TOKENS = 100

# name of the label column, separated from the features when datasets are built
PREDICTOR = 'target'

In [None]:
# Load the full training set; it is split into folds later on.
orig_data = pd.read_csv(FILE_TRAIN)

In [None]:
# Show the column names of the training frame.
orig_data.columns

In [None]:
def df_to_dataset(df, predictor, shuffle=True, batch_size=32):
    """Turn a labelled DataFrame into a batched ``tf.data.Dataset``.

    The frame is copied first, so the caller's ``df`` keeps its label column.

    Args:
        df: DataFrame holding the feature columns and the label column.
        predictor: name of the label column.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch.

    Returns:
        A dataset yielding ``(features, labels)`` batches, where ``features``
        is a dict keyed by column name.
    """
    features = df.copy()
    target = features.pop(predictor)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))

    return dataset.batch(batch_size)

In [None]:
def get_normalization_layer(name, dataset):
    """Return a Normalization layer adapted to one numerical feature.

    Used for all the numerical (cont) columns.

    Args:
        name: key of the feature inside the dataset's feature dict.
        dataset: dataset yielding ``(features, labels)`` pairs.

    Returns:
        The Normalization layer, after learning the statistics of the feature.
    """
    layer = preprocessing.Normalization()

    # Keep only the requested feature; the labels are not needed here.
    single_feature = dataset.map(lambda features, label: features[name])

    # Learn the statistics of the data.
    layer.adapt(single_feature)
    return layer

In [None]:
# First version of ds_train, used while the model is created: the
# preprocessing layers are adapted on it (e.g. mean and std for normalization).
# Half of the total data is enough for that.
FRAC = 0.5

N_TRAIN = int(len(orig_data) * FRAC)
df_train = orig_data.iloc[:N_TRAIN]
ds_train = df_to_dataset(df_train, PREDICTOR, shuffle=True, batch_size=256)

In [None]:
# Continuous features: cont0 ... cont10.
num_col_list = [f'cont{i}' for i in range(11)]

# Every feature fed to the model has to be registered in both lists:
# its Keras Input and the matching preprocessed tensor.
all_inputs = []
encoded_features = []

# Numeric features: one Input per column, normalized with statistics
# learned from ds_train.
for header in num_col_list:
    print('preparing', header)
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    encoded_numeric_col = get_normalization_layer(header, ds_train)(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

In [None]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    """Build the encoder for one categorical feature.

    Used for all the categorical (cat) columns, which get one-hot encoded.

    Args:
        name: key of the feature inside the dataset's feature dict.
        dataset: dataset yielding ``(features, labels)`` pairs.
        dtype: ``'string'`` selects a StringLookup; any other value selects
            an IntegerLookup.
        max_tokens: size limit handed to the lookup layer.

    Returns:
        A callable mapping a feature tensor to its encoding; it keeps
        references to both adapted layers so they become part of the
        functional model.
    """
    # Lookup layer turning raw values into integer indices.
    if dtype == 'string':
        lookup = preprocessing.StringLookup(max_tokens=max_tokens)
    else:
        lookup = preprocessing.IntegerLookup(max_values=max_tokens)

    # Dataset yielding only this feature.
    raw_values = dataset.map(lambda features, label: features[name])

    # Learn the set of possible values and give each a fixed integer index.
    lookup.adapt(raw_values)

    # CategoryEncoding layer sized to the learned vocabulary, adapted on the
    # integer indices produced by the lookup.
    one_hot = preprocessing.CategoryEncoding(max_tokens=lookup.vocab_size())
    one_hot.adapt(raw_values.map(lookup))

    def encode(feature):
        return one_hot(lookup(feature))

    return encode

In [None]:
# Categorical features: start with the low-dimensional ones
# (fewer than 5 distinct values).
cat_col_list = [
    'cat0', 'cat11', 'cat12', 'cat13', 'cat14',
    'cat15', 'cat16', 'cat17', 'cat18',
]

# One string Input per column, encoded by layers adapted on ds_train.
for header in cat_col_list:
    print('preparing', header)
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(
        header, ds_train, dtype='string', max_tokens=5)
    all_inputs.append(categorical_col)
    encoded_features.append(encoding_layer(categorical_col))

In [None]:
def build_model(n_units):
    """Build and compile the fully connected classifier.

    The network (functional API) concatenates the module-level
    ``encoded_features`` and feeds them through three ReLU Dense layers of
    ``n_units`` units, with a 0.1 Dropout between consecutive hidden layers,
    followed by a single sigmoid unit. Its inputs are the module-level
    ``all_inputs``.

    Args:
        n_units: width of each hidden Dense layer.

    Returns:
        The model, compiled with the Adam optimizer, binary cross-entropy
        loss and the AUC metric.
    """
    keras_layers = tf.keras.layers
    n_hidden = 3
    dropout_rate = 0.1

    # concatenate all the preprocessed input columns
    x = keras_layers.concatenate(encoded_features)

    # the 'traditional' NN: Dense, then (Dropout, Dense) for the other layers
    for depth in range(n_hidden):
        if depth > 0:
            x = keras_layers.Dropout(dropout_rate)(x)
        x = keras_layers.Dense(n_units, activation="relu")(x)

    output = keras_layers.Dense(1, activation='sigmoid')(x)

    model = tf.keras.Model(all_inputs, output)
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=["AUC"],
    )
    return model

In [None]:
# Build a first model, with 32 units per hidden layer, to inspect its structure.
model = build_model(32)

In [None]:
# Print the summary of the model.
model.summary()

In [None]:
# Draw a picture of the network; the layers on the left are the 'preprocessing layers'.
# rankdir='LR' is used to make the graph horizontal (left to right).
tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

In [None]:
# Training with K-fold cross-validation.
# For every fold a fresh model is trained; the weights of the best epoch
# (lowest val_loss) are checkpointed and reloaded before the fold is evaluated.
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# the history of every fold is kept for later investigation
histories = []

avg_auc = 0.

for n_fold, (idxT, idxV) in enumerate(skf.split(orig_data), start=1):
    print()
    print('***** Fold n.', n_fold)

    df_train, df_valid = orig_data.iloc[idxT], orig_data.iloc[idxV]

    # tf datasets for this fold: only the training one is shuffled
    ds_train = df_to_dataset(df_train, PREDICTOR, shuffle=True,
                             batch_size=BATCH_SIZE)
    ds_valid = df_to_dataset(df_valid, PREDICTOR, shuffle=False,
                             batch_size=BATCH_SIZE)

    # clear the Keras session before building the next model
    clear_session()

    with strategy.scope():
        model = build_model(32)

    # don't use h5 format (to avoid a problem): the path has no .h5 suffix
    weights_path = 'fold-%i' % n_fold
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='min', save_freq='epoch')

    history = model.fit(ds_train, validation_data=ds_valid,
                        epochs=EPOCHS, verbose=1, callbacks=[sv])
    histories.append(history)

    # evaluate the fold with the best weights, not those of the last epoch
    model.load_weights(weights_path)
    results = model.evaluate(ds_valid)

    # results[1] is the compiled metric (AUC); results[0] is the loss
    avg_auc += results[1]

# average AUC across folds
avg_auc = avg_auc / float(FOLDS)

print()
print('Average AUC across folds is', round(avg_auc, 4))

In [None]:
def plot_loss(hist, skip):
    """Plot the training and validation loss of one training run.

    Args:
        hist: object whose ``history`` dict holds the 'loss' and 'val_loss'
            series (one value per epoch).
        skip: number of leading epochs left out of the plot.
    """
    curves = (
        ('loss', 'Training loss'),
        ('val_loss', 'Validation loss'),
    )

    plt.figure(figsize=(14, 6))
    for key, label in curves:
        plt.plot(hist.history[key][skip:], label=label)

    plt.title('Loss')
    plt.legend(loc='upper right')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.grid(True)
    plt.show()

def plot_auc(hist, skip):
    """Plot the training and validation AUC of one training run.

    Args:
        hist: object whose ``history`` dict holds the 'auc' and 'val_auc'
            series (one value per epoch).
        skip: number of leading epochs left out of the plot.
    """
    plt.figure(figsize=(14,6))

    plt.plot(hist.history['auc'][skip:], label='Training AUC')
    plt.plot(hist.history['val_auc'][skip:], label='Validation AUC')
    plt.title('AUC')
    plt.legend(loc='upper right')
    # the plotted quantity is the AUC, so the y axis must not be labelled 'loss'
    plt.ylabel('AUC')
    plt.xlabel('epoch')
    plt.grid(True)
    plt.show()

In [None]:
# one loss chart per fold, starting from the first epoch
for fold in range(FOLDS):
    plot_loss(hist=histories[fold], skip=0)

In [None]:
# one AUC chart per fold, starting from the first epoch
for fold in range(FOLDS):
    plot_auc(hist=histories[fold], skip=0)

### Prepare the evaluation on test set

In [None]:
# Load the test set the final predictions are made on.
orig_test = pd.read_csv(FILE_TEST)

# Show its first rows.
orig_test.head()

In [None]:
def df_test_to_dataset(df, batch_size=32):
    """Turn an unlabelled DataFrame into a batched ``tf.data.Dataset``.

    Unlike the training variant this one does not reference the target
    column: the dataset yields only dicts of features keyed by column name.

    Args:
        df: DataFrame with the feature columns.
        batch_size: number of rows per batch.

    Returns:
        The batched dataset, in the row order of ``df``.
    """
    features = dict(df.copy())
    return tf.data.Dataset.from_tensor_slices(features).batch(batch_size)


# prediction only: a higher batch size than the one used for training
ds_test = df_test_to_dataset(orig_test, batch_size=512)

In [None]:
# accumulator for the test predictions, averaged over the folds at the end
avg_preds = np.zeros(orig_test.shape[0])

for n_fold in range(1, FOLDS + 1):
    print('Predictions with fold n.', n_fold)

    # rebuild the network and load the best weights saved for this fold
    model = build_model(32)
    model.load_weights('fold-%i' % n_fold)

    # predicted probabilities, made one-dimensional before being accumulated
    preds = model.predict(ds_test)
    preds = preds.reshape(preds.shape[0])
    avg_preds += preds

avg_preds /= float(FOLDS)

In [None]:
# Write the submission file: one row per test id with the probability
# averaged over the folds.
SUB_NAME = 'submission00.csv'

dict_sub = {
    "id": orig_test['id'],
    "target": avg_preds,
}
df_submission = pd.DataFrame(dict_sub)
df_submission.to_csv(SUB_NAME, index=False)

In [None]:
# Show the first rows of the submission as a final check.
df_submission.head()