![logo](https://www.goodrebels.com/wp-content/uploads/2018/12/181219_frankenstein_WP-1.png)

# The Frankenstein model
Many of the public notebooks in this competition use a DNN to achieve superior overfitting, while there has been very little (public) feature engineering. Feature engineering can be tedious work, so why not let the model itself find the best feature crossings? [This paper](https://arxiv.org/pdf/1708.05123.pdf) describes the concept.  

There haven't been many 2D CNN models either (maybe for a good reason?). There is nothing (technically) stopping us from feeding a 1D feature vector into a 2D CNN model - it is just a matter of reshaping the 1D feature vector to 2D. So in this notebook we will combine all three into a Deep and Cross and Convolutional Network - a Frankenstein model.

In [None]:
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_addons as tfa
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, AvgPool2D, BatchNormalization, Reshape, Activation
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import matplotlib.pyplot as plt

# Input data
The training data has been preprocessed in [this notebook](https://www.kaggle.com/mistag/jane-street-data-preprocessing).

In [None]:
# Load the preprocessed training frame produced by the preprocessing notebook.
train_all = pd.read_pickle('../input/jane-street-preprocessing/train_data.pkl')
# Binary target: 1 when resp is positive, else 0. Assign the NumPy array
# directly (positional, like the list it replaces) instead of building a
# Python list with one int object per row.
train_all['action'] = (train_all.resp > 0).astype(int).to_numpy()
# Every column whose name contains "feature" is treated as a model input.
features = [c for c in train_all.columns if "feature" in c]

# Stratification
During training we will run a cross-validation scheme, and some stratification is done here on the data. A new feature column based on binned dates and feature_0 is created as the stratification column.

In [None]:
# Stratification key: the date quartile (1..4) multiplied by feature_0.
# NOTE(review): presumably feature_0 is a +/-1 flag, which would give eight
# strata - confirm against the data.
train_all['date_bin'] = train_all['feature_0'] * (
    pd.qcut(train_all['date'], q=4, labels=False) + 1
)

# Model inputs: every column whose name contains "feature".
features = [col for col in train_all.columns if "feature" in col]

# One binary target per response horizon.
resp_cols = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4']

# X_train keeps the feature columns plus date_bin; date_bin is only needed for
# the stratified split and is dropped again before fitting.
keep_columns = train_all.columns.str.contains('feature|date_bin')
X_train = train_all.loc[:, keep_columns]

# Targets are 1 where the response is positive, 0 otherwise. Going through a
# plain array gives y_train a fresh 0..n-1 index.
y_train = pd.DataFrame(
    (train_all[resp_cols].to_numpy() > 0).astype('int'),
    columns=resp_cols,
)

del train_all

# Model building
Here we build the combined model of feature crossing, 2D CNN and DNN. Note the reshaping operation for the CNN. With 130 features, we have several options for 2D shape: 13x10, 10x13, 5x26, 26x5. The CNN model is inspired by [this notebook](https://www.kaggle.com/cdeotte/how-to-choose-cnn-architecture-mnist).   

There are several ways to put together these models. We can do sort of wide and deep with all three in parallel, or we can feed the output of the feature crossings to the CNN and DNN. 

In [None]:
# Dropout probability used by create_frankenstein_model.
dropout_rate = 0.2
# One entry per feature-crossing layer.
cross_units = [130] * 3
# One entry per hidden layer of the DNN branch.
hidden_units = [130] * 4

def create_frankenstein_model(num_columns, x_shape, y_shape):
    """Build the combined cross / 2D-CNN / DNN ("Frankenstein") Keras model.

    Three branches read the same 1D input vector in parallel and their outputs
    are concatenated before a sigmoid output layer with one unit per entry of
    the module-level ``resp_cols``:

    * feature crossing: ``len(cross_units)`` layers of
      ``cross = x0 * Dense(cross) + cross``;
    * 2D CNN: the input reshaped to ``(x_shape, y_shape, 1)`` and passed
      through two stacks of convolutions;
    * DNN: one Dense / BatchNorm / swish / Dropout group per entry of
      ``hidden_units``.

    Also reads the module-level ``dropout_rate``.

    Args:
        num_columns: Length of the 1D input feature vector.
        x_shape: Number of rows of the 2D grid fed to the CNN branch.
        y_shape: Number of columns of the 2D grid fed to the CNN branch.

    Returns:
        An uncompiled ``keras.Model`` mapping ``(batch, num_columns)`` inputs
        to ``(batch, len(resp_cols))`` sigmoid outputs.

    Raises:
        ValueError: If ``x_shape * y_shape`` does not equal ``num_columns``,
            so the feature vector cannot be reshaped into the requested grid.
    """
    # Fail early with a clear message instead of deep inside the Reshape layer.
    if x_shape * y_shape != num_columns:
        raise ValueError(
            "cannot reshape {} features into a {}x{} grid".format(
                num_columns, x_shape, y_shape
            )
        )

    x0 = layers.Input(shape=(num_columns,))

    # feature crossing model
    # The element-wise product with x0 forces every crossing layer to be
    # num_columns wide, so cross_units only controls how many layers there are.
    cross = x0
    for _ in cross_units:
        x = layers.Dense(num_columns)(cross)
        cross = x0 * x + cross
    cross = layers.BatchNormalization()(cross)
    cross = layers.Dropout(dropout_rate)(cross)

    # 2D CNN model
    cnn = layers.Reshape((x_shape, y_shape, 1))(x0)
    # Two stacks of (3x3 conv, 3x3 conv, strided 5x5 conv), 32 then 64 filters;
    # the strided convolution at the end of each stack does the down-sampling.
    for filters in (32, 64):
        cnn = layers.Conv2D(filters, kernel_size=3, activation='relu', padding='same')(cnn)
        cnn = layers.BatchNormalization()(cnn)
        cnn = layers.Conv2D(filters, kernel_size=3, activation='relu', padding='same')(cnn)
        cnn = layers.BatchNormalization()(cnn)
        cnn = layers.Conv2D(filters, kernel_size=5, strides=2, padding='same', activation='relu')(cnn)
        cnn = layers.BatchNormalization()(cnn)
        cnn = layers.Dropout(dropout_rate)(cnn)
    cnn = layers.Flatten()(cnn)
    cnn = layers.Dense(128, activation='relu')(cnn)
    cnn = layers.Dropout(dropout_rate)(cnn)

    # DNN model
    # Each hidden layer takes its width from hidden_units rather than from a
    # variable left over from the crossing loop.
    deep = x0
    for units in hidden_units:
        deep = layers.Dense(units)(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.Activation(tf.keras.activations.swish)(deep)
        deep = layers.Dropout(dropout_rate)(deep)

    # merging the 3 models
    merged = layers.concatenate([cross, cnn, deep])
    x = layers.Dense(len(resp_cols))(merged)
    out = layers.Activation("sigmoid")(x)
    model = keras.Model(inputs=x0, outputs=out)
    return model

# Training

In [None]:
MNAME = 'Frank'  # filename prefix for the per-fold checkpoint files


def get_callbacks(idx):
    """Return the Keras callbacks used while training fold ``idx``.

    The list holds, in order: a checkpoint writer saving only the best model
    to ``"<MNAME>-<idx>.h5"``, a learning-rate reducer triggered by a
    ``val_loss`` plateau, and early stopping on ``val_loss``.
    """
    checkpoint_path = f"{MNAME}-{idx}.h5"
    return [
        ModelCheckpoint(checkpoint_path, save_best_only=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001),
        # Best weights are not restored in memory; the checkpoint file above
        # is the artefact that keeps the best model.
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=False),
    ]

In [None]:
NFOLDS = 5

history = []  # one Keras History object per fold, in fold order
epochs = 200
batch_size = 4096
label_smoothing = 1e-2
learning_rate = 1e-3

skf = StratifiedKFold(n_splits=NFOLDS, shuffle = True, random_state = 42)

# Create the split generator exactly once and advance it with the loop, so
# that each iteration trains on a different fold. Calling skf.split() inside
# the loop would restart the generator and yield the first split every time.
# The generator is given a copy of the stratification labels and a placeholder
# X, so it holds no reference to X_train and the frames can still be freed on
# the last fold.
strat_labels = X_train['date_bin'].to_numpy()
fold_splits = skf.split(np.zeros(len(strat_labels)), strat_labels)

for i, (train_idx, val_idx) in enumerate(fold_splits):
    start = time.time()

    # date_bin is only the stratification key, not a model input.
    X_tr = X_train.iloc[train_idx].drop(columns='date_bin').reset_index(drop=True)
    y_tr = y_train.iloc[train_idx].reset_index(drop=True)
    X_val = X_train.iloc[val_idx].drop(columns='date_bin').reset_index(drop=True)
    y_val = y_train.iloc[val_idx].reset_index(drop=True)
    del train_idx, val_idx
    if i == NFOLDS - 1:
        # Last fold: the full frames are no longer needed.
        del X_train, y_train

    tf.random.set_seed(42*i)
    model = create_frankenstein_model(len(features), 13, 10)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )
    callbacks = get_callbacks(i)
    history.append(model.fit(X_tr, y_tr, epochs=epochs, batch_size=batch_size, validation_data=(X_val,y_val), callbacks=callbacks, verbose=0))

    del model, X_tr, y_tr, X_val, y_val # save precious memory
    print('fold {} training: {}'.format(i, time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))

Let's take a look at the training curves:

In [None]:
# Per-fold AUC curves: solid lines are training, dashed lines are validation.
# All training curves are drawn first so the legend lists them before the
# validation curves.
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']
plt.figure(figsize=(16,10))
for metric_key, line_style, prefix in (('AUC', '-', 'Train'), ('val_AUC', '--', 'Validation')):
    for i in range(NFOLDS):
        plt.plot(
            history[i].history[metric_key],
            linestyle=line_style,
            color=colors[i],
            label=f'{prefix} Fold {i}',
        )
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend()
plt.show();

In [None]:
# Per-fold loss curves: solid lines are training, dashed lines are validation.
colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple']
plt.figure(figsize=(16,10))
for i in range(NFOLDS):
    plt.plot(history[i].history['loss'],linestyle='-', color=colors[i], label='Train Fold {}'.format(str(i)))
for i in range(NFOLDS):
    plt.plot(history[i].history['val_loss'],linestyle='--', color=colors[i], label='Validation Fold {}'.format(str(i)))
plt.title('Model Loss')
# This figure shows the loss, so the y-axis is labelled accordingly.
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show();

# Save training results
The best validation loss and AUC of each fold are saved here for later use during inference.

In [None]:
# One row per fold: fold number, lowest validation loss and highest
# validation AUC seen during that fold's training.
res = [
    [fold, np.min(history[fold].history['val_loss']), np.max(history[fold].history['val_AUC'])]
    for fold in range(NFOLDS)
]
# Best fold (highest validation AUC) first.
rdf = pd.DataFrame(res, columns=['Model', 'Loss', 'AUC'])
rdf = rdf.sort_values(by=['AUC'], ascending=False)
rdf.to_pickle('results.pkl')
rdf