This code is forked from https://www.kaggle.com/gandagorn/gru-lstm-mix-with-custom-loss

The TCN model is from https://github.com/philipperemy/keras-tcn

A single TCN doesn't work well, scoring only 0.4 on the public LB, so I use a two-layer bidirectional TCN followed by an LSTM or GRU.

Adding the TCN makes the model converge more easily, but it gives no improvement on the local validation set.


In [None]:
#the basics
import pandas as pd, numpy as np
import math, json, gc, random, os, sys
from matplotlib import pyplot as plt
from tqdm import tqdm

#tensorflow deep learning basics
import tensorflow as tf
import tensorflow_addons as tfa

import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Layer,Input,Activation, Lambda,Conv1D, SpatialDropout1D,Convolution1D,Dense,add,GlobalMaxPooling1D,GlobalAveragePooling1D,concatenate,Embedding

from tensorflow.keras.models import Model
from typing import List, Tuple
from tensorflow.keras.utils import plot_model

#for model evaluation
from sklearn.model_selection import train_test_split, KFold,  StratifiedKFold

In [None]:
# Load the competition files: train/test are read as JSON-lines (one record per line),
# the sample submission as a regular CSV.
train = pd.read_json('/kaggle/input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json('/kaggle/input/stanford-covid-vaccine/test.json', lines=True)
sample_sub = pd.read_csv("/kaggle/input/stanford-covid-vaccine/sample_submission.csv")

In [None]:
# Names of the target columns: read from `train` as labels and used as the
# prediction columns of the submission frame.
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

In [None]:
# Single character -> integer vocabulary shared by every encoded column.
token2int = {x: i for i, x in enumerate('().ACGUBEHIMSX')}


def preprocess_inputs(df, cols=None):
    """Encode string columns of ``df`` as one integer array.

    Args:
        df: DataFrame whose ``cols`` columns hold strings of equal length made
            only of characters present in ``token2int``.
        cols: Column names to encode, in output-channel order. Defaults to
            ``['sequence', 'structure', 'predicted_loop_type']``.

    Returns:
        Integer ``np.ndarray`` of shape ``(len(df), string_length, len(cols))``.
        An empty ``df`` yields an array of shape ``(0, 0, len(cols))``.

    Raises:
        KeyError: If a column is missing from ``df`` or a character is not in
            ``token2int``.
    """
    # A ``None`` sentinel replaces the former mutable list default.
    if cols is None:
        cols = ['sequence', 'structure', 'predicted_loop_type']
    cols = list(cols)

    # Nested comprehensions instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1.
    encoded = [
        [[token2int[ch] for ch in seq] for seq in row]
        for row in df[cols].values.tolist()
    ]
    if not encoded:
        # np.transpose with three axes cannot handle the 1-D empty array.
        return np.empty((0, 0, len(cols)), dtype=int)

    # (rows, columns, positions) -> (rows, positions, columns)
    return np.transpose(np.array(encoded), (0, 2, 1))

# Encoded training inputs, plus the targets with their last two axes swapped
# (assumes each target cell holds a per-position list -- TODO confirm against the data).
train_inputs_all = preprocess_inputs(train)
train_labels_all = np.transpose(
    np.array(train[target_cols].values.tolist()),
    (0, 2, 1),
)

In [None]:
def channel_normalization(x):
    # type: (Layer) -> Layer
    """Scale a tensor by its largest absolute value along axis 2.

    Args:
        x: The tensor to normalize.
    Returns:
        ``x`` divided by ``max(|x|) + 1e-5``, where the maximum is taken over
        axis 2 and kept as a size-1 dimension so the division broadcasts.
        The small constant guards against division by zero.
    """
    peak = K.max(K.abs(x), 2, keepdims=True)
    return x / (peak + 1e-5)


def wave_net_activation(x):
    # type: (Layer) -> Layer
    """Apply the gated WaveNet activation: ``tanh(x) * sigmoid(x)``.

    See https://deepmind.com/blog/wavenet-generative-model-raw-audio/
    Args:
        x: The tensor to apply the activation to.
    Returns:
        The element-wise product of the tanh and sigmoid branches of ``x``.
    """
    # Branch order (tanh, then sigmoid) is kept so layers are created in the
    # same sequence as before.
    branches = [Activation(fn)(x) for fn in ('tanh', 'sigmoid')]
    return L.multiply(branches)


def residual_block(x, s, i, activation, nb_filters, kernel_size, padding, dropout_rate=0):
    # type: (Layer, int, int, str, int, int, str, float) -> Tuple[Layer, Layer]
    """Build one residual block of the TCN.

    The block is: dilated convolution -> activation -> spatial dropout ->
    1x1 convolution, with the block input added back onto the result.

    Args:
        x: The previous layer in the model.
        s: The stack index, i.e. which stack in the overall TCN (not used
            inside the block).
        i: The dilation rate handed to the dilated convolution.
        activation: 'norm_relu', 'wavenet', or any name accepted by
            ``Activation``.
        nb_filters: The number of convolutional filters to use in this block.
        kernel_size: The size of the convolutional kernel.
        padding: The padding used in the dilated convolution.
        dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
    Returns:
        A tuple where the first element is the residual output (input plus the
        1x1 convolution output) and the second is the skip connection (the
        1x1 convolution output alone).
    """
    block_input = x
    dilated = Conv1D(
        filters=nb_filters,
        kernel_size=kernel_size,
        dilation_rate=i,
        padding=padding,
    )(x)

    if activation == 'wavenet':
        activated = wave_net_activation(dilated)
    elif activation == 'norm_relu':
        # ReLU first, then rescale by the per-position maximum.
        rectified = Activation('relu')(dilated)
        activated = Lambda(channel_normalization)(rectified)
    else:
        activated = Activation(activation)(dilated)

    dropped = SpatialDropout1D(dropout_rate)(activated)

    # 1x1 convolution produces the skip output.
    skip = Convolution1D(nb_filters, 1, padding='same')(dropped)
    residual = L.add([block_input, skip])
    return residual, skip


def process_dilations(dilations):
    """Normalize a dilation list to actual dilation rates.

    If every entry is already a power of two the input is returned unchanged
    (same object). Otherwise the entries are treated as exponents and a new
    list ``[2 ** d for d in dilations]`` is returned.
    """
    def has_single_bit(value):
        # A non-zero integer with exactly one bit set is a power of two.
        return value != 0 and value & (value - 1) == 0

    # Every entry is checked (no short-circuit) before deciding.
    offenders = [d for d in dilations if not has_single_bit(d)]
    if offenders:
        return [2 ** exponent for exponent in dilations]
    return dilations


class TCN(Layer):
    """Temporal convolutional network built from stacked dilated residual blocks.

    Usage: ``TCN(nb_filters, ...)(input_tensor)``. Note that ``__call__`` is
    overridden and creates new Keras layers every time the object is applied
    to a tensor.

    Args:
        nb_filters: The number of filters to use in the convolutional layers.
        kernel_size: The size of the kernel to use in each convolutional layer.
        nb_stacks: The number of stacks of residual blocks to use.
        dilations: The list of dilation rates, e.g. [1, 2, 4, 8, 16, 32, 64].
            ``None`` is replaced by [1, 2, 4, 8, 16, 32] on the first call.
        activation: The activation to use (norm_relu, wavenet, relu...).
        padding: The padding to use in the convolutional layers, 'causal' or 'same'.
        use_skip_connections: Boolean. If True the output is the sum of the
            skip outputs of all residual blocks; otherwise it is the output of
            the last residual block.
        dropout_rate: Float between 0 and 1. Fraction of the input units to drop.
        return_sequences: Boolean. If True return the full sequence, otherwise
            only the last timestep.

    Raises:
        ValueError: If ``padding`` is neither 'causal' nor 'same'.
        TypeError: If ``nb_filters`` is not an int (typically the pre-2.1.2
            calling convention ``TCN(input_tensor, ...)``).
    """

    def __init__(self,
                 nb_filters=64,
                 kernel_size=2,
                 nb_stacks=1,
                 dilations=None,
                 activation='norm_relu',
                 padding='causal',
                 use_skip_connections=True,
                 dropout_rate=0.0,
                 return_sequences=True,
                 ):
        super().__init__()
        self.return_sequences = return_sequences
        self.dropout_rate = dropout_rate
        self.use_skip_connections = use_skip_connections
        self.activation = activation
        self.dilations = dilations
        self.nb_stacks = nb_stacks
        self.kernel_size = kernel_size
        self.nb_filters = nb_filters
        self.padding = padding

        if padding != 'causal' and padding != 'same':
            raise ValueError("Only 'causal' or 'same' paddings are compatible for this layer.")

        # Backwards incompatibility guard:
        #   old: o = tcn.TCN(i, return_sequences=False)
        #   new: o = tcn.TCN(return_sequences=False)(i)
        # With the old form a tensor lands in nb_filters. The explanation is
        # carried by the exception itself instead of bare prints followed by
        # an empty ``Exception()``.
        if not isinstance(nb_filters, int):
            raise TypeError(
                'An interface change occurred after the version 2.1.2.\n'
                'Before: tcn.TCN(i, return_sequences=False, ...)\n'
                'Now should be: tcn.TCN(return_sequences=False, ...)(i)\n'
                'Second solution is to pip install keras-tcn==2.1.2 to downgrade.'
            )

    def __call__(self, inputs):
        """Apply the TCN to ``inputs`` and return the resulting tensor.

        Raises:
            ValueError: If skip connections are requested but no residual
                block was built (``nb_stacks`` of 0 or empty ``dilations``).
        """
        if self.dilations is None:
            self.dilations = [1, 2, 4, 8, 16, 32]

        # 1x1 convolution brings the input to nb_filters channels so the
        # residual additions have matching shapes.
        x = Convolution1D(self.nb_filters, 1, padding=self.padding)(inputs)

        skip_connections = []
        for s in range(self.nb_stacks):
            for i in self.dilations:
                x, skip_out = residual_block(x, s, i, self.activation, self.nb_filters,
                                             self.kernel_size, self.padding, self.dropout_rate)
                skip_connections.append(skip_out)

        if self.use_skip_connections:
            if not skip_connections:
                raise ValueError(
                    'use_skip_connections=True requires at least one residual block '
                    '(nb_stacks >= 1 and a non-empty dilations list).'
                )
            if len(skip_connections) > 1:
                x = L.add(skip_connections)
            else:
                # Merge layers may reject a single-element list; the sum of
                # one skip output is that output itself.
                x = skip_connections[0]
        x = Activation('relu')(x)

        if not self.return_sequences:
            # Keep only the last timestep.
            x = Lambda(lambda tt: tt[:, -1, :])(x)
        return x


In [None]:
# custom loss_fnc
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 1, square-rooted, and the result
    is averaged over the following axis.
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
    return tf.reduce_mean(per_column_rmse, axis=1)

def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU that outputs the full sequence.

    Args:
        hidden_dim: Number of units of the wrapped GRU.
        dropout: Input dropout rate of the GRU.
    """
    recurrent = L.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM that outputs the full sequence.

    Args:
        hidden_dim: Number of units of the wrapped LSTM.
        dropout: Input dropout rate of the LSTM.
    """
    recurrent = L.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)


def build_model(seq_len=107, pred_len=68, embed_dim=100, units=128, dropout=0.4):
    """Build and compile the embedding -> 2x bidirectional TCN -> BiLSTM model.

    Args:
        seq_len: Length of the input sequences.
        pred_len: Number of leading positions for which outputs are produced.
        embed_dim: Embedding size per input token.
        units: LSTM units; each TCN uses ``2 * units`` filters.
        dropout: Dropout rate of the LSTM.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, ``MCRMSE`` loss) with five
        linear outputs per predicted position.
    """
    tcn_dilations = (1, 2, 4, 8, 16, 32)

    def bidirectional_tcn(sequence):
        # One TCN reads the sequence as is, a second reads it reversed along
        # the time axis; its output is flipped back before the two are summed.
        forward = TCN(2 * units, return_sequences=True,
                      dilations=list(tcn_dilations))(sequence)
        flipped = K.reverse(sequence, axes=1)
        backward = TCN(2 * units, return_sequences=True,
                       dilations=list(tcn_dilations))(flipped)
        return add([forward, K.reverse(backward, axes=1)])

    inputs = Input(shape=(seq_len, 3))

    # Embed each of the 3 input channels, then merge the channel and embedding
    # axes into a single feature axis.
    embed = Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
    merged_features = embed.shape[2] * embed.shape[3]
    reshaped = tf.reshape(embed, shape=(-1, embed.shape[1], merged_features))
    x = SpatialDropout1D(.2)(reshaped)

    x = bidirectional_tcn(x)
    hidden = bidirectional_tcn(x)

    hidden = lstm_layer(units, dropout)(hidden)
    # Only the first pred_len positions are scored.
    truncated = hidden[:, :pred_len]

    out = Dense(5, activation='linear')(truncated)
    model = Model(inputs=inputs, outputs=out)
    model.compile(optimizer=tf.optimizers.Adam(), loss=MCRMSE)
    return model

In [None]:
# Build a model with the default sizes and draw its layer graph including tensor shapes.
plot_model(build_model(),show_shapes=True)

In [None]:
def train_and_predict(n_folds=5, model_name="model", epochs=90, debug=True):
    """Train one model per stratified fold and average their test predictions.

    Relies on module-level names: ``train``, ``train_inputs_all``,
    ``train_labels_all``, ``target_cols``, ``sample_sub``, ``public_df`` /
    ``public_inputs``, ``private_df`` / ``private_inputs`` and ``build_model``.

    Args:
        n_folds: Number of stratified folds (split on ``train['SN_filter']``).
        model_name: Used for logging and as the checkpoint file stem
            (``f'{model_name}.h5'``, overwritten by every fold).
        epochs: Maximum number of epochs per fold (early stopping may end sooner).
        debug: If True, print per-epoch logs and intermediate results and plot
            the loss curves.

    Returns:
        DataFrame with ``id_seqpos`` followed by the fold-averaged predictions
        for ``target_cols``, in the row order of ``sample_sub``.
    """
    print("Model:", model_name)

    # Float accumulator for the fold-averaged test predictions.
    ensemble_preds = pd.DataFrame(0.0, index=sample_sub.index, columns=target_cols)
    skf = StratifiedKFold(n_folds, shuffle=True, random_state=42)
    val_losses = []
    historys = []
    weights_path = f'{model_name}.h5'

    for fold, (train_index, val_index) in enumerate(skf.split(train_inputs_all, train['SN_filter'])):
        print("Fold:", str(fold + 1))

        # Same architecture three times: one to train, two to predict every
        # position of the 107- and 130-long test sequences with the trained weights.
        model_train = build_model()
        model_short = build_model(seq_len=107, pred_len=107)
        model_long = build_model(seq_len=130, pred_len=130)

        train_inputs, train_labels = train_inputs_all[train_index], train_labels_all[train_index]
        val_inputs, val_labels = train_inputs_all[val_index], train_labels_all[val_index]

        # save_best_only=True: without it the file is overwritten every epoch,
        # so the weights reloaded below would be those of the last epoch
        # (up to `patience` epochs past the best one) while the reported
        # validation loss is the minimum.
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            weights_path, monitor='val_loss', save_best_only=True)
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=2)

        history = model_train.fit(
            train_inputs, train_labels,
            validation_data=(val_inputs, val_labels),
            batch_size=64,
            epochs=epochs,
            callbacks=[tf.keras.callbacks.ReduceLROnPlateau(), checkpoint, early_stopping],
            verbose=2 if debug else 0
        )

        best_val_loss = min(history.history['val_loss'])
        print(f"{model_name} Min training loss={min(history.history['loss'])}, min validation loss={best_val_loss}")

        val_losses.append(best_val_loss)
        historys.append(history)

        model_short.load_weights(weights_path)
        model_long.load_weights(weights_path)

        public_preds = model_short.predict(public_inputs)
        private_preds = model_long.predict(private_inputs)

        # One frame per test sequence, one row per position, keyed "<id>_<position>".
        preds_model = []
        for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
            for uid, single_pred in zip(df.id, preds):
                single_df = pd.DataFrame(single_pred, columns=target_cols)
                single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

                preds_model.append(single_df)

        preds_model_df = pd.concat(preds_model)
        ensemble_preds[target_cols] += preds_model_df[target_cols].values / n_folds

        if debug:
            print("Intermediate ensemble result")
            print(ensemble_preds[target_cols].head())

    # Row order is identical in every fold, so the ids of the last fold apply.
    ensemble_preds["id_seqpos"] = preds_model_df["id_seqpos"].values
    ensemble_preds = pd.merge(sample_sub["id_seqpos"], ensemble_preds, on="id_seqpos", how="left")

    print("Mean Validation loss:", str(np.mean(val_losses)))

    if debug:
        fig, ax = plt.subplots(1, 1, figsize = (10, 5))
        for fold, history in enumerate(historys):
            ax.plot(history.history['loss'])
            ax.plot(history.history['val_loss'])
            ax.set_title('model_'+str(fold+1))
            ax.set_ylabel('Loss')
            ax.set_xlabel('Epoch')
        plt.show()

    return ensemble_preds


In [None]:
# Split the test set by sequence length (107 vs 130) and encode each part.
public_df = test.loc[test["seq_length"] == 107].copy()
private_df = test.loc[test["seq_length"] == 130].copy()
public_inputs = preprocess_inputs(public_df)
private_inputs = preprocess_inputs(private_df)
ensembles = []
model_name = "model"

# Train on 5 folds and show the averaged predictions.
ensemble_final = train_and_predict(n_folds=5, model_name=model_name, epochs=100)
print(ensemble_final)

In [None]:
# Write the averaged predictions to disk without the DataFrame index column.
ensemble_final.to_csv('ensemble_final.csv', index=False)