**This is based on [this notebook](https://www.kaggle.com/xhlulu/openvaccine-simple-gru-model) by [xhlulu](https://www.kaggle.com/xhlulu), with a few functions added. Here I have combined two existing features, sequence and predicted loop type, into one new feature.**

**Example: if sequence = 'xyz...' and predicted_loop_type = 'abc...', then the new feature is 'xaybzc...', where each pair xa, yb, zc is tokenized as a single token. This could have a positive impact on training if the merged sequence correlates better with reactivity and the deg_* variables. In the coming days I will experiment with more merged features.**

In [None]:
import json

import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Set seed to ensure reproducibility

In [None]:
# Seed both the NumPy and the TensorFlow random generators with one shared value.
SEED = 19
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Helper functions and useful variables

In [None]:
# Target columns the model predicts; the same order is used for the training
# labels and for the columns of the prediction frames.
pred_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_Mg_50C',
    'deg_pH10',
    'deg_50C',
]

In [None]:
# Random stand-in tensors (presumably for trying the loss function by hand);
# no other cell in the visible notebook reads these module-level names.
dummy_shape = (32, 68, 3)
y_true = tf.random.normal(dummy_shape)
y_pred = tf.random.normal(dummy_shape)

In [None]:
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 1, square-rooted to give one RMSE
    per remaining column, and those RMSE values are then averaged, leaving one
    loss value per entry of the leading axis.

    NOTE(review): assumes inputs are (batch, positions, targets) -- confirm.
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_mse = tf.reduce_mean(squared_error, axis=1)
    per_column_rmse = tf.sqrt(per_column_mse)
    return tf.reduce_mean(per_column_rmse, axis=1)

In [None]:
def gru_layer(hidden_dim, dropout):
    """Build a bidirectional GRU layer that returns its full output sequence.

    Args:
        hidden_dim: number of units of the wrapped GRU.
        dropout: dropout rate passed to the wrapped GRU.
    """
    recurrent = L.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

In [None]:
def pandas_list_to_array(df):
    """Stack a frame of equal-length lists into a 3-D array.

    Args:
        df: dataframe of shape (x, y) whose cells are lists of length l.

    Returns:
        np.ndarray of shape (x, l, y): the list axis is moved in front of the
        column axis.
    """
    # Nested lists are laid out as (rows, columns, list positions).
    stacked = np.array(df.values.tolist())
    return stacked.transpose((0, 2, 1))

In [None]:
def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type','seq_loop']):
    """Integer-encode the selected string columns and stack them into an array.

    Args:
        df: dataframe holding one string per cell in each of ``cols``.
        token2int: mapping from a single character to its integer id.
        cols: columns to encode, in output-feature order. The default list is
            only read, never mutated.

    Returns:
        The result of ``pandas_list_to_array`` on the encoded frame, i.e. an
        array of shape (rows, string length, len(cols)).

    Raises:
        KeyError: if a character is missing from ``token2int``.
    """
    def encode(seq):
        return [token2int[token] for token in seq]

    # Per-column Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; this form behaves the same on older and newer pandas.
    encoded = df[cols].apply(lambda column: column.map(encode))
    return pandas_list_to_array(encoded)

In [None]:
def addExtraCol(df):
    """Add a ``seq_loop`` column that merges sequence and loop type per position.

    For every row, the character at position ``j`` of ``sequence`` and the
    character at position ``j`` of ``predicted_loop_type`` are concatenated and
    looked up in the module-level ``merged_seq`` mapping; the looked-up
    characters are joined into the new string.

    The frame is modified in place and also returned.

    Raises:
        KeyError: if a base/loop-type pair is missing from ``merged_seq``.
        IndexError: if a loop-type string is shorter than its sequence.
    """
    df['seq_loop'] = [
        # str.join builds each merged string in one pass instead of repeated `+`.
        ''.join(merged_seq[base + loops[pos]] for pos, base in enumerate(bases))
        for bases, loops in zip(df['sequence'], df['predicted_loop_type'])
    ]
    return df

## Load and preprocess data

In [None]:
# Location of the competition files.
data_dir = '/kaggle/input/stanford-covid-vaccine/'

# train.json and test.json are line-delimited JSON (one record per line).
train, test = (
    pd.read_json(data_dir + file_name, lines=True)
    for file_name in ('train.json', 'test.json')
)
sample_df = pd.read_csv(data_dir + 'sample_submission.csv')


[](http://)

**Sequence has 4 unique characters and predicted loop type contains 7 unique characters, giving 4 × 7 = 28 possible pairs. We can use the characters abcde...xyz12 to represent these 28 pairs.**

In [None]:
# Alphabets of the two per-position annotations.
Sequence = ['A', 'G', 'U', 'C']
Predicted_loop_types = ['S', 'M', 'I', 'B', 'H', 'E', 'X']
# One single-character code per (base, loop type) pair: 4 * 7 = 28 codes.
chars = 'abcdefghijklmnopqrstuvwxyz12'

# Pairs are enumerated base-major, so 'AS' -> 'a', 'AM' -> 'b', ..., 'CX' -> '2'.
pair_keys = [base + loop for base in Sequence for loop in Predicted_loop_types]
merged_seq = dict(zip(pair_keys, chars))

print(merged_seq)

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# (rows, columns) of the training frame before any filtering.
train.shape

In [None]:
# Keep only samples whose signal_to_noise value is at least 1.
# .copy() detaches the result from the unfiltered frame, so adding a column to
# it later does not trigger pandas' SettingWithCopyWarning.
train = train.query("signal_to_noise >= 1").copy()
train.shape

**Adding a new column, seq_loop, which is the position-wise pairing of sequence and predicted loop type, encoded as a string.**

In [None]:
# Attach the merged sequence/loop-type column, then preview the result.
train=addExtraCol(train)
train.head()

In [None]:
# Map every input character to an integer id so it can be fed to the Keras
# embedding layer. The base vocabulary covers the structure symbols, the
# nucleotides and the loop-type letters.
base_tokens = '().ACGUBEHIMSX'
token2int = {token: idx for idx, token in enumerate(base_tokens)}
print(len(token2int))

# The merged sequence/loop codes get the next free ids. The offset is derived
# from the base vocabulary size instead of a hard-coded 14, so the two cannot
# drift apart if the base vocabulary changes.
first_merged_id = len(token2int)
for offset, merged_token in enumerate(merged_seq.values()):
    token2int[merged_token] = first_merged_id + offset

print(token2int)

# Encode the input columns and stack the per-position target lists.
train_inputs = preprocess_inputs(train, token2int)
train_labels = pandas_list_to_array(train[pred_cols])

In [None]:
# Shape of the encoded inputs: (samples, sequence length, number of input columns).
train_inputs.shape

In [None]:
# Hold out 10% of the samples for validation, stratified on the SN_filter column.
x_train, x_val, y_train, y_val = train_test_split(
    train_inputs,
    train_labels,
    test_size=0.1,
    stratify=train['SN_filter'],
    random_state=34,
)

Public and private sets have different sequence lengths, so we will preprocess them separately and load models of different tensor shapes.

In [None]:
# The test frame needs the same merged sequence/loop-type column as train.
test = addExtraCol(test)

# Split the test set by sequence length; each part gets its own input array.
is_public = test['seq_length'] == 107
is_private = test['seq_length'] == 130
public_df = test[is_public]
private_df = test[is_private]

public_inputs = preprocess_inputs(public_df, token2int)
private_inputs = preprocess_inputs(private_df, token2int)

## Build and train model

We will train a bi-directional GRU model. It has three layers and uses dropout. To learn more about RNNs, LSTM and GRU, please see [this blog post](https://colah.github.io/posts/2015-08-Understanding-LSTMs/).

In [None]:
def build_model(embed_size,
                seq_len=107,
                pred_len=68,
                dropout=0.4,
                sp_dropout=0.2,
                embed_dim=200,
                hidden_dim=256,
                n_layers=3):
    """Build and compile the stacked bidirectional GRU model.

    Args:
        embed_size: vocabulary size (``input_dim`` of the embedding layer).
        seq_len: length of the input sequences.
        pred_len: number of leading positions for which predictions are output.
        dropout: dropout rate inside each GRU layer.
        sp_dropout: rate of the spatial dropout applied after the embedding.
        embed_dim: embedding size per input feature.
        hidden_dim: number of units of each GRU direction.
        n_layers: number of stacked bidirectional GRU layers.

    Returns:
        A compiled ``tf.keras.Model`` mapping ``(batch, seq_len, 4)`` integer
        inputs to ``(batch, pred_len, 5)`` outputs, trained with ``MCRMSE``.
    """
    # Four integer-encoded features per position (the default columns of
    # preprocess_inputs).
    n_features = 4

    inputs = L.Input(shape=(seq_len, n_features))
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)

    # Concatenate the per-feature embeddings of each position:
    # (batch, seq_len, n_features, embed_dim) -> (batch, seq_len, n_features * embed_dim).
    # A Keras Reshape layer is used instead of a raw tf.reshape so the graph
    # consists of Keras layers only; it has no weights, so saved weights still load.
    reshaped = L.Reshape((seq_len, n_features * embed_dim))(embed)
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)

    for _ in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # Since we are only making predictions on the first part of each sequence,
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='linear')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    optimizer = tf.optimizers.Adam(
        learning_rate=0.0025,
        beta_1=0.8,
        beta_2=0.999,
        epsilon=1e-07,
    )
    model.compile(optimizer, loss=MCRMSE)

    return model

In [None]:
# Reset Keras' global state, then build the training model with the default
# lengths (107-long inputs, predictions for the first 68 positions).
tf.keras.backend.clear_session()
model = build_model(embed_size=len(token2int))
model.summary()

In [None]:
# ReduceLROnPlateau is given a patience of 5 epochs; ModelCheckpoint writes
# 'model.h5', the file the inference models load their weights from.
training_callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(patience=5),
    tf.keras.callbacks.ModelCheckpoint('model.h5'),
]

# NOTE: per-sample weighting (sample_weight=x_train_sn) is disabled here;
# x_train_sn is not defined in the visible notebook.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=70,
    verbose=2,
    callbacks=training_callbacks,
)

In [None]:
# Plot training and validation loss (MCRMSE) per epoch from the fit history.
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

## Evaluate training history

Let's use Plotly to quickly visualize the training and validation loss throughout the epochs.

## Load models and make predictions

Public and private sets have different sequence lengths, so we will preprocess them separately and load models of different tensor shapes. This is possible because RNN models can accept sequences of varying lengths as inputs.

In [None]:
# Caveat: the submission needs one prediction per input position, whereas the
# training model only outputs the first 68 positions. So we rebuild the model
# once per test sequence length with pred_len == seq_len.
model_public = build_model(embed_size=len(token2int), seq_len=107, pred_len=107)
model_private = build_model(embed_size=len(token2int), seq_len=130, pred_len=130)

# Both models take their weights from the checkpoint written during training.
for inference_model in (model_public, model_private):
    inference_model.load_weights('model.h5')

In [None]:
# Run each length-specific model on the matching test subset.
public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

## Post-processing and submit

For each sample, we take the predicted tensors of shape (107, 5) or (130, 5), and convert them to the long format (i.e. $(629 \times 107,\ 5)$ or $(3005 \times 130,\ 5)$):

In [None]:
def _to_long_format(uid, single_pred):
    """Turn one sample's (positions, targets) predictions into a labelled frame."""
    frame = pd.DataFrame(single_pred, columns=pred_cols)
    # One row per sequence position, keyed as "<sample id>_<position>".
    frame['id_seqpos'] = [f'{uid}_{pos}' for pos in range(len(frame))]
    return frame


# Build one frame per sample, public set first, then private.
preds_ls = [
    _to_long_format(uid, preds[row])
    for df, preds in ((public_df, public_preds), (private_df, private_preds))
    for row, uid in enumerate(df.id)
]

preds_df = pd.concat(preds_ls)
preds_df.head()

In [None]:
# Join the predictions onto the id list of the sample submission and write
# the result without the index column.
submission_ids = sample_df[['id_seqpos']]
submission = submission_ids.merge(preds_df, on=['id_seqpos'])
submission.to_csv('submission.csv', index=False)