In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import tensorflow.keras.layers as L
import tensorflow as tf
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from tensorboard.plugins.hparams import api as hp
%load_ext tensorboard
# clear previous logs
# !rm -rf ./logs/

In [None]:
# Read the competition files from the Kaggle input directory.
# `train.json` / `test.json` are line-delimited JSON, hence lines=True.
data_dir = '/kaggle/input/stanford-covid-vaccine/'
data = pd.read_json(f'{data_dir}train.json', lines=True)
test = pd.read_json(f'{data_dir}test.json', lines=True)
sample_sub = pd.read_csv(f'{data_dir}sample_submission.csv')

In [None]:
# Report the dataset dimensions and confirm that no cell is null.
print(data.shape)
# Use logical `not` rather than bitwise `~`: `~` only behaves as negation on
# NumPy booleans; on a plain Python bool it yields -1 / -2, both truthy.
if not data.isnull().values.any():
    print("No missing values.")
data.head()

In [None]:
# Signal-to-noise and SN filter columns are useful for incorporating the
# quality of the samples, so look at how both are distributed.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# `fill=True` replaces the deprecated `shade=True` keyword.
sns.kdeplot(data['signal_to_noise'], fill=True, ax=ax[0])
# Pass the vector as `x=` explicitly; the first positional slot of
# countplot is `data` in current seaborn releases.
sns.countplot(x=data['SN_filter'], ax=ax[1])

ax[0].set_title('Signal/Noise Distribution')
ax[1].set_title('Signal/Noise Filter Distribution');

Pre-Processing

In [None]:
# Seed both NumPy and TensorFlow global generators for reproducibility.
np.random.seed(2020)
tf.random.set_seed(2020)

In [None]:
# Target columns the model predicts (one output channel per entry).
pred_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_Mg_50C',
    'deg_pH10',
    'deg_50C',
]

In [None]:
# Turn a dataframe of per-cell sequences into a single numpy array.
def df_to_array(df):
    """Stack the cells of `df` into an array and swap its last two axes.

    The frame's values are converted to nested lists and then to an array;
    axes 1 and 2 of that array are exchanged before returning.
    Assumes every cell holds an equal-length sequence so the stacked array
    is 3-D, i.e. (rows, columns, positions) -> (rows, positions, columns).
    """
    nested = df.values.tolist()
    stacked = np.array(nested)
    return np.transpose(stacked, (0, 2, 1))

In [None]:
# Map every character of the selected string columns to its integer token.
def preprocess(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Encode the `cols` columns of `df` with `token2int` and return an array.

    Each cell is iterated element by element and every element is looked up
    in `token2int`; the encoded frame is then handed to `df_to_array`.
    """
    def encode(seq):
        return [token2int[symbol] for symbol in seq]

    encoded = df[cols].applymap(encode)
    return df_to_array(encoded)

In [None]:
# Vocabulary lookup: each character used in the sequence, structure and
# loop-type strings gets its position in this string as its integer id.
token2int = dict(zip('().ACGUBEHIMSX', itertools.count()))

In [None]:
# Hold out 20% of the labelled data as a test set.
# NOTE: this rebinds `test`, replacing whatever the name held before.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=2020,
)

In [None]:
# Keep only training rows whose signal_to_noise is at least 1, then build
# the encoded model inputs and the target array from those rows.
train = train.query("signal_to_noise >= 1")
train_input = preprocess(df=train, token2int=token2int)
train_pred = df_to_array(df=train[pred_cols])

In [None]:
# Carve a 10% validation set out of the training arrays, stratified on the
# SN_filter column of the filtered training frame.
x_train, x_val, y_train, y_val = train_test_split(
    train_input,
    train_pred,
    test_size=0.1,
    random_state=34,
    stratify=train['SN_filter'],
)

In [None]:
# Encode the held-out test frame the same way as the training inputs.
test_input = preprocess(df=test, token2int=token2int)

In [None]:
# Evaluation metric / training loss.
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 1, square-rooted, and the
    resulting values are averaged over the remaining axis 1, so one value
    per leading-axis entry is returned.
    Assumes inputs are laid out as (sample, position, target) - TODO confirm.
    """
    squared_error = tf.square(y_true - y_pred)
    colwise_mse = tf.reduce_mean(squared_error, axis=1)
    colwise_rmse = tf.sqrt(colwise_mse)
    return tf.reduce_mean(colwise_rmse, axis=1)

In [None]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that emits the full output sequence.

    `hidden_dim` is the number of GRU units per direction and `dropout` is
    forwarded to the GRU's `dropout` argument.
    """
    recurrent = L.GRU(
        units=hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

In [None]:
def build_model(embed_size, seq_len=107, pred_len=68, dropout=0.5, 
                sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3):
    """Build and compile the embedding + stacked bidirectional-GRU model.

    Parameters
    ----------
    embed_size : int
        Vocabulary size passed to the embedding layer as `input_dim`.
    seq_len : int
        Length of each input sequence; the input shape is (seq_len, 3).
    pred_len : int
        Number of leading positions kept for prediction.
    dropout : float
        Dropout rate forwarded to every GRU layer.
    sp_dropout : float
        Rate of the SpatialDropout1D applied to the embedded input.
    embed_dim : int
        Embedding width per input channel.
    hidden_dim : int
        GRU units per direction.
    n_layers : int
        Number of stacked bidirectional GRU layers.

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam and the MCRMSE loss; its output has 5
        values for each of the first `pred_len` positions.
    """
    n_channels = 3  # integer-encoded input channels per position

    inputs = L.Input(shape=(seq_len, n_channels))
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)
    # Merge the per-channel embeddings into one feature vector per position.
    # A Reshape layer is used instead of a raw tf.reshape call so the graph
    # consists of Keras layers only.
    reshaped = L.Reshape((seq_len, n_channels * embed_dim))(embed)
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)

    for _ in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)

    # Since we are only making predictions on the first part of each sequence,
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation="linear")(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE)

    return model

In [None]:
# Instantiate the baseline GRU model and print its layer summary.
model = build_model(
    embed_size=len(token2int),
    n_layers=5,
    dropout=0.2,
    sp_dropout=0.5,
)
model.summary()

In [None]:
# Fit the baseline model; the learning rate is reduced on plateaus and the
# model is checkpointed to 'model.h5' during training.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=75,
    verbose=2,
    validation_data=(x_val, y_val),
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5'),
    ],
)

In [None]:
# Line chart of training vs. validation loss per epoch.
fig = px.line(
    data_frame=history.history,
    y=['loss', 'val_loss'],
    title='Training History',
    labels={'index': 'epoch', 'value': 'MCRMSE'},
)
fig.show()

In [None]:
# Run the baseline model on the encoded held-out inputs.
test_pred = model.predict(x=test_input)

In [None]:
# Score the baseline predictions: per-sample MCRMSE averaged over the set.
test_true = df_to_array(df=test[pred_cols])
print(tf.reduce_mean(MCRMSE(y_true=test_true, y_pred=test_pred)))

In [None]:
# model building + evaluation for hyperparameter tuning
def train_test_model(hparams, embed_size, test_input=test_input, test_target=test[pred_cols], seq_len=107,
                     pred_len=68, dropout=0.5, sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3, 
                     x_train=x_train, y_train=y_train, x_val=x_val, y_val=y_val):
    """Train one model for a hyperparameter trial and return its test MCRMSE.

    Parameters
    ----------
    hparams : mapping
        Trial values keyed by HP_DROPOUT, HP_SP_DROPOUT, HP_NUM_LAYERS and
        HP_OPTIMIZER (looked up at call time).
    embed_size : int
        Vocabulary size for the embedding layer.
    test_input, test_target :
        Encoded inputs and the target frame used for the final evaluation.
    seq_len, pred_len, embed_dim, hidden_dim :
        Forwarded to `build_model`.
    dropout, sp_dropout, n_layers :
        Kept for signature compatibility only; the corresponding values are
        always taken from `hparams`.
    x_train, y_train, x_val, y_val :
        Training and validation arrays.

    Returns
    -------
    Scalar tensor holding the mean MCRMSE over the evaluation set.
    """
    # Reuse the shared architecture instead of duplicating it here, filling
    # in the tunable values from the trial.
    model = build_model(
        embed_size,
        seq_len=seq_len,
        pred_len=pred_len,
        dropout=hparams[HP_DROPOUT],
        sp_dropout=hparams[HP_SP_DROPOUT],
        embed_dim=embed_dim,
        hidden_dim=hidden_dim,
        n_layers=hparams[HP_NUM_LAYERS],
    )
    # build_model compiles with Adam; re-compile with the trial's optimizer.
    model.compile(hparams[HP_OPTIMIZER], loss=MCRMSE)

    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=25)
    test_pred = model.predict(test_input)
    # Evaluate against the `test_target` argument rather than the global
    # `test` frame, so a caller-supplied target is actually honoured.
    test_true = df_to_array(test_target)
    mcrmse = tf.reduce_mean(MCRMSE(test_true, test_pred))
    return mcrmse

In [None]:
# Execute one tuning trial and log it for TensorBoard.
def run(run_dir, hparams):
    """Train/evaluate with `hparams` and write the result under `run_dir`.

    Inside a summary writer for `run_dir`, the trial's hyperparameters are
    recorded and the value returned by `train_test_model` is logged as the
    scalar "MCRMSE" at step 1.
    """
    writer = tf.summary.create_file_writer(run_dir)
    with writer.as_default():
        hp.hparams(hparams)  # record the values used in this trial
        score = train_test_model(hparams, embed_size=len(token2int))
        tf.summary.scalar("MCRMSE", score, step=1)

In [None]:
# Hyperparameter domains for the grid search: dropout rates as real
# intervals, optimizer and layer count as discrete choices.
HP_DROPOUT = hp.HParam(
    'dropout', hp.RealInterval(min_value=0.1, max_value=0.5))
HP_SP_DROPOUT = hp.HParam(
    'sp_dropout', hp.RealInterval(min_value=0.1, max_value=0.5))
HP_OPTIMIZER = hp.HParam(
    'optimizer', hp.Discrete(['adam', 'sgd', "rmsprop"]))
HP_NUM_LAYERS = hp.HParam(
    'n_layers', hp.Discrete([2, 5]))

In [None]:
# grid search
# session_num = 0
# for dropout_rate in (HP_DROPOUT.domain.min_value, HP_DROPOUT.domain.max_value):
#     for optimizer in HP_OPTIMIZER.domain.values:
#         for sp_dropout_rate in (HP_SP_DROPOUT.domain.min_value, HP_SP_DROPOUT.domain.max_value):
#             for num_layer in HP_NUM_LAYERS.domain.values:
#                 hparams = {
#                     HP_DROPOUT: dropout_rate,
#                     HP_OPTIMIZER: optimizer,
#                     HP_SP_DROPOUT: sp_dropout_rate,
#                     HP_NUM_LAYERS: num_layer,
#                 }
#                 run_name = "grid_run-%d" % session_num
#                 print('--- Starting trial: %s' % run_name)
#                 print({h.name: hparams[h] for h in hparams})
#                 run('logs/hparam_tuning/' + run_name, hparams)
#                 session_num += 1

In [None]:
# Hyperparameter domains for the random search. These rebind the HP_*
# names with discrete value lists for every parameter.
HP_DROPOUT = hp.HParam(
    'dropout', hp.Discrete([0.1, 0.2, 0.3, 0.4, 0.5]))
HP_SP_DROPOUT = hp.HParam(
    'sp_dropout', hp.Discrete([0.1, 0.2, 0.3, 0.4, 0.5]))
HP_OPTIMIZER = hp.HParam(
    'optimizer', hp.Discrete(['adam', 'sgd', "rmsprop"]))
HP_NUM_LAYERS = hp.HParam(
    'n_layers', hp.Discrete([2, 3, 4, 5]))

In [None]:
# random search hyperparameter space 10 times
# for i in range(10):
#     hparams = {
#         HP_DROPOUT: HP_DROPOUT.domain.sample_uniform(),
#         HP_OPTIMIZER: HP_OPTIMIZER.domain.sample_uniform(),
#         HP_SP_DROPOUT: HP_SP_DROPOUT.domain.sample_uniform(),
#         HP_NUM_LAYERS: HP_NUM_LAYERS.domain.sample_uniform(),
#     }
#     run_name = "random_run-%d" % i
#     print('--- Starting trial: %s' % run_name)
#     print({h.name: hparams[h] for h in hparams})
#     run('logs/hparam_tuning/' + run_name, hparams)

In [None]:
# view tuning logs
# %tensorboard --logdir logs/hparam_tuning

In [None]:
# Instantiate the GRU model with the selected hyperparameters and print
# its layer summary.
final_model = build_model(
    embed_size=len(token2int),
    n_layers=4,
    dropout=0.2,
    sp_dropout=0.1,
)
final_model.summary()

In [None]:
# Fit the final model with the same schedule as the baseline run.
# NOTE: the checkpoint is written to 'model.h5', the same file name the
# baseline training cell uses.
final_history = final_model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=75,
    verbose=2,
    validation_data=(x_val, y_val),
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5'),
    ],
)

In [None]:
# Line chart of training vs. validation loss for the final model.
fig = px.line(
    data_frame=final_history.history,
    y=['loss', 'val_loss'],
    title='Training History',
    labels={'index': 'epoch', 'value': 'MCRMSE'},
)
fig.show()

In [None]:
# Run the final model on the encoded held-out inputs.
test_pred = final_model.predict(x=test_input)

In [None]:
# Score the final predictions: per-sample MCRMSE averaged over the set.
test_true = df_to_array(df=test[pred_cols])
print(tf.reduce_mean(MCRMSE(y_true=test_true, y_pred=test_pred)))