In [None]:
import pandas as pd
import numpy as np
import json
import tensorflow.keras.layers as L

Competition Overview
In this new competition we are helping to fight the worldwide COVID-19 pandemic. mRNA vaccines are the fastest vaccine candidates to treat COVID-19, but they currently face several limitations. In particular, it is a challenge to design stable messenger RNA molecules. Typical vaccines are packaged in syringes and shipped under refrigeration around the world, but that is not (currently) possible for mRNA vaccines.

Researchers have noticed that RNA molecules tend to spontaneously degrade, which is highly problematic because a single cut can render an mRNA vaccine useless. Not much is known about which part of the backbone of a particular RNA is most susceptible to being damaged.

Without this knowledge, the current mRNA vaccines are shipped under intense refrigeration and are unlikely to reach enough people unless they can be stabilized. This is our task as Kagglers: we must create a model to predict the most likely degradation rates at each base of an RNA molecule.

We are given a subset of an Eterna dataset comprising over 3000 RNA molecules and their degradation rates at each position. Our models are then tested on the new generation of RNA sequences that were just created by Eterna players for COVID-19 mRNA vaccines.

Before we get started, please check out the original notebook here, as this one is based on it: I just added comments, made minor code changes, and added an LSTM and fold training:

In [None]:
# Load the competition files: train/test are line-delimited JSON (one record
# per line, hence lines=True); the sample submission is a plain CSV.
train = pd.read_json("../input/stanford-covid-vaccine/train.json", lines=True)
test = pd.read_json("../input/stanford-covid-vaccine/test.json", lines=True)
sample_df = pd.read_csv("../input/stanford-covid-vaccine/sample_submission.csv")

# Basic EDA

In [None]:
train.head()

In [None]:
!pip install datasist

## Merge Train and Test for Feature Engineering

In [None]:
import datasist as ds
ds.structdata.check_train_test_set(train, test, index=None, col=None)

In [None]:
# ds.structdata.describe(train)

In [None]:
train.info()

In [None]:
train.describe()

## Missingno - library for visualizing missing values (useful during EDA and before feature engineering)

In [None]:
import missingno as msno
msno.matrix(train)

In [None]:
msno.bar(train)

In [None]:
# Use the fully-qualified option key: the bare 'max_columns' pattern matches
# more than one option on recent pandas versions and raises OptionError.
pd.set_option('display.max_columns', 100)
train

In [None]:
len(train['structure'][1])

In [None]:
# Character length of every training `structure` string, in row order.
length = [len(struct) for struct in train['structure']]

In [None]:
length

In [None]:
train["flag"] = "train"
test["flag"] = "test"

In [None]:
# !pip install datasist
import datasist as ds

In [None]:
all_data, ntrain, ntest = ds.structdata.join_train_and_test(train, test)
# #later splitting after transformations
# train_new = all_data[:ntrain]
# test_new = all_data[ntrain:]

In [None]:
all_data

In [None]:
# Total number of "S" characters over all predicted_loop_type values
# (each value is stringified first, exactly as in the per-letter loop form).
listof = []
count = sum(str(data).count("S") for data in all_data['predicted_loop_type'])

## Adding S, M, I, B, H, X columns based off of the number of corresponding letters in predicted_loop_type column

In [None]:
# Add one column per loop-type letter holding how many times that letter
# occurs in the row's predicted_loop_type string.
for loop_letter in ("S", "M", "I", "B", "H", "X"):
    all_data[loop_letter] = all_data['predicted_loop_type'].str.count(loop_letter)

In [None]:
all_data

### Unmerge

In [None]:
# Split all_data back at the ntrain boundary returned by join_train_and_test:
# the first ntrain rows go to train, the remainder to test.
train, test = all_data.iloc[:ntrain], all_data.iloc[ntrain:]

In [None]:
# train['S'] = 
train['predicted_loop_type']

In [None]:
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
train[pred_cols]

In [None]:
y = train[pred_cols]

In [None]:
# Targets for rows with signal_to_noise > 1. The cells are stacked into one
# array and the last two axes swapped so the target columns end up on the
# final axis (assumes every cell holds an equal-length per-position list).
high_snr_targets = train.loc[train.signal_to_noise > 1, pred_cols]
y = np.transpose(np.array(high_snr_targets.values.tolist()), (0, 2, 1))

In [None]:
# Integer id for every character that may appear in the encoded string columns.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Encode string columns of ``df`` as an integer array.

    Every character of every string in ``cols`` is looked up in ``token2int``.
    The encoding uses plain comprehensions instead of ``DataFrame.applymap``
    (deprecated in recent pandas), so it behaves the same on old and new
    pandas versions and emits no deprecation warning.

    Args:
        df: DataFrame containing the columns named in ``cols``. All strings
            must have the same length, otherwise NumPy cannot build a
            rectangular array.
        cols: Names of the string columns to encode, in output order.

    Returns:
        Integer ``np.ndarray`` of shape ``(len(df), string_length, len(cols))``.

    Raises:
        KeyError: If a column is missing from ``df`` or a character is not a
            key of ``token2int``.
    """
    encoded = [
        [[token2int[token] for token in seq] for seq in df[col]]
        for col in cols
    ]
    # `encoded` is indexed (column, row, position); reorder the axes to
    # (row, position, column).
    return np.transpose(np.array(encoded), (1, 2, 0))

In [None]:
X = preprocess_inputs(train[train.signal_to_noise > 1])

In [None]:
X

# Model

In [None]:
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras import layers as L
from tensorflow.keras.callbacks import *
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential

In [None]:
test.shape

In [None]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence.

    Args:
        hidden_dim: Number of units passed to the wrapped ``L.GRU``.
        dropout: Dropout rate passed to the wrapped ``L.GRU``.
    """
    recurrent = L.GRU(hidden_dim, dropout=dropout, return_sequences=True)
    return L.Bidirectional(recurrent)

def build_model(seq_len=107, pred_len=68, dropout=0.5, embed_dim=100, hidden_dim=128):
    """Build and compile the two-layer bidirectional GRU regression model.

    Args:
        seq_len: Number of positions in every input sequence.
        pred_len: Number of leading positions for which outputs are produced.
        dropout: Dropout rate passed to each GRU layer.
        embed_dim: Embedding size used for each input token.
        hidden_dim: Number of units passed to each GRU layer.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer and MSE loss,
        taking inputs of shape ``(batch, seq_len, 3)`` and producing outputs
        of shape ``(batch, pred_len, 5)``.

    Raises:
        ValueError: If ``pred_len`` exceeds ``seq_len``; the truncating slice
            would otherwise silently return only ``seq_len`` positions.
    """
    if pred_len > seq_len:
        raise ValueError(
            f"pred_len ({pred_len}) cannot exceed seq_len ({seq_len})")

    n_features = 3  # one token id per encoded string column at each position
    inputs = L.Input(shape=(seq_len, n_features))

    embed = L.Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
    # Concatenate the per-feature embeddings at each position. A Reshape layer
    # is used instead of calling tf.reshape on the symbolic tensor, which
    # newer Keras versions do not accept inside a functional model.
    reshaped = L.Reshape((seq_len, n_features * embed_dim))(embed)

    hidden = gru_layer(hidden_dim, dropout)(reshaped)
    hidden = gru_layer(hidden_dim, dropout)(hidden)
    
    # Since we are only making predictions on the first part of each sequence, we have
    # to truncate it
    truncated = hidden[:, :pred_len]
    out1 = L.BatchNormalization()(truncated)
    out = L.Dense(5, activation='linear')(out1)

    model = tf.keras.Model(inputs=inputs, outputs=out)

    model.compile(tf.keras.optimizers.Adam(), loss='mse')
    
    return model
model = build_model()

In [None]:
model.summary()

In [None]:
# Callbacks, both with default settings apart from the checkpoint path:
# - ReduceLROnPlateau adjusts the learning rate when the monitored metric stalls.
# - ModelCheckpoint writes 'model.h5'; no save_best_only is passed, so per the
#   Keras defaults the file is rewritten as training proceeds.
training_callbacks = [
    tf.keras.callbacks.ReduceLROnPlateau(),
    tf.keras.callbacks.ModelCheckpoint('model.h5'),
]

# Hold out 25% of the samples for validation and train for 150 epochs.
fit_options = dict(
    batch_size=64,
    epochs=150,
    callbacks=training_callbacks,
    validation_split=0.25,
)
history = model.fit(X, y, **fit_options)

In [None]:
# import pandas as pd
# loss = pd.DataFrame({loss: model.history.history["loss"], acc: model.history.history["val_loss"] })

In [None]:
# Split the test rows by sequence length (107 vs. 130) so that each group can
# be encoded into its own rectangular array.
is_len_107 = test["seq_length"] == 107
is_len_130 = test["seq_length"] == 130
public_df = test[is_len_107].copy()
private_df = test[is_len_130].copy()

public_inputs, private_inputs = (
    preprocess_inputs(part) for part in (public_df, private_df)
)

In [None]:
# Caveat: for inference each model is built with pred_len equal to seq_len, so
# an output is produced for every input position, although that is not the
# case for the training model (built with the default pred_len of 68).
# The trained weights are loaded from the 'model.h5' checkpoint file.

# Sequences of length 107.
model_short = build_model(seq_len=107, pred_len=107)
model_short.load_weights('model.h5')
public_preds = model_short.predict(public_inputs)

# Sequences of length 130.
model_long = build_model(seq_len=130, pred_len=130)
model_long.load_weights('model.h5')
private_preds = model_long.predict(private_inputs)

In [None]:
# Turn each per-sequence prediction array into a small frame with one row per
# position, keyed by "<id>_<position>", then stack all frames together.
preds_ls = []

for frame, frame_preds in ((public_df, public_preds), (private_df, private_preds)):
    for uid, single_pred in zip(frame.id, frame_preds):
        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        n_positions = len(single_df)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(n_positions)]
        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)

In [None]:
# Load the sample submission; its id_seqpos column is what the predictions are
# merged onto in the following cell.
# NOTE(review): the same file name is already read near the top of the notebook
# via the relative '../input/...' path — confirm whether this re-read is needed.
sample_df = pd.read_csv('/kaggle/input/stanford-covid-vaccine/sample_submission.csv')

In [None]:
# Attach the predictions to the id_seqpos keys of the sample submission
# (default inner merge) and write the result without the index column.
submission_keys = sample_df[['id_seqpos']]
submission = pd.merge(submission_keys, preds_df, on='id_seqpos')
submission.to_csv('submission1234.csv', index=False)

In [None]:
import matplotlib.pyplot as plt

# plt.subplots(1, 1) returns a single Axes object rather than an array of
# axes, so it is used directly; indexing it with [0] raises a TypeError.
fig, ax = plt.subplots(1, 1, figsize=(20, 10))

# Training and validation loss recorded per epoch by model.fit.
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])

ax.set_title('GRU')

ax.legend(['train', 'validation'], loc='upper right')

ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')