In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition files: train and test are line-delimited JSON (lines=True),
# the sample submission is a CSV whose id_seqpos column is used later to build the
# final submission.
train = pd.read_json("../input/stanford-covid-vaccine/train.json", lines=True)
test = pd.read_json("../input/stanford-covid-vaccine/test.json", lines=True)
sample_df = pd.read_csv("../input/stanford-covid-vaccine/sample_submission.csv")

In [None]:
# Preview only the first rows; rendering the whole frame bloats the notebook output.
train.head()

In [None]:
!pip install datasist
import datasist as ds

In [None]:
# Stack train and test into a single frame so the features below are computed once
# for both; ntrain is the boundary used later to split the frame back
# (presumably ntrain / ntest are the row counts of each part — confirm against datasist).
all_data, ntrain, ntest = ds.structdata.join_train_and_test(train, test)
# To undo the join:
# train = all_data[:ntrain]
# test  = all_data[ntrain:]

In [None]:
# For every loop-type letter, store how many times it occurs in each row's
# predicted_loop_type string, in a column named after the letter.
for loop_code in ("S", "M", "I", "B", "H", "X"):
    all_data[loop_code] = all_data['predicted_loop_type'].str.count(loop_code)

In [None]:
# Split the feature-augmented frame back into its train and test parts at the
# ntrain boundary (rebinds the original train / test names).
train = all_data[:ntrain]
test  = all_data[ntrain:]

In [None]:
# Names of the five target columns; reused later as the column names of the
# per-sequence prediction frames.
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
# Display the target columns of the training rows.
train[pred_cols]

In [None]:
# Target columns as a DataFrame.
# NOTE(review): `y` is reassigned in the following cell before it is read, so this
# value is not used downstream.
y = train[pred_cols]

In [None]:
# Targets of the rows with signal_to_noise > 1 as a 3-D array. The nested lists give
# (rows, target columns, list length); the transpose swaps the last two axes to
# (rows, list length, target columns). Assumes every cell holds an equal-length list.
y = np.transpose(
    np.array(train.loc[train.signal_to_noise > 1, pred_cols].values.tolist()),
    (0, 2, 1),
)

In [None]:
# Integer id for every character looked up by preprocess_inputs.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}


def preprocess_inputs(df, cols=('sequence', 'structure', 'predicted_loop_type')):
    """Encode the string columns of ``df`` as one integer array.

    Every character of every selected cell is mapped through ``token2int``.

    Args:
        df: DataFrame holding the string columns named in ``cols``.
        cols: Column names to encode, in the order they should appear on the
            last axis. Any iterable of names is accepted (lists still work).

    Returns:
        ``np.ndarray`` of shape ``(rows, string length, len(cols))``. All
        strings are expected to have the same length so the nested lists form
        a regular array.

    Raises:
        KeyError: If a string contains a character missing from ``token2int``.
    """
    # Plain comprehensions instead of DataFrame.applymap, which is deprecated
    # since pandas 2.1; this form behaves the same on every pandas version.
    encoded = [
        [[token2int[ch] for ch in text] for text in row]
        for row in df[list(cols)].values.tolist()
    ]
    # (rows, columns, positions) -> (rows, positions, columns)
    return np.transpose(np.array(encoded), (0, 2, 1))

In [None]:
# Encoded model inputs for the same rows as the targets (signal_to_noise > 1).
X = preprocess_inputs(train.loc[train.signal_to_noise > 1])

In [None]:
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras import layers as L
from tensorflow.keras.callbacks import *
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
import warnings
warnings.filterwarnings('ignore')

#the basics
import pandas as pd, numpy as np
import math, json, gc, random, os, sys
from matplotlib import pyplot as plt
from tqdm import tqdm

#tensorflow deep learning basics
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L

#for model evaluation
from sklearn.model_selection import train_test_split, KFold

In [None]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence.

    Args:
        hidden_dim: Number of GRU units.
        dropout: Dropout rate passed to the GRU.
    """
    recurrent = tf.keras.layers.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

In [None]:
def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that outputs the full sequence.

    Args:
        hidden_dim: Number of LSTM units.
        dropout: Dropout rate passed to the LSTM.
    """
    recurrent = tf.keras.layers.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)


In [None]:
# def build_model(gru=1,seq_len=107, pred_len=68, dropout=0.5,
#                 embed_dim=75, hidden_dim=128):
    
#     inputs = tf.keras.layers.Input(shape=(seq_len, 3))

#     embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
#     reshaped = tf.reshape(
#         embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3]))
    
#     reshaped = tf.keras.layers.SpatialDropout1D(.2)(reshaped)
    
#     if gru==1:
#         hidden = gru_layer(hidden_dim, dropout)(reshaped)
#         hidden = gru_layer(hidden_dim, dropout)(hidden)
#         hidden = gru_layer(hidden_dim, dropout)(hidden)
#         hidden = gru_layer(hidden_dim, dropout)(hidden)
        
#     elif gru==0:
#         hidden = lstm_layer(hidden_dim, dropout)(reshaped)
#         hidden = lstm_layer(hidden_dim, dropout)(hidden)
#         hidden = lstm_layer(hidden_dim, dropout)(hidden)
#         hidden = lstm_layer(hidden_dim, dropout)(hidden)
        
#     elif gru==3:
#         hidden = gru_layer(hidden_dim, dropout)(reshaped)
#         hidden = gru_layer(hidden_dim, dropout)(hidden)
#         hidden = gru_layer(hidden_dim, dropout)(hidden)
#         hidden = lstm_layer(hidden_dim, dropout)(hidden)
        
#     elif gru==4:
#         hidden = lstm_layer(hidden_dim, dropout)(reshaped)
#         hidden = lstm_layer(hidden_dim, dropout)(hidden)
#         hidden = lstm_layer(hidden_dim, dropout)(hidden)
#         hidden = gru_layer(hidden_dim, dropout)(hidden)
    
#     #only making predictions on the first part of each sequence
#     truncated = hidden[:, :pred_len]
    
#     out1 = tf.keras.layers.BatchNormalization()(truncated)
#     out = tf.keras.layers.Dense(5, activation='linear')(out1)

#     model = tf.keras.Model(inputs=inputs, outputs=out)

#     #some optimizers
#     adam = tf.optimizers.Adam()
#     radam = tfa.optimizers.RectifiedAdam()
#     lookahead = tfa.optimizers.Lookahead(adam, sync_period=6)
#     ranger = tfa.optimizers.Lookahead(radam, sync_period=6)
    
#     model.compile(optimizer = adam, loss='mse')
    
#     return model

In [None]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence.

    NOTE(review): this redefinition replaces the ``gru_layer`` defined in an
    earlier cell and, unlike that one, does not pass
    ``kernel_initializer='orthogonal'`` — confirm which variant is intended.
    """
    recurrent = L.GRU(hidden_dim, dropout=dropout, return_sequences=True)
    return L.Bidirectional(recurrent)

def build_model(seq_len=107, pred_len=68, dropout=0.5, embed_dim=100, hidden_dim=128,
                gru=1, n_layers=2):
    """Build and compile the recurrent regression model.

    Args:
        seq_len: Number of positions in each input sequence.
        pred_len: Number of leading positions kept for the output.
        dropout: Dropout rate passed to every recurrent layer.
        embed_dim: Embedding size per token; the three input channels are
            embedded separately and flattened together per position.
        hidden_dim: Units of every recurrent layer.
        gru: Selects the recurrent stack. The later cells of this notebook
            pass it by keyword, which the previous signature rejected.
            1 = GRU layers only (default, the previous behaviour),
            0 = LSTM layers only,
            3 = GRU layers with a final LSTM layer,
            4 = LSTM layers with a final GRU layer.
        n_layers: Total number of stacked bidirectional layers (default 2,
            the previous depth).

    Returns:
        A ``tf.keras.Model`` compiled with Adam and MSE loss whose output has
        shape ``(batch, pred_len, 5)``.

    Raises:
        ValueError: If ``gru`` is not one of 0, 1, 3, 4 or ``n_layers < 1``.
    """
    # (layer used for all but the last position, layer used last)
    stacks = {
        1: (gru_layer, gru_layer),
        0: (lstm_layer, lstm_layer),
        3: (gru_layer, lstm_layer),
        4: (lstm_layer, gru_layer),
    }
    if gru not in stacks:
        raise ValueError(f"gru must be one of 0, 1, 3, 4; got {gru!r}")
    if n_layers < 1:
        raise ValueError(f"n_layers must be >= 1; got {n_layers!r}")
    body_layer, last_layer = stacks[gru]

    inputs = L.Input(shape=(seq_len, 3))

    embed = L.Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
    # Merge the per-channel embeddings: (batch, seq, 3, embed) -> (batch, seq, 3 * embed)
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3]))

    hidden = reshaped
    for _ in range(n_layers - 1):
        hidden = body_layer(hidden_dim, dropout)(hidden)
    hidden = last_layer(hidden_dim, dropout)(hidden)

    # Predictions are only made on the first pred_len positions of each
    # sequence, so the recurrent output is truncated.
    truncated = hidden[:, :pred_len]
    out1 = L.BatchNormalization()(truncated)
    out = L.Dense(5, activation='linear')(out1)

    model = tf.keras.Model(inputs=inputs, outputs=out)

    model.compile(tf.keras.optimizers.Adam(), loss='mse')

    return model
model = build_model()

In [None]:
import os
import json
import gc
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Masking
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tqdm import tqdm_notebook as tqdm
import fasttext

In [None]:
 
    # NOTE(review): this cell is an indented function body (it ends in `return model`)
    # without an enclosing `def` line, and `embedding` is not defined anywhere in the
    # visible notebook, so the cell cannot run as written. It looks unrelated to the
    # model above (presumably pasted from another notebook) — confirm whether it
    # should be removed or completed.

    # First input branch: embed -> spatial dropout -> BiLSTM(100) -> max over time.
    q_in = Input(shape=(None,))
    q = embedding(q_in)
    q = SpatialDropout1D(0.2)(q)
    q = Bidirectional(LSTM(100, return_sequences=True))(q)
    q = GlobalMaxPooling1D()(q)
    
    
    # Second input branch: same layout with a BiLSTM of 150 units.
    t_in = Input(shape=(None,))
    t = embedding(t_in)
    t = SpatialDropout1D(0.2)(t)
    t = Bidirectional(LSTM(150, return_sequences=True))(t)
    t = GlobalMaxPooling1D()(t)
    
    # Merge both branches and pass them through two Dense(300) + Dropout(0.5) stages.
    hidden = concatenate([q, t])
    hidden = Dense(300, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(300, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    
    # Single sigmoid output trained with binary cross-entropy.
    out1 = Dense(1, activation='sigmoid')(hidden)
    
    # Note the input order: t_in first, then q_in.
    model = Model(inputs=[t_in, q_in], outputs=out1)
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [None]:
# Hold out 10% of the samples for validation; random_state fixes the shuffle so the
# split is the same on every run.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(X, y,
                                                                     test_size=.1, random_state=34)

In [None]:
# list_physical_devices returns a (possibly empty) list, never None, so the previous
# `is not None` test was always true; rely on the list's truthiness instead.
if tf.config.list_physical_devices('GPU'):
    print('Training on GPU')

In [None]:
# Model instance trained in the cells below; gru=1 is passed to build_model by keyword.
gru = build_model(gru=1)

In [None]:
# Print the layer-by-layer summary of the model built above.
gru.summary()

In [None]:
# Learning-rate reduction callback created with Keras' default settings; the same
# instance is passed to every fit() call below.
lr_callback = tf.keras.callbacks.ReduceLROnPlateau()

In [None]:
# Checkpoint callback writing to model_gru.h5; that file is loaded again at
# prediction time.
# NOTE(review): created with default options — presumably the file is rewritten every
# epoch rather than kept for the best epoch only; confirm this is intended.
sv_gru = tf.keras.callbacks.ModelCheckpoint('model_gru.h5')

# Train for 100 epochs in batches of 64, validating on the held-out split.
history_gru = gru.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_gru],
    verbose = 2
)

# Report the lowest training and validation loss recorded over the epochs.
print(f"Min training loss={min(history_gru.history['loss'])}, min validation loss={min(history_gru.history['val_loss'])}")

In [None]:
# Second model: build_model called with gru=0, checkpointed to model_lstm.h5.
lstm = build_model(gru=0)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_lstm.h5')

# Same training setup as the first model: 100 epochs, batches of 64, same split.
history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

# Report the lowest training and validation loss recorded over the epochs.
print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

In [None]:
# Third model: build_model called with gru=3, checkpointed to model_hyb1.h5.
# NOTE: this rebinds lstm / sv_lstm / history_lstm, replacing the objects from the
# previous cell; only the checkpoint file name distinguishes this run afterwards.
lstm = build_model(gru=3)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_hyb1.h5')

# Same training setup: 100 epochs, batches of 64, same split.
history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

# Report the lowest training and validation loss recorded over the epochs.
print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

In [None]:
# Fourth model: build_model called with gru=4, checkpointed to model_hyb2.h5.
# NOTE: this rebinds lstm / sv_lstm / history_lstm once more, so after this cell
# history_lstm holds the history of this run.
lstm = build_model(gru=4)
sv_lstm = tf.keras.callbacks.ModelCheckpoint('model_hyb2.h5')

# Same training setup: 100 epochs, batches of 64, same split.
history_lstm = lstm.fit(
    train_inputs, train_labels, 
    validation_data=(val_inputs,val_labels),
    batch_size=64,
    epochs=100,
    callbacks=[lr_callback,sv_lstm],
    verbose = 2
)

# Report the lowest training and validation loss recorded over the epochs.
print(f"Min training loss={min(history_lstm.history['loss'])}, min validation loss={min(history_lstm.history['val_loss'])}")

In [None]:
# Separate the test rows by sequence length (107 vs 130); each group is encoded on
# its own and later fed to a model built for that length.
public_df = test.loc[test.seq_length == 107].copy()
private_df = test.loc[test.seq_length == 130].copy()

public_inputs, private_inputs = (
    preprocess_inputs(part) for part in (public_df, private_df)
)

In [None]:
import matplotlib.pyplot as plt
# One panel per training history. Previously all four curves shared one axis titled
# 'GRU' with only two legend entries, and the second axis stayed empty.
fig, ax = plt.subplots(1, 2, figsize = (20, 10))

panels = [
    (ax[0], history_gru, 'GRU'),
    # history_lstm is rebound by each later training cell, so it holds the most
    # recent of those fits.
    (ax[1], history_lstm, 'history_lstm (latest fit)'),
]

for axis, history, title in panels:
    axis.plot(history.history['loss'], label='train')
    axis.plot(history.history['val_loss'], label='validation')
    axis.set_title(title)
    axis.legend(loc = 'upper right')
    axis.set_ylabel('Loss')
    axis.set_xlabel('Epoch')

In [None]:
# For every trained variant, rebuild the network for both test sequence lengths
# (107 and 130, predicting on every position), load the weights saved during
# training, and predict on the matching inputs.

# GRU (gru=1)
gru_short = build_model(gru=1, seq_len=107, pred_len=107)
gru_long = build_model(gru=1, seq_len=130, pred_len=130)
gru_short.load_weights('model_gru.h5')
gru_long.load_weights('model_gru.h5')
gru_public_preds = gru_short.predict(public_inputs)
gru_private_preds = gru_long.predict(private_inputs)

# LSTM (gru=0)
lstm_short = build_model(gru=0, seq_len=107, pred_len=107)
lstm_long = build_model(gru=0, seq_len=130, pred_len=130)
lstm_short.load_weights('model_lstm.h5')
lstm_long.load_weights('model_lstm.h5')
lstm_public_preds = lstm_short.predict(public_inputs)
lstm_private_preds = lstm_long.predict(private_inputs)

# Hybrid 1 (gru=3)
hyb1_short = build_model(gru=3, seq_len=107, pred_len=107)
hyb1_long = build_model(gru=3, seq_len=130, pred_len=130)
hyb1_short.load_weights('model_hyb1.h5')
hyb1_long.load_weights('model_hyb1.h5')
hyb1_public_preds = hyb1_short.predict(public_inputs)
hyb1_private_preds = hyb1_long.predict(private_inputs)

# Hybrid 2 (gru=4)
hyb2_short = build_model(gru=4, seq_len=107, pred_len=107)
hyb2_long = build_model(gru=4, seq_len=130, pred_len=130)
hyb2_short.load_weights('model_hyb2.h5')
hyb2_long.load_weights('model_hyb2.h5')
hyb2_public_preds = hyb2_short.predict(public_inputs)
hyb2_private_preds = hyb2_long.predict(private_inputs)

In [None]:
# Turn the GRU predictions into one long frame: one row per sequence position,
# one column per target, keyed by "<sequence id>_<position>".
preds_gru = []

for frame, frame_preds in ((public_df, gru_public_preds), (private_df, gru_private_preds)):
    for row, uid in enumerate(frame.id):
        single_df = pd.DataFrame(frame_preds[row], columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(len(single_df))]
        preds_gru.append(single_df)

preds_gru_df = pd.concat(preds_gru)
preds_gru_df.head()

In [None]:
# Turn the LSTM predictions into one long frame: one row per sequence position,
# one column per target, keyed by "<sequence id>_<position>".
preds_lstm = []

for frame, frame_preds in ((public_df, lstm_public_preds), (private_df, lstm_private_preds)):
    for row, uid in enumerate(frame.id):
        single_df = pd.DataFrame(frame_preds[row], columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(len(single_df))]
        preds_lstm.append(single_df)

preds_lstm_df = pd.concat(preds_lstm)
preds_lstm_df.head()

In [None]:
# Turn the first hybrid model's predictions into one long frame: one row per
# sequence position, one column per target, keyed by "<sequence id>_<position>".
preds_hyb1 = []

for frame, frame_preds in ((public_df, hyb1_public_preds), (private_df, hyb1_private_preds)):
    for row, uid in enumerate(frame.id):
        single_df = pd.DataFrame(frame_preds[row], columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(len(single_df))]
        preds_hyb1.append(single_df)

preds_hyb1_df = pd.concat(preds_hyb1)
preds_hyb1_df.head()

In [None]:
# Turn the second hybrid model's predictions into one long frame: one row per
# sequence position, one column per target, keyed by "<sequence id>_<position>".
preds_hyb2 = []

for frame, frame_preds in ((public_df, hyb2_public_preds), (private_df, hyb2_private_preds)):
    for row, uid in enumerate(frame.id):
        single_df = pd.DataFrame(frame_preds[row], columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(len(single_df))]
        preds_hyb2.append(single_df)

preds_hyb2_df = pd.concat(preds_hyb2)
preds_hyb2_df.head()


In [None]:
# Equal-weight blend of the four models' predictions.
# Every target is averaged over the same column of each model frame. The previous
# hand-written version mixed preds_hyb2_df['deg_Mg_50C'] into the 'deg_50C' blend.
blend_members = [preds_gru_df, preds_lstm_df, preds_hyb1_df, preds_hyb2_df]
blend_weight = 1.0 / len(blend_members)  # 0.25 for four models

blend_preds_df = pd.DataFrame()
# The four frames are built by identical loops, so their rows line up one-to-one and
# the ids of the first frame apply to all of them.
blend_preds_df['id_seqpos'] = preds_gru_df['id_seqpos']
for target in ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']:
    blend_preds_df[target] = sum(blend_weight * member[target] for member in blend_members)

In [None]:
# Attach the blended predictions to the id_seqpos values of the sample submission.
# merge() is called without `how`, so pandas' default join type applies.
submission = sample_df[['id_seqpos']].merge(blend_preds_df, on=['id_seqpos'])

# Sanity check: preview the first rows.
submission.head()


In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
print('Submission saved')


In [None]:
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
    """Return an HTML link that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv()`` (index included), base64-encoded
    and embedded in the link as a ``data:`` URI.

    Args:
        df: DataFrame to offer for download.
        title: Text shown for the link.
        filename: File name suggested to the browser.

    Returns:
        An ``IPython.display.HTML`` object wrapping the anchor tag.
    """
    payload = base64.b64encode(df.to_csv().encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=payload,title=title,filename=filename))

# Demo input: a 50 x 4 frame of standard-normal random numbers with columns A-D.
# NOTE: no random seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD'))

# Render a download link for the demo frame (default title and file name).
create_download_link(df)