In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm

In [None]:
from datetime import datetime

def log_now():
    """Print the current local date and time as a coarse progress timestamp."""
    timestamp = datetime.now()
    print(timestamp)

In [None]:
# Smoke-test switch. When true, a later cell truncates the training frame to
# 32*40 rows and BiGRUModel trains for 5 epochs (instead of 100) with verbose output.
TEST = False
# NOTE(review): the override below is active as committed, so the notebook runs
# in test mode. Comment it out for a full training run.
TEST = True # enable to check that the whole notebook runs end-to-end before commit

# MODEL

In [None]:
import tensorflow as tf

import keras.backend as K

from keras.layers import Input, Dense, Bidirectional, Conv1D, SpatialDropout1D, Embedding, Concatenate, GRU, Cropping1D, LSTM, AveragePooling1D, ZeroPadding1D
from tensorflow.keras.activations import swish
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# https://www.kaggle.com/c/stanford-covid-vaccine/discussion/183211
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error, one value per sample.

    The squared error is averaged over axis 1, square-rooted, and the
    resulting per-column RMSE values are averaged over the remaining axis.
    Assumes inputs are laid out as (batch, position, target) — the targets
    built in this notebook have shape (n, 68, 5).
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
    return tf.reduce_mean(per_column_rmse, axis=1)

class BiGRUModel():
    """Sequence regressor: embeddings + Conv1D + stacked bidirectional GRU/LSTM.

    The network takes four inputs per sample (encoded sequence, loop type and
    structure, each of length 107, plus a 107x107 ``bpp`` matrix) and emits
    5 regression targets for each of 68 output positions.

    Typical use: ``create_and_compile()`` -> ``fit(...)`` -> ``load_weights()``
    -> ``predict(...)``.
    """

    def __init__(self, testmode):
        """Set hyper-parameters and training callbacks.

        Args:
            testmode: when truthy, train for 5 epochs with verbose output;
                otherwise train for up to 100 epochs silently.
        """
        self.type = 'BiGru'
        self.n_features = 3

        self.lr = 0.0015
        self.epochs = 5 if testmode else 100
        self.batch_size = 32

        self.train_verbose = 1 if testmode else 0

        # Saves the best-so-far weights (by val_loss) after every epoch. The
        # deprecated `period=1` argument is dropped: saving once per epoch is
        # already the default.
        self.checkpoint = ModelCheckpoint(self.type + ".hdf5",
                                          monitor='val_loss',
                                          verbose=self.train_verbose,
                                          save_best_only=True,
                                          mode='auto')
        self.es = EarlyStopping(monitor='val_loss', patience = 10, mode = 'min', restore_best_weights=True)
        self.reduce_lr = ReduceLROnPlateau(patience=5)

    def create_model(self):
        """Build the Keras graph and store it in ``self.model``.

        Based on https://www.kaggle.com/tuckerarrants/openvaccine-gru-lstm
        """
        seq_len = 107
        pred_len = 68
        # The three categorical inputs share one 14-symbol label encoding.
        seq_dim = 14
        ltype_dim = 14
        structure_dim = 14
        embed_dim = 200
        dropout = .2
        sp_dropout = dropout
        conv_dim = 512
        conv_ksize = 3
        hidden_dim = 256
        pool_size = 2
        out_dim = 5

        # `shape` must be a tuple; `(seq_len)` without the comma is just an int.
        iseq = Input(shape=(seq_len,))
        iltype = Input(shape=(seq_len,))
        istructure = Input(shape=(seq_len,))
        ibpp = Input(shape=(seq_len, seq_len))

        eseq = Embedding(input_dim=seq_dim, output_dim=embed_dim)(iseq)
        eltype = Embedding(input_dim=ltype_dim, output_dim=embed_dim)(iltype)
        estructure = Embedding(input_dim=structure_dim, output_dim=embed_dim)(istructure)

        # Per position: 3 embeddings plus that position's row of the bpp matrix.
        x = Concatenate(axis=2)([eseq, eltype, estructure, ibpp])
        # Pad the end of the position axis so that average pooling below yields
        # exactly `pred_len` steps: 107 + 29 = 136 -> 136 / 2 = 68.
        x = ZeroPadding1D(padding=(0, pool_size * pred_len - seq_len))(x)

        x = SpatialDropout1D(sp_dropout)(x)
        x = Conv1D(conv_dim, conv_ksize, padding='same', activation=swish)(x)

        x = Bidirectional(GRU(hidden_dim, dropout=dropout, return_sequences=True))(x)
        x = Bidirectional(GRU(hidden_dim, dropout=dropout, return_sequences=True))(x)
        x = Bidirectional(LSTM(hidden_dim, dropout=dropout, return_sequences=True))(x)

        x = AveragePooling1D(pool_size=pool_size)(x)

        out = Dense(out_dim, activation='linear')(x)

        self.model = Model(inputs = [iseq, iltype, istructure, ibpp], outputs = out)

    def compile_model(self):
        """Compile ``self.model`` with Adam and the MCRMSE loss."""
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        opt = Adam(learning_rate = self.lr)
        self.model.compile(loss = MCRMSE, optimizer = opt)

    def create_and_compile(self):
        """Create and compile the model, reporting progress when verbose."""
        if self.train_verbose == 1:
            print('Create Model...')
        self.create_model()

        if self.train_verbose == 1:
            print('Compile Model...')
        self.compile_model()

        if self.train_verbose == 1:
            self.print()


    def print(self):
        """Print the model summary."""
        # summary() prints by itself and returns None; wrapping it in print()
        # would emit a stray "None" line.
        self.model.summary()

    def fit(self, X_seq, X_ltype, X_structure, X_bpp, Y, random_state=None):
        """Train on a random train/validation split of the given arrays.

        Args:
            X_seq, X_ltype, X_structure, X_bpp: model inputs, aligned on axis 0.
            Y: regression targets, aligned with the inputs on axis 0.
            random_state: optional seed forwarded to ``train_test_split`` so
                the split can be reproduced. ``None`` (default) keeps the
                split unseeded.

        The Keras history object is stored in ``self.history``.
        """
        (X_seq_train, X_seq_valid,
         X_ltype_train, X_ltype_valid,
         X_structure_train, X_structure_valid,
         X_bpp_train, X_bpp_valid,
         Y_train, Y_valid) = train_test_split(X_seq, X_ltype, X_structure, X_bpp, Y,
                                              random_state=random_state)

        self.history = self.model.fit([X_seq_train, X_ltype_train, X_structure_train, X_bpp_train],
                                      Y_train,
                                      validation_data = ([X_seq_valid, X_ltype_valid, X_structure_valid, X_bpp_valid], Y_valid),
                                      epochs = self.epochs,
                                      batch_size = self.batch_size,
                                      callbacks = [self.checkpoint, self.es, self.reduce_lr],
                                      verbose = self.train_verbose)

    def predict(self, X_seq, X_ltype, X_structure, X_bpp):
        """Return the model's predictions for the given input arrays."""
        return self.model.predict([X_seq, X_ltype, X_structure, X_bpp])

    def load_weights(self):
        """Load the weights written by the checkpoint callback."""
        self.model.load_weights(self.type + ".hdf5")

    def plot(self):
        """Plot training and validation loss per epoch from ``self.history``."""
        plt.figure(figsize=(20,5))

        # summarize history for loss (the second curve is the validation loss)
        plt.plot(self.history.history['loss'])
        plt.plot(self.history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')

        plt.show()

# DATA

In [None]:
# Both competition files are JSON-lines: one record per line.
train, test = (
    pd.read_json(json_path, lines=True)
    for json_path in ('/kaggle/input/stanford-covid-vaccine/train.json',
                      '/kaggle/input/stanford-covid-vaccine/test.json')
)

In [None]:
# In test mode keep only the first 32*40 training rows so the run is fast.
train = train.head(32*40) if TEST else train

# Number of training rows; the shape assertions further down rely on it.
tlen = len(train)

# FEATURES

In [None]:
def get_bpp(id):
    """Load the saved ``.npy`` array for the sample with identifier ``id``.

    The file is read from the competition's ``bpps`` input folder; ``id``
    must be a string because it is concatenated into the file path.
    """
    bpp_path = '../input/stanford-covid-vaccine/bpps/' + id + '.npy'
    return np.load(bpp_path)

In [None]:
from sklearn import preprocessing

# One label encoder shared by all three categorical inputs. It is fitted on
# single-byte symbols because encode() splits strings into single bytes.
# NOTE(review): the groups below presumably are loop types, dot-bracket
# structure characters and nucleotide bases — confirm.
E = preprocessing.LabelEncoder().fit(
    [b'S', b'M', b'I', b'B', b'H', b'E', b'X']
    + [b'.', b'(', b')']
    + [b'A', b'C', b'G', b'U']
)

def encode(code):
    """Map each character of ``code`` to its integer label from encoder ``E``."""
    # dtype 'c' splits the string into an array of single bytes.
    symbols = np.array(code, 'c')
    return E.transform(symbols)

def generate_X_data(train, seq_len=107):
    """Build the four model input arrays from a frame of samples.

    Args:
        train: DataFrame with columns ``sequence``, ``predicted_loop_type``,
            ``structure`` and ``id``; each string column must encode to
            exactly ``seq_len`` symbols.
        seq_len: number of positions per sample (default 107).

    Returns:
        Tuple ``(x_seq, x_ltype, x_structure, x_bpp)`` of float arrays with
        shapes ``(n, seq_len)`` for the first three and
        ``(n, seq_len, seq_len)`` for the last, where ``n`` is the row count.
    """
    n_rows = train.shape[0]

    # Preallocate once and fill row by row: growing the arrays with np.append
    # copies everything on each iteration, which is quadratic in the row count.
    x_seq = np.empty((n_rows, seq_len))
    x_ltype = np.empty((n_rows, seq_len))
    x_structure = np.empty((n_rows, seq_len))
    x_bpp = np.empty((n_rows, seq_len, seq_len))

    # iterrows() yields index labels, so track the positional row separately.
    for pos, (index, row) in enumerate(tqdm(train.iterrows(), total=n_rows)):
        x_seq[pos] = encode(row.sequence)
        x_ltype[pos] = encode(row.predicted_loop_type)
        x_structure[pos] = encode(row.structure)
        x_bpp[pos] = get_bpp(row.id)

    return x_seq, x_ltype, x_structure, x_bpp

In [None]:
def generate_Y_data(train):
    """Stack the five per-position target columns into one array.

    Args:
        train: DataFrame whose ``reactivity``, ``deg_Mg_pH10``, ``deg_pH10``,
            ``deg_Mg_50C`` and ``deg_50C`` cells each hold 68 values.

    Returns:
        Float array of shape ``(n, 68, 5)``; the last axis follows the column
        order listed above.
    """
    n_rows = train.shape[0]

    # Preallocate once: growing the array with np.append copies it on each
    # iteration, which is quadratic in the row count.
    Y = np.empty((n_rows, 68, 5))

    # iterrows() yields index labels, so track the positional row separately.
    for pos, (index, row) in enumerate(tqdm(train.iterrows(), total=n_rows)):
        # (5, 68) -> (68, 5): one row per position, one column per target.
        Y[pos] = np.array([row.reactivity,row.deg_Mg_pH10,row.deg_pH10,row.deg_Mg_50C,row.deg_50C]).T
    return Y

In [None]:
# Timestamp before the feature arrays are generated.
log_now()

In [None]:
X_seq, X_ltype, X_structure, X_bpp = generate_X_data(train)

# One row per training sample: 107 encoded positions for each categorical
# input and a 107x107 matrix for the bpp input.
assert all(arr.shape == (tlen, 107) for arr in (X_seq, X_ltype, X_structure))
assert X_bpp.shape == (tlen, 107, 107)

In [None]:
# Targets: 5 values for each of 68 positions per training sample.
Y = generate_Y_data(train)
assert Y.shape == (tlen,68,5)


In [None]:
# Every input array and the target array must hold the same number of samples.
assert len({arr.shape[0] for arr in (X_seq, X_ltype, X_structure, X_bpp, Y)}) == 1

# FIT

In [None]:
# Timestamp before the model is built and trained.
log_now()

In [None]:
# Build and compile the network; TEST selects the short, verbose training setup.
m = BiGRUModel(TEST)
m.create_and_compile()

In [None]:
# Train (fit() holds out its own validation split), then plot the loss curves.
m.fit(X_seq, X_ltype, X_structure, X_bpp, Y)
m.plot()

# EVALUATE

In [None]:
# Timestamp after training, before evaluation.
log_now()

In [None]:
# Reload the weights file written by the checkpoint callback during training.
m.load_weights()

In [None]:
# Score the reloaded weights on the full training arrays. fit() drew its
# validation split from these same arrays, so this score includes rows the
# model was trained on and is not a held-out estimate.
Y_pred = m.predict(X_seq, X_ltype, X_structure, X_bpp)
print("MCRMSE  = ", np.mean(MCRMSE(Y,Y_pred)))

# PREDICT

In [None]:
# Timestamp before predicting on the test set.
log_now()

In [None]:
# Empty submission frame; it fixes the column order of the output file.
submission = pd.DataFrame({
    column: []
    for column in ("id_seqpos", "reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C")
})

In [None]:
# Predict every test row and collect one frame per row. The frames are
# concatenated once after the loop: DataFrame.append was removed in pandas 2.0,
# and appending inside the loop re-copies the growing frame on every iteration.
target_cols = ["reactivity", "deg_Mg_pH10", "deg_pH10", "deg_Mg_50C", "deg_50C"]
row_frames = []

for index, row in tqdm(test.iterrows(), total=test.shape[0]):
    # The model accepts exactly 107 positions, so longer inputs are cropped.
    # Loop-local names keep the training arrays (X_seq, X_ltype, ...) intact,
    # so the evaluation cell can still be re-run afterwards.
    seq_in = np.array([encode(row.sequence)])[:,:107]
    ltype_in = np.array([encode(row.predicted_loop_type)])[:,:107]
    structure_in = np.array([encode(row.structure)])[:,:107]
    bpp_in = np.array([get_bpp(row.id)])[:,:107,:107]

    predicted = m.predict(seq_in, ltype_in, structure_in, bpp_in)

    # The submission needs one line per position of the full sequence;
    # positions beyond the predicted window are filled with zeros.
    padded = np.zeros((row.seq_length, len(target_cols)))
    padded[:predicted.shape[1]] = predicted[0]

    df = pd.DataFrame(data=padded, columns=target_cols)
    df["id_seqpos"] = [f"{row.id}_{pos}" for pos in range(row.seq_length)]
    row_frames.append(df)

# Rebuild `submission` from scratch (so re-running this cell cannot duplicate
# rows) while keeping the column order of the existing frame.
if row_frames:
    submission = pd.concat(row_frames, ignore_index=True)[list(submission.columns)]


### Checking file before submission

In [None]:
# Sanity check on the number of submission rows before writing the file.
# NOTE(review): 457953 is presumably the sum of seq_length over the full test set — confirm.
assert submission.shape[0] == 457953

In [None]:
# Write the submission file without the frame's index column.
submission.to_csv("submission.csv", index = False)

In [None]:
# Timestamp after the submission file has been written.
log_now()