In [None]:
from tensorflow.keras.layers import (Dense,Input,Conv1D,Dropout,LSTM,Bidirectional,GRU,SpatialDropout1D,Embedding,Activation,concatenate,AveragePooling1D,
                                    MaxPooling1D,BatchNormalization,GlobalMaxPooling1D,GlobalAveragePooling1D,add,PReLU,Flatten,TimeDistributed,Reshape)
from tensorflow.keras import regularizers
from tensorflow import reshape
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt
import tensorflow as tf
import random
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
import gc
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.cluster import KMeans

import numpy as np
import pandas as pd 
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def seed_everything(seed = 34):
    """Seed every random number source this notebook uses.

    Sets the ``PYTHONHASHSEED`` environment variable and then seeds
    TensorFlow, NumPy and the standard-library ``random`` module with the
    same value.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seeding order as before: TensorFlow, then NumPy, then stdlib random.
    for apply_seed in (tf.random.set_seed, np.random.seed, random.seed):
        apply_seed(seed)

# Seed once, right after the definition, using the default seed.
seed_everything()

In [None]:
# Load the train and test sets (JSON-lines files, 'index' column dropped)
# and the sample submission CSV from the competition input folder.
train=pd.read_json('/kaggle/input/stanford-covid-vaccine/train.json',lines=True).drop('index',axis=1)
test=pd.read_json('/kaggle/input/stanford-covid-vaccine/test.json',lines=True).drop('index',axis=1)
sample_sub=pd.read_csv('/kaggle/input/stanford-covid-vaccine/sample_submission.csv')

In [None]:
# Names of the five target columns; the model emits one output per entry, in this order.
targets = ['reactivity','deg_Mg_pH10','deg_pH10','deg_Mg_50C', 'deg_50C']

In [None]:
# Display the training frame (last expression of the cell, so it is rendered).
train

In [None]:
def read_bpps_sum(df):
    """Return the row-wise sum of the saved bpps matrix of every id in ``df``.

    For each value of ``df.id`` the matching ``.npy`` file is loaded from the
    competition's ``bpps`` folder and reduced with ``sum(axis=1)``.

    Args:
        df: DataFrame with an ``id`` column naming the ``.npy`` files.

    Returns:
        list: One array per row of ``df``, in ``df.id`` order.
    """
    return [
        np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy").sum(axis=1)
        for mol_id in df.id.to_list()
    ]

def read_bpps_max(df):
    """Return the row-wise maximum of the saved bpps matrix of every id in ``df``.

    For each value of ``df.id`` the matching ``.npy`` file is loaded from the
    competition's ``bpps`` folder and reduced with ``max(axis=1)``.

    Args:
        df: DataFrame with an ``id`` column naming the ``.npy`` files.

    Returns:
        list: One array per row of ``df``, in ``df.id`` order.
    """
    return [
        np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy").max(axis=1)
        for mol_id in df.id.to_list()
    ]

def read_bpps_nb(df):
    """Return a standardised non-zero-fraction feature per id in ``df``.

    For each id the saved bpps matrix is loaded, the fraction of strictly
    positive entries in every column is computed (count along axis 0 divided
    by the number of rows), and the result is standardised with fixed
    constants.

    Args:
        df: DataFrame with an ``id`` column naming the ``.npy`` files.

    Returns:
        list: One standardised array per row of ``df``, in ``df.id`` order.
    """
    # mean and std from https://www.kaggle.com/symyksr/openvaccine-deepergcn
    nb_mean = 0.077522
    nb_std = 0.08914

    def _nb_feature(mol_id):
        matrix = np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy")
        nonzero_fraction = (matrix > 0).sum(axis=0) / matrix.shape[0]
        return (nonzero_fraction - nb_mean) / nb_std

    return [_nb_feature(mol_id) for mol_id in df.id.to_list()]

# Attach the three bpps-derived features as new columns on both train and test.
# Each cell of these columns holds one array, as returned by the read_bpps_* helpers.
train['bpps_sum'] = read_bpps_sum(train)
test['bpps_sum'] = read_bpps_sum(test)
train['bpps_max'] = read_bpps_max(train)
test['bpps_max'] = read_bpps_max(test)
train['bpps_nb'] = read_bpps_nb(train)
test['bpps_nb'] = read_bpps_nb(test)

# Sanity check: show the first rows, including the newly added columns.
train.head()

In [None]:
def one_hot_char(s,enum={c : i for i, c in enumerate('ACGUBEHIMSX.()')}):
    """One-hot encode every symbol of ``s``.

    Args:
        s: Indexable sequence of symbols (e.g. a string); each symbol must be
            a key of ``enum``.
        enum: Mapping from symbol to its position in the one-hot vector.

    Returns:
        list: One float array of length ``len(enum)`` per symbol, with a 1 at
        the symbol's index and 0 elsewhere.

    Raises:
        KeyError: If a symbol is not present in ``enum``.
    """
    def _vector_for(symbol):
        vec = np.zeros(len(enum))
        vec[enum[symbol]] = 1
        return vec

    return [_vector_for(s[pos]) for pos in range(len(s))]

# Example call; it is not the last expression of this cell, so its result is not displayed.
one_hot_char("GAAAGCUAGGACGUGG")

def decode_one_hot(arr, enum={c : i for i, c in enumerate('ACGUBEHIMSX.()')}):
    """Decode a sequence of one-hot vectors back into a string.

    This is the inverse of ``one_hot_char``: every row is mapped to the
    symbol whose index holds the row's largest value.

    Args:
        arr: Iterable of one-hot rows (arrays or lists), one per symbol.
        enum: Mapping from symbol to its position in the one-hot vector; it
            must be the same mapping that was used for encoding.

    Returns:
        str: The decoded symbols joined in order; ``""`` for empty input.

    Raises:
        KeyError: If a row's arg-max index has no symbol in ``enum``.
    """
    # dict.iteritems() no longer exists in Python 3; invert with items().
    inv_enum = {i: c for c, i in enum.items()}
    return ''.join(inv_enum[int(np.argmax(row))] for row in arr)
                        

In [None]:
# Integer id for every symbol that can appear in the encoded text columns;
# ids follow the position of the symbol in the string below.
textencoding={c : i for i, c in enumerate('ACGUBEHIMSX.()')}
textencoding

In [None]:
def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Turn the text columns of ``df`` into an integer token array.

    Every string in ``cols`` is mapped symbol by symbol through the
    module-level ``textencoding`` table. The encoded columns are stacked and
    the last two axes are swapped, so the result is indexed as
    ``[row, position, column]``.

    Args:
        df: DataFrame containing the columns listed in ``cols``.
        cols: Names of the string columns to encode.

    Returns:
        numpy.ndarray: Encoded tokens with axes (row, position, column).
        Assumes all strings share one length, so the nested lists form a
        regular array.
    """
    def _encode(seq):
        return [textencoding[x] for x in seq]

    token_rows = df[cols].applymap(_encode).values.tolist()
    stacked = np.array(token_rows)
    # (row, column, position) -> (row, position, column)
    return np.transpose(stacked, (0, 2, 1))

In [None]:
def denoise(df,tresh=0.25):
    """Keep only the rows whose signal-to-noise value exceeds ``tresh``.

    Args:
        df: DataFrame with a numeric ``signal_to_noise`` column.
        tresh: Exclusive lower bound on ``signal_to_noise``.

    Returns:
        pandas.DataFrame: The filtered rows; ``df`` itself is not modified.
    """
    # Compare the column to the threshold (not the column *name*), and return
    # the filtered frame so the caller actually receives the result.
    return df[df['signal_to_noise'] > tresh]

In [None]:
# Encode the whole training frame and stack the target columns.
# The label array is transposed so its last axis runs over the entries of `targets`.
train_inputs = preprocess_inputs(train)
train_labels = np.array(train[targets].values.tolist()).transpose((0, 2, 1))

In [None]:
# Inspect the shape of the encoded training inputs.
train_inputs.shape

In [None]:
from tensorflow import keras
import keras.backend as K

def rmse(y_actual, y_pred):
    """Root of the Keras mean squared error between two tensors.

    Args:
        y_actual: Ground-truth tensor.
        y_pred: Predicted tensor with the same shape.

    Returns:
        Tensor holding the square root of
        ``keras.losses.mean_squared_error(y_actual, y_pred)``.
    """
    return K.sqrt(keras.losses.mean_squared_error(y_actual, y_pred))

def mcrmse(y_actual, y_pred, num_scored=len(targets)):
    """Average of the per-column RMSE over the first ``num_scored`` columns.

    Column ``i`` is taken as ``[:, :, i]`` from both tensors, so both must be
    rank-3 with the scored columns on the last axis.

    Args:
        y_actual: Ground-truth tensor.
        y_pred: Predicted tensor with the same shape.
        num_scored: Number of leading columns to average over. Defaults to
            the length of the module-level ``targets`` list, evaluated when
            the function is defined.

    Returns:
        Sum over the scored columns of ``rmse(column) / num_scored``
        (``0`` when ``num_scored`` is 0).
    """
    return sum(
        rmse(y_actual[:, :, col], y_pred[:, :, col]) / num_scored
        for col in range(num_scored)
    )

In [None]:
def build_model(one_hot = False, conv_bias_reg = regularizers.l2(0.00001), conv_kern_reg = regularizers.l2(0.00001),
                embed = 120, lstm=100, dropout=0.3, opt='adam', input_length=68, seq_len=107, pred_len=68):
    """Build and compile the Conv1D + stacked bidirectional-LSTM regressor.

    Pipeline: token embedding -> flatten the per-position embeddings ->
    spatial dropout -> Conv1D/BatchNorm/ReLU -> three bidirectional LSTMs ->
    keep the first ``pred_len`` positions -> per-position dense head with 5
    linear outputs. None of the weight shapes depend on ``seq_len`` or
    ``pred_len``, so weights can be shared between models built for
    different lengths.

    Args:
        one_hot: Unused; kept for backward compatibility.
        conv_bias_reg: Bias regularizer of the convolution.
        conv_kern_reg: Kernel regularizer of the convolution.
        embed: Embedding width per input token.
        lstm: Number of units of each LSTM direction.
        dropout: Input dropout rate of the LSTM layers.
        opt: Optimizer passed to ``model.compile``.
        input_length: Unused; kept for backward compatibility. It used to be
            forwarded to ``Embedding`` even though the real input length is
            ``seq_len``.
        seq_len: Number of positions in each input sequence.
        pred_len: Number of leading positions for which outputs are produced.

    Returns:
        A compiled Keras ``Model`` mapping ``(batch, seq_len, 3)`` integer
        tokens to ``(batch, pred_len, 5)`` predictions, trained with the
        ``mcrmse`` loss.
    """
    # Three integer tokens per position (see the columns used by preprocess_inputs).
    inputs = Input((seq_len, 3))

    # `input_length` is deliberately not forwarded: its default (68) does not
    # match the actual sequence length and the argument is deprecated in Keras.
    emb = Embedding(len(textencoding), embed, trainable=True)(inputs)

    # Embedding output is (batch, seq_len, 3, embed); merge the last two axes
    # so each position is described by one feature vector.
    reshaped = reshape(emb, shape=(-1, emb.shape[1], emb.shape[2] * emb.shape[3]))

    dropout_layer = SpatialDropout1D(0.2)(reshaped)

    conv_1 = Conv1D(512, 3, padding='same', kernel_initializer='he_uniform',
                    kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(dropout_layer)
    batch_1 = BatchNormalization()(conv_1)
    act_1 = Activation('relu')(batch_1)
    # The former MaxPooling1D layer was created here but never connected to
    # the rest of the graph, so it has been removed; the LSTMs read `act_1`.

    lstm_1 = Bidirectional(LSTM(lstm, return_sequences=True, dropout=dropout, kernel_initializer='orthogonal'))(act_1)
    lstm_2 = Bidirectional(LSTM(lstm, return_sequences=True, dropout=dropout, kernel_initializer='orthogonal'))(lstm_1)
    lstm_3 = Bidirectional(LSTM(lstm, return_sequences=True, dropout=dropout, kernel_initializer='orthogonal'))(lstm_2)

    # Only the first `pred_len` positions receive predictions.
    truncated = lstm_3[:, :pred_len]

    x = TimeDistributed(Dense(1024, activation='relu'))(truncated)
    x = Dropout(0.3)(x)

    x = TimeDistributed(Dense(512, activation='relu'))(x)
    x = Dropout(0.3)(x)

    # One linear output per target column.
    x = TimeDistributed(Dense(5, activation='linear'))(x)

    model = Model(inputs=inputs, outputs=x)

    # Accuracy is not meaningful for a regression output, so only the loss is tracked.
    model.compile(optimizer=opt, loss=mcrmse)
    return model

In [None]:
# Build one model with the default settings and keep it in the notebook namespace.
model=build_model()

In [None]:
# Render a diagram of the model's layer graph.
plot_model(model)

In [None]:
def train_and_infer( STRATIFY=True, FOLDS=4, EPOCHS=50, BATCH_SIZE=64,
                    REPEATS=3, SEED=34, VERBOSE=2):
    """Run repeated K-fold training and average test predictions over all runs.

    For every repeat and fold a fresh model is trained, the checkpoint with
    the lowest validation loss is reloaded, and predictions are produced for
    the fold's hold-out rows and for both test subsets.

    Side effects: adds a ``cluster_id`` column to the global ``train`` frame
    and writes ``model-{fold}.h5`` checkpoint files (overwritten on every
    repeat).

    Args:
        STRATIFY: If True, split with ``GroupKFold`` using KMeans cluster ids
            as groups; otherwise use a shuffled ``KFold``.
        FOLDS: Number of folds.
        EPOCHS: Training epochs per fold.
        BATCH_SIZE: Training batch size.
        REPEATS: Number of times the whole K-fold procedure is repeated.
        SEED: Random state for KMeans and the shuffled ``KFold``.
        VERBOSE: Verbosity forwarded to Keras.

    Returns:
        tuple: ``(holdouts, holdout_preds, public_df, public_preds,
        private_df, private_preds, histories)``. ``holdouts``,
        ``holdout_preds`` and ``histories`` have one entry per trained fold;
        the test predictions are averaged over ``FOLDS * REPEATS`` models.
    """
    # Split the test set by sequence length; each length gets its own model
    # instance further down.
    public_df = test.query("seq_length == 107").copy()
    private_df = test.query("seq_length == 130").copy()
    private_preds = np.zeros((private_df.shape[0], 130, 5))
    public_preds = np.zeros((public_df.shape[0], 107, 5))
    public_inputs = preprocess_inputs(public_df)
    private_inputs = preprocess_inputs(private_df)

    # to evaluate TTA effects/post processing
    holdouts = []
    holdout_preds = []

    # to view learning curves
    histories = []

    # put similar RNA in the same fold
    gkf = GroupKFold(n_splits=FOLDS)
    # random_state is only valid together with shuffle=True; without it
    # current scikit-learn raises ValueError as soon as KFold is constructed.
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    kmeans_model = KMeans(n_clusters=200, random_state=SEED).fit(preprocess_inputs(train)[:,:,0])
    train['cluster_id'] = kmeans_model.labels_

    for _ in range(REPEATS):

        for f, (train_index, val_index) in enumerate((gkf if STRATIFY else kf).split(train,
                train['reactivity'], train['cluster_id'] if STRATIFY else None)):

            # define training callbacks
            lr_callback = tf.keras.callbacks.ReduceLROnPlateau(patience=8,
                                                               factor=.1,
                                                               #min_lr=1e-5,
                                                               verbose=VERBOSE)
            # Keep only the epoch with the best validation loss, so that the
            # reload below really restores the best weights (previously the
            # file held whatever the last epoch produced).
            save = tf.keras.callbacks.ModelCheckpoint(f'model-{f}.h5', save_best_only=True)

            # define sample weight function: samples with a higher
            # signal-to-noise value receive a larger weight
            epsilon = .1
            sample_weighting = np.log1p(train.iloc[train_index]['signal_to_noise'] + epsilon) / 2

            # get train data
            trn = train.iloc[train_index]
            trn_ = preprocess_inputs(trn)
            trn_labs = np.array(trn[targets].values.tolist()).transpose((0, 2, 1))

            # get validation data; the loss is validated only on rows that
            # pass SN_filter, while hold-out predictions cover the whole fold
            val = train.iloc[val_index]
            val_all = preprocess_inputs(val)
            val = val[val.SN_filter == 1]
            val_ = preprocess_inputs(val)
            val_labs = np.array(val[targets].values.tolist()).transpose((0, 2, 1))

            # pre-build models for different sequence lengths
            model = build_model()
            model_short = build_model(seq_len=107, pred_len=107)
            model_long = build_model(seq_len=130, pred_len=130)

            # train model
            history = model.fit(
                trn_, trn_labs,
                validation_data = (val_, val_labs),
                batch_size=BATCH_SIZE,
                epochs=EPOCHS,
                sample_weight=sample_weighting,
                callbacks=[save, lr_callback],
                verbose=VERBOSE
            )

            histories.append(history)

            # load best models: the same checkpoint is loaded into all three
            # models, which differ only in input/output length
            model.load_weights(f'model-{f}.h5')
            model_short.load_weights(f'model-{f}.h5')
            model_long.load_weights(f'model-{f}.h5')

            holdouts.append(train.iloc[val_index])
            holdout_preds.append(model.predict(val_all))

            # running average over every fold of every repeat
            public_preds += model_short.predict(public_inputs) / (FOLDS * REPEATS)
            private_preds += model_long.predict(private_inputs) / (FOLDS * REPEATS)

        del model, model_short, model_long

    return holdouts, holdout_preds, public_df, public_preds, private_df, private_preds, histories


In [None]:
# Run the full training / inference loop with default settings and unpack its seven results.
lstm_holdouts, lstm_holdout_preds, public_df, lstm_public_preds, private_df, lstm_private_preds, lstm_histories = train_and_infer()

In [None]:
def get_error(preds):
    """Print per-column RMSE/MSE of ``preds`` against the training labels.

    The training file is re-read from disk, its first 68 positions per
    molecule are expanded into one row per ``id_seqpos``, and the result is
    inner-joined with ``preds``. For ``reactivity``, ``deg_Mg_pH10`` and
    ``deg_Mg_50C`` the RMSE and MSE are printed, followed by their means
    over the three columns and an empty line.

    Args:
        preds: DataFrame with an ``id_seqpos`` column (``"<id>_<position>"``)
            and one prediction column per evaluated target.

    Returns:
        None. The scores are only printed.
    """
    val = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)

    # One pass over the rows instead of a boolean-mask lookup per id.
    # drop_duplicates keeps the first row of every id, in order of first
    # appearance, which is what looking up each unique id used to select.
    first_rows = val.drop_duplicates(subset='id')

    val_data = []
    for mol_id, reactivity, deg_mg_ph10, deg_mg_50c in zip(
            first_rows['id'], first_rows['reactivity'],
            first_rows['deg_Mg_pH10'], first_rows['deg_Mg_50C']):
        # Only the first 68 positions of every molecule are used.
        for i in range(68):
            val_data.append({
                'id_seqpos' : mol_id + '_' + str(i),
                'reactivity_gt' : reactivity[i],
                'deg_Mg_pH10_gt' : deg_mg_ph10[i],
                'deg_Mg_50C_gt' : deg_mg_50c[i],
            })

    val_data = pd.DataFrame(val_data)
    val_data = val_data.merge(preds, on='id_seqpos')

    rmses = []
    mses = []

    for col in ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']:
        col_mse = ((val_data[col] - val_data[col+'_gt']) ** 2).mean()
        col_rmse = col_mse ** .5
        rmses.append(col_rmse)
        mses.append(col_mse)
        print(col, col_rmse, col_mse)
    print(np.mean(rmses), np.mean(mses))
    print('')

In [None]:
# Load the checkpoint file written for fold 0 into the notebook-level `model`.
model.load_weights(f'model-0.h5')

In [None]:
def format_predictions(test_df, test_preds, val=False):
    """Flatten per-molecule prediction arrays into one long DataFrame.

    Args:
        test_df: Iterable of DataFrames, each with an ``id`` column (and an
            ``SN_filter`` column when ``val`` is True).
        test_preds: Iterable of prediction arrays aligned with ``test_df``;
            entry ``k`` of an array belongs to row ``k`` of its frame and
            has one column per entry of the module-level ``targets`` list.
        val: If True, copy each molecule's ``SN_filter`` value onto all of
            its rows.

    Returns:
        pandas.DataFrame: Concatenation of one frame per molecule, with the
        target columns plus ``id_seqpos`` (``"<id>_<position>"``) and,
        optionally, ``SN_filter``.
    """
    frames = []

    for frame, frame_preds in zip(test_df, test_preds):
        for row_idx, uid in enumerate(frame['id']):
            mol_frame = pd.DataFrame(frame_preds[row_idx], columns=targets)
            n_positions = mol_frame.shape[0]
            mol_frame['id_seqpos'] = [f'{uid}_{x}' for x in range(n_positions)]
            if val:
                mol_frame['SN_filter'] = frame[frame['id'] == uid].SN_filter.values[0]
            frames.append(mol_frame)

    return pd.concat(frames)

In [None]:
# Flatten the hold-out predictions (keeping SN_filter) and print their error against the training labels.
lstm_val_preds = format_predictions(lstm_holdouts, lstm_holdout_preds, val=True)

get_error(lstm_val_preds)

In [None]:
# Pair each test subset with its averaged predictions and flatten both into one long frame.
# Note: `lstm_preds` is first a list of arrays and is then rebound to the formatted DataFrame.
test_df = [public_df, private_df]
lstm_preds = [lstm_public_preds,lstm_private_preds]
lstm_preds = format_predictions(test_df, lstm_preds)


In [None]:
# Join the predictions onto the sample submission's id_seqpos column (default inner merge), then preview.
submission = sample_sub[['id_seqpos']].merge(lstm_preds, on=['id_seqpos'])
submission.head()

In [None]:
# Write the submission file without the DataFrame index.
submission.to_csv(f'submission_new.csv', index=False)
print('Submission saved')