## User Preset

In [None]:
# Number of folds: one `model_{fold}.h5` weight file per fold is loaded at inference time.
NFOLDS = 5
# Folder holding the new-sequence CSV, the submission template and the per-sequence bpps `.npy` files.
new_seq_path = r"/kaggle/input/stanford-covid-vaccine/post_deadline_files/"

In [None]:
import numpy as np 
import pandas as pd
import gc, os
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras import layers as L
import tensorflow_addons as tfa
from tensorflow.keras import backend as K

## Helper Functions

In [None]:
def pandas_list_to_array(df):
    """
    Stack a dataframe whose cells are equal-length lists into one dense array.

    Input: dataframe of shape (x, y), every cell holding a list of length l
    Return: np.array of shape (x, l, y)
    """
    nested = df.values.tolist()      # x rows, each holding y lists of length l
    stacked = np.array(nested)       # shape (x, y, l)
    # Move the list axis in front of the column axis.
    return stacked.transpose(0, 2, 1)
def np_onehot(x, num_class):
    """
    One-hot encode integer class labels.

    Input: array `x` of any shape (flattened and cast to int32), number of classes
    Return: float np.array of shape (x.size, num_class) with a single 1 per row
    """
    labels = x.reshape(-1).astype(np.int32)
    # Row k of the identity matrix is exactly the one-hot vector for class k.
    return np.eye(num_class)[labels]

def mk_pair_map(structure, type='pm'):
    """
    Parse a dot-bracket structure into per-position pairing information.

    Input: `structure` string of "(", ")" and other symbols; `type` selects the output
    Return: int np.array of len(structure), -1 at unpaired positions:
            type == 'pm' -> index of the paired partner
            type == 'pd' -> distance (closing index - opening index) of the pair
            any other `type` -> None
    """
    length = len(structure)
    partner = np.full(length, -1, dtype=int)
    span = np.full(length, -1, dtype=int)
    open_stack = []  # positions of "(" that are still waiting for their ")"
    for pos, symbol in enumerate(structure):
        if symbol == "(":
            open_stack.append(pos)
            continue
        if symbol != ")":
            continue
        opener = open_stack.pop()
        partner[pos], partner[opener] = opener, pos
        span[pos] = span[opener] = pos - opener

    if type == 'pm':
        return partner
    if type == 'pd':
        return span
    return None
    
def pair_type(seq, pair_map):
    """
    Label every position with the (order-independent) kind of base pair it forms.

    Input: `seq` nucleotide string; `pair_map` partner index per position (-1 = unpaired)
    Return: list of "0" (unpaired), "GC", "AU" or "GU", one entry per position
    Raises: KeyError for a pair that is not one of GC/CG/AU/UA/GU/UG
    """
    # Both orientations of a pair collapse onto one canonical label.
    canonical = {"0": "0"}
    for label in ("GC", "AU", "GU"):
        canonical[label] = label
        canonical[label[::-1]] = label

    labels = []
    for pos in range(len(pair_map)):
        mate = pair_map[pos]
        raw = "0" if mate == -1 else seq[pos] + seq[mate]
        labels.append(canonical[raw])
    return labels

def col_mcrmse_loss(y_true, y_pred):
    """
    Weighted column-wise RMSE loss that ignores NaN targets.

    NaN entries of `y_true` are replaced by the prediction, so they contribute
    zero error. The squared error is averaged over axis 1, a small constant is
    added before the square root, and the five per-column RMSEs (last axis,
    indices 0..4) are combined with weights 0.2 / 0.3 / 0.3 / 0.1 / 0.1 and
    averaged over the batch.

    Input: y_true, y_pred tensors whose last axis has at least 5 columns
           (presumably (batch, positions, 5) - TODO confirm against training code)
    Return: scalar tensor
    """
    # `EPSILON` is not assigned in any visible cell; without this fallback the
    # first call raises NameError. A user-defined EPSILON still takes precedence.
    eps = globals().get("EPSILON", K.epsilon())
    y_true_nan = tf.where(tf.math.is_nan(y_true), y_pred, y_true)
    rmse = tf.sqrt(tf.reduce_mean(tf.square(y_true_nan - y_pred), axis=1) + eps)
    return tf.reduce_mean(rmse[:, 0]*0.2 + rmse[:, 1]*0.3 + rmse[:, 2]*0.3 + rmse[:, 3]*0.1 + rmse[:, 4]*0.1)
    
def reverse_2D(_input):
    """Flip a rank-3 batch (batch, position, feature) along the position axis (axis 1)."""
    backwards = slice(None, None, -1)
    return _input[:, backwards, :]

def reverse_3D(_input):
    """Flip a rank-4 batch (batch, row, column, channel) along both axis 1 and axis 2."""
    backwards = slice(None, None, -1)
    return _input[:, backwards, backwards, :]

## Load Data

In [None]:
# Target columns, in the order used as column names for each prediction DataFrame.
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']
# Matching measurement-error column names (not referenced by the visible inference cells).
error_cols = ['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_Mg_50C', 'deg_error_pH10', 'deg_error_50C']

In [None]:
import json
import glob
from tqdm.notebook import tqdm

# Load the sequences to predict and the submission template from the preset folder.
test  = pd.read_csv(os.path.join(new_seq_path, "new_sequences.csv"))
sub = pd.read_csv(os.path.join(new_seq_path, "new_sequences_submission.csv"))

# Per-row sequence length; get_structure_adj uses it to size its adjacency matrices.
test['seq_length'] = test['sequence'].apply(len)

In [None]:
### 1. As: base-pair probability (bpp) matrices -- loaded per sequence inside get_features below

In [None]:
### 2. Structure Adjacency
def get_structure_adj(train):
    """
    Build three binary adjacency channels from dot-bracket structures.

    For every bracket pair (start, end) found in a row's ``structure``:
      * channel 0 marks the pair itself, symmetrically: [start, end] and [end, start];
      * channel 1 marks [start, end - 1] and [end, start + 1];
      * channel 2 marks [end, start - 1] and [start, end + 1];
    neighbours that fall outside the matrix are skipped.

    The nucleotide letters are irrelevant to the result, so a pair is marked
    whatever bases form it (no KeyError on non-canonical pairs such as A-A).

    Input: dataframe with columns ``seq_length`` and ``structure``
    Return: float64 np.array of shape (len(train), seq_length, seq_length, 3);
            all rows must share one seq_length for the final stacking to work
    Raises: IndexError if a ")" has no matching "("
    """
    per_row = []
    for row in range(len(train)):
        seq_length = train["seq_length"].iloc[row]
        structure = train["structure"].iloc[row]

        paired = np.zeros([seq_length, seq_length])
        minus1 = np.zeros([seq_length, seq_length])
        plus1 = np.zeros([seq_length, seq_length])

        open_stack = []  # positions of "(" still waiting for their ")"
        for end in range(seq_length):
            symbol = structure[end]
            if symbol == "(":
                open_stack.append(end)
            elif symbol == ")":
                start = open_stack.pop()
                paired[start, end] = 1
                paired[end, start] = 1
                if end - 1 >= 0:
                    minus1[start, end - 1] = 1
                if start + 1 < seq_length:
                    minus1[end, start + 1] = 1
                if start - 1 >= 0:
                    plus1[end, start - 1] = 1
                if end + 1 < seq_length:
                    plus1[start, end + 1] = 1

        per_row.append(np.stack([paired, minus1, plus1], axis=2))

    return np.array(per_row)

In [None]:
### 3. distance matrix: 
def get_distance_matrix(As):
    """
    Inverse positional-distance features, identical for every sample.

    For positions i, j the base value is 1 / (|i - j| + 1); the three output
    channels are that value raised to the powers 1, 2 and 4.

    Input: array `As` whose axis 0 is the batch and axis 1 the sequence length
    Return: float np.array of shape (len(As), n, n, 3) with n = As.shape[1]
    """
    positions = np.arange(As.shape[1])
    gaps = np.array([np.abs(positions[row] - positions) for row in range(len(positions))])
    inverse = 1 / (gaps + 1)
    # Same matrix for every sample: add a batch axis and tile it.
    tiled = np.repeat(inverse[None, :, :], len(As), axis=0)
    return np.stack([tiled ** power for power in [1, 2, 4]], axis=3)

In [None]:
## 4. Node Features
def return_ohe(n, i):
    """Return a length-`n` list of zeros with a 1 at index `i` (list indexing rules apply)."""
    vector = [0 for _ in range(n)]
    vector[i] = 1
    return vector

def get_input(train):
    """
    Build one-hot node features from the sequence and the bpRNA loop type.

    Channels, in order:
      * 4  one-hot nucleotide channels  (A, G, C, U)
      * 7  one-hot loop-type channels   (S, M, I, B, H, E, X)
      * 28 interaction channels, one per (loop type, nucleotide) combination,
        ordered loop-type-major (S+A, S+G, S+C, S+U, M+A, ...)

    Only the ``sequence`` and ``bpRNA_string`` columns are read; the structure
    one-hot that used to be computed here was never part of the output.

    Input: dataframe with equal-length ``sequence`` and ``bpRNA_string`` strings
    Return: float32 np.array of shape (len(train), seq_len, 39)
    Raises: KeyError on a symbol outside the two vocabularies
    """
    base_vocab = ["A", "G", "C", "U"]
    loop_vocab = ["S", "M", "I", "B", "H", "E", "X"]

    def one_hot(column, vocab):
        # One identity-matrix row per symbol; unknown symbols raise KeyError.
        lookup = {symbol: k for k, symbol in enumerate(vocab)}
        eye = np.eye(len(vocab), dtype=np.float32)
        return np.stack([eye[[lookup[ch] for ch in text]] for text in column])

    X_node = np.concatenate(
        [one_hot(train["sequence"], base_vocab), one_hot(train["bpRNA_string"], loop_vocab)],
        axis=2,
    )

    ## interaction
    # Give channel k the weight 2**k, so every (nucleotide, loop type) combination
    # has a unique integer code: 17 = A+S, 18 = G+S, ..., 1032 = U+X.
    code = np.sum(X_node * (2 ** np.arange(X_node.shape[2])[None, None, :]), axis=2)
    n_base = len(base_vocab)
    combo_codes = [
        (1 << base) + (1 << (n_base + loop))
        for loop in range(len(loop_vocab))
        for base in range(n_base)
    ]
    interactions = np.stack([code == value for value in combo_codes], axis=2)

    return np.concatenate([X_node, interactions], axis=2).astype(np.float32)


# Integer class per pair label ('0' = unpaired, then AU / GU / GC): 4 classes in total.
pair_dict  = {x:i for i, x in enumerate(['0','AU','GU','GC'])}

def preprocess_inputs_wa(df):
    """
    Per-position pairing features derived from the structure and the bpp matrix.

    Side effect: adds the columns ``pair_map`` and ``pair_type`` to `df`.
    For every row, ``new_sequences_bpps/{id}.npy`` is loaded from `new_seq_path`.

    Feature columns, in order:
      * 3 one-hot pair-type channels (AU, GU, GC; the unpaired class is dropped)
      * row-wise maximum of the bpp matrix
      * 1 - row-wise sum of the bpp matrix
      * gap between the largest and second-largest value in each bpp row

    Input: dataframe with columns ``id``, ``sequence`` and ``structure``
    Return: np.array of shape (len(df), seq_len, 6)
    """
    df['pair_map'] = df['structure'].apply(mk_pair_map)
    df["pair_type"] = df.apply(lambda row: pair_type(row["sequence"], row["pair_map"]), axis=1)
    pair_codes = df['pair_type'].map(lambda labels: [pair_dict[label] for label in labels]).values

    features = []
    for rna_id, codes in zip(df.id.values, pair_codes):
        bpp = np.load(os.path.join(new_seq_path, f"new_sequences_bpps/{rna_id}.npy"))
        ranked = np.sort(bpp)  # ascending along the last axis
        row_max = bpp.max(-1)
        unpaired_mass = 1 - bpp.sum(-1)
        top_gap = ranked[:, -1] - ranked[:, -2]
        features.append(np.concatenate([
            np_onehot(np.array(codes), 4)[:, 1:],
            row_max.reshape(-1, 1),
            unpaired_mass.reshape(-1, 1),
            top_gap.reshape(-1, 1),
        ], 1))
    return np.array(features)

In [None]:
# https://www.kaggle.com/nullrecurrent/ov-inference-233-new-seq

def get_features(df):
    """
    Assemble the two model inputs for the rows of `df`.

    Input: dataframe with the columns needed by get_input, preprocess_inputs_wa
           and get_structure_adj; bpp matrices are read from
           ``new_sequences_bpps/{id}.npy`` under `new_seq_path`
    Return: (X_node, As), both float32
            X_node - node features: get_input channels followed by the
                     preprocess_inputs_wa channels (concatenated on axis 2)
            As     - adjacency stack on axis 3: the bpp matrix, the structure
                     adjacency channels, then the distance channels
    """
    node_onehot = get_input(df).astype(np.float32)
    node_pairing = preprocess_inputs_wa(df).astype(np.float32)
    X_node = np.concatenate([node_onehot, node_pairing], axis=2)

    bpps = np.array([
        np.load(os.path.join(new_seq_path, f"new_sequences_bpps/{rna_id}.npy"))
        for rna_id in df["id"]
    ])
    structure_adj = get_structure_adj(df).astype(np.float32)
    distance = get_distance_matrix(bpps)
    As = np.concatenate([bpps[:, :, :, None], structure_adj, distance], axis=3).astype(np.float32)
    return X_node, As

# Pre-compute the features one sequence at a time (sequences may differ in
# length, so they cannot share one batch array) and cache them by sequence id.
dict_X = {}
dict_A = {}
for row_pos, rna_id in enumerate(tqdm(test.id)):
    # Select the row by position. `test.loc[[id]]` only works while every id
    # happens to equal its index label. `.copy()` because preprocess_inputs_wa
    # adds columns to the frame it receives.
    df_temp = test.iloc[[row_pos]].copy()
    dict_X[rna_id], dict_A[rna_id] = get_features(df_temp)

# Reference arrays: the model builders read the channel counts from these globals.
first_id = test.id.iloc[0]
X_node, As = dict_X[first_id], dict_A[first_id]

## Model

In [None]:
# GRU/LSTM layer from https://www.kaggle.com/xhlulu/openvaccine-simple-gru-model
# Wave block from https://www.kaggle.com/ragnar123/wavenet-gru-baseline

def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence."""
    recurrent = L.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)


def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that outputs the full sequence."""
    recurrent = L.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

def wave_block(x, filters, kernel_size, n):
    """
    WaveNet-style block of gated dilated convolutions with a residual sum.

    A 1x1 convolution first projects `x` to `filters` channels. Then, for the
    dilation rates 1, 2, ..., 2**(n-1), a tanh convolution and a sigmoid
    convolution are multiplied, passed through a 1x1 convolution and added to
    the running residual, which is returned.
    """
    Conv1D = tf.keras.layers.Conv1D

    x = Conv1D(filters=filters, kernel_size=1, padding='same')(x)
    res_x = x
    for power in range(n):
        gate_kwargs = dict(
            filters=filters,
            kernel_size=kernel_size,
            padding='same',
            dilation_rate=2 ** power,
        )
        # Layers are created in the same order as before: tanh, sigmoid, 1x1.
        tanh_out = Conv1D(activation='tanh', **gate_kwargs)(x)
        sigm_out = Conv1D(activation='sigmoid', **gate_kwargs)(x)
        x = tf.keras.layers.Multiply()([tanh_out, sigm_out])
        x = Conv1D(filters=filters, kernel_size=1, padding='same')(x)
        res_x = tf.keras.layers.Add()([res_x, x])
    return res_x

In [None]:
def attention(x_inner, x_outer, n_factor, dropout):
    """
    Single-head scaled dot-product attention.

    Queries are a 1x1-convolution projection of `x_inner`; keys and values are
    separate projections of `x_outer`, all with `n_factor` channels. Scores are
    divided by sqrt(n_factor) and soft-maxed over the last axis.
    `dropout` is accepted for signature compatibility but not used here.
    """
    def project(tensor):
        # Linear 1x1 convolution; a new layer (new weights) on every call.
        return L.Conv1D(n_factor, 1, activation='linear',
                        kernel_initializer='glorot_uniform',
                        bias_initializer='glorot_uniform',
                        )(tensor)

    # Creation order Q, K, V is kept.
    x_Q = project(x_inner)
    x_K = project(x_outer)
    x_V = project(x_outer)

    x_KT = L.Permute((2, 1))(x_K)
    scale = np.sqrt(n_factor)
    scores = L.Lambda(lambda c: K.batch_dot(c[0], c[1]) / scale)([x_Q, x_KT])
    weights = L.Lambda(lambda c: K.softmax(c, axis=-1))(scores)
    return L.Lambda(lambda c: K.batch_dot(c[0], c[1]))([weights, x_V])

def multi_head_attention(x, y, n_factor, n_head, dropout):
    """
    Attention of `x` over `y` with a residual connection and layer normalisation.

    With one head the attention output is used directly; otherwise `n_head`
    heads of `n_factor // n_head` channels are concatenated and projected back
    to `n_factor` channels by a dense layer. Dropout is applied after the
    normalisation only when `dropout` > 0.
    """
    if n_head == 1:
        att = attention(x, y, n_factor, dropout)
    else:
        per_head = n_factor // n_head
        heads = []
        for _ in range(n_head):
            heads.append(attention(x, y, per_head, dropout))
        merged = L.Concatenate()(heads)
        att = L.Dense(n_factor,
                      kernel_initializer='glorot_uniform',
                      bias_initializer='glorot_uniform',
                      )(merged)

    summed = L.Add()([x, att])
    out = L.LayerNormalization()(summed)
    if dropout > 0:
        out = L.Dropout(dropout)(out)
    return out

def res(x, unit, kernel = 3, rate = 0.1):
    """
    Residual block: x + Dropout(LeakyReLU(LayerNorm(Conv1D(x)))).

    `unit` must match the channel count of `x` for the final addition.
    """
    branch = L.Conv1D(filters=unit, kernel_size=kernel, strides=1,
                      padding="same", activation=None)(x)
    branch = L.LayerNormalization()(branch)
    branch = L.LeakyReLU()(branch)
    branch = L.Dropout(rate)(branch)
    return L.Add()([x, branch])

def forward(x, unit, kernel = 3, rate = 0.1):
    """
    Convolutional feed-forward block followed by a residual block.

    Conv1D -> LayerNormalization -> Dropout -> LeakyReLU, then `res` with the
    same `unit`, `kernel` and `rate`.
    """
    hidden = L.Conv1D(filters=unit, kernel_size=kernel, strides=1,
                      padding="same", activation=None)(x)
    hidden = L.LayerNormalization()(hidden)
    hidden = L.Dropout(rate)(hidden)
    hidden = L.LeakyReLU()(hidden)
    return res(hidden, unit, kernel, rate)

def adj_attn(x, adj, unit, n = 2, rate = 0.1):
    """
    Propagate node features through one adjacency channel for `n` hops.

    Each hop applies `forward` and then multiplies by `adj`; the outputs of all
    hops are concatenated (or used as-is when n == 1) and passed through a
    final `forward`. `rate` is accepted but not used by this function.
    """
    hops = []
    hidden = x
    for _ in range(n):
        hidden = forward(hidden, unit)
        hidden = tf.matmul(adj, hidden)
        hops.append(hidden)

    merged = hops[0] if n == 1 else L.Concatenate()(hops)
    return forward(merged, unit)


def get_base(config, dim=None):
    """
    Build the shared encoder that maps (node features, adjacency stack) to
    per-position features.

    `config` is accepted but not read here. `dim` is the sequence length
    (None = variable). The channel counts come from the module-level `X_node`
    and `As` arrays, which therefore must exist before this is called.

    NOTE(review): saved weights are loaded into this graph, so the order in
    which layers are created below should not be changed without re-checking
    that `load_weights` still matches.
    """
    node = tf.keras.Input(shape = (dim, X_node.shape[2]), name = "node")
    adj = tf.keras.Input(shape = (dim, dim, As.shape[3]), name = "adj")
    
    # One extra, learned adjacency channel computed from the given channels.
    adj_learned = L.Dense(1, "relu")(adj)
    adj_all = L.Concatenate(axis = 3)([adj, adj_learned])
        
    # Collects the raw node input plus the output of every graph stage.
    xs = []
    xs.append(node)
    # Convolution stack with growing kernel sizes and shrinking widths.
    x1 = forward(node, 128, kernel = 3, rate = 0.0)
    x2 = forward(x1, 64, kernel = 6, rate = 0.0)
    x3 = forward(x2, 32, kernel = 15, rate = 0.1)
    x4 = forward(x3, 16, kernel = 30, rate = 0.1)
    x = L.Concatenate()([x1, x2, x3, x4])
    
    for unit in [64, 32]:
        x_as = []
        # One adjacency-propagation branch per adjacency channel.
        for i in range(adj_all.shape[3]):
            x_a = adj_attn(x, adj_all[:, :, :, i], unit, rate = 0.0)
            x_as.append(x_a)
        # Plus one purely convolutional branch with a wide kernel.
        x_c = forward(x, unit, kernel = 30)
        
        x = L.Concatenate()(x_as + [x_c])
        x = forward(x, unit)
        # Self-attention with 4 heads and no dropout.
        x = multi_head_attention(x, x, unit, 4, 0.0)
        xs.append(x)
        
    x = L.Concatenate()(xs)

    model = tf.keras.Model(inputs = [node, adj], outputs = [x])
    return model


def get_ae_model(base, config, dim=None):
    """
    Wrap `base` in a denoising auto-encoder that reconstructs the node features.

    The node input passes through SpatialDropout1D(0.2) before the encoder; a
    sigmoid head predicts every node channel. The model's only output is the
    reconstruction loss itself, so the compiled loss simply returns the
    prediction and ignores the target. `config` is not read here.
    """
    node = tf.keras.Input(shape = (dim, X_node.shape[2]), name = "node")
    adj = tf.keras.Input(shape = (dim, dim, As.shape[3]), name = "adj")

    x = base([L.SpatialDropout1D(0.2)(node), adj])
    x = forward(x, 64, rate = 0.2)
    p = L.Dense(X_node.shape[2], "sigmoid")(x)
    
    # Cross-entropy-style loss with the positive term weighted by 20;
    # 1e-4 keeps the logarithms finite.
    loss = - tf.reduce_mean(20 * node * tf.math.log(p + 1e-4) + (1 - node) * tf.math.log(1 - p + 1e-4))
    model = tf.keras.Model(inputs = [node, adj], outputs = [loss])
    
    opt = get_optimizer()
    # The "prediction" y already is the loss value; the target t is ignored.
    model.compile(optimizer = opt, loss = lambda t, y : y)
    return model


def get_model(base, config, dim=None):
    """
    Put a prediction head on top of `base` and compile it with col_mcrmse_loss.

    The head is selected by the module-level `ADDITIONAL_LAYER`, which must be
    set before calling: 'grux2', 'lstmx1', 'lstmx2', 'wave' or 'wave_small'.
    Any other value adds no extra layers between the `forward` block and the
    final Dense(5). `config` is not read here.

    Return: compiled tf.keras.Model with inputs [node, adj] and 5 outputs per
            position.
    """
    node = tf.keras.Input(shape = (dim, X_node.shape[2]), name = "node")
    adj = tf.keras.Input(shape = (dim, dim, As.shape[3]), name = "adj")
    
    x = base([node, adj])
    x = forward(x, 128, rate = 0.2)
    
    if ADDITIONAL_LAYER == 'grux2':
        x = gru_layer(128, dropout=0.2)(x)
        x = gru_layer(128, dropout=0.2)(x)
        
    elif ADDITIONAL_LAYER == 'lstmx1':
        x = lstm_layer(128, dropout=0.2)(x)
        
    elif ADDITIONAL_LAYER == 'lstmx2':
        x = lstm_layer(128, dropout=0.2)(x)
        x = lstm_layer(128, dropout=0.2)(x)
        
    elif ADDITIONAL_LAYER == 'wave':
        # Four wave blocks (widening filters, fewer dilations), each followed
        # by batch normalisation and dropout.
        dropout = 0.2
        x = wave_block(x, 16, 3, 12)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)

        x = wave_block(x, 32, 3, 8)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)

        x = wave_block(x, 64, 3, 4)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)

        x = wave_block(x, 128, 3, 1)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)
        
    elif ADDITIONAL_LAYER == 'wave_small':
        # Same as 'wave' without the first (16-filter) block and with less dropout.
        dropout = 0.1
        x = wave_block(x, 32, 3, 8)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)

        x = wave_block(x, 64, 3, 4)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)

        x = wave_block(x, 128, 3, 1)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Dropout(dropout)(x)
    # One linear output per target column.
    x = L.Dense(5)(x)

    model = tf.keras.Model(inputs = [node, adj], outputs = [x])
    
    opt = get_optimizer()
    model.compile(optimizer = opt, loss = col_mcrmse_loss)
    return model

def get_optimizer():
    """Return a fresh Adam optimizer with default settings."""
    return tf.optimizers.Adam()

## Inference

In [None]:
# `config` is passed to the model builders, which do not read it.
config = {}
# Weight folders and, position by position, the head type each one was trained with.
model_folders = ['gru_3d_tta_denoise', 'lstm_3d_tta', 'lstm_3d_tta_denoise']
additional_layers = ['grux2', 'lstmx2', 'lstmx2']
# One prediction DataFrame per (model folder, fold), in loop order.
prediction_list = []

# ADDITIONAL_LAYER is deliberately the loop variable: get_model reads it as a global.
for model_folder, ADDITIONAL_LAYER in zip(model_folders, additional_layers):
    print("------------------------------")
    print(model_folder, ADDITIONAL_LAYER)
    print("------------------------------")
    
    # Build the graph once per folder; the per-fold weights are swapped in below.
    base = get_base(config)
    base.load_weights(f"/kaggle/input/openvaccinemodelweights/{model_folder}/base_ae")
    model = get_model(base, config)
        
    for fold_ in range(NFOLDS):  
        model.load_weights(f"/kaggle/input/openvaccinemodelweights/{model_folder}/model_{fold_}.h5")
        preds_ls = []
        for rna_id in tqdm(test.id):
            X_node, As = dict_X[rna_id], dict_A[rna_id]
            # Test-time augmentation: average the prediction on the sequence with the
            # (flipped back) prediction on the reversed sequence; [0] drops the batch axis.
            single_pred = ((model.predict([X_node, As]) + reverse_2D(model.predict([reverse_2D(X_node), reverse_3D(As)])))/2)[0]
            single_df = pd.DataFrame(single_pred, columns=pred_cols)
            # Row key in the submission format: "<id>_<position>".
            single_df['id_seqpos'] = [f'{rna_id}_{x}' for x in range(single_df.shape[0])]
            preds_ls.append(single_df)
            del single_pred, single_df
            gc.collect()
            
        preds_df = pd.concat(preds_ls).set_index('id_seqpos')
        # Keep a per-fold copy on disk as well as in memory.
        preds_df.to_csv(f"pred_{model_folder}_{fold_}.csv")
        prediction_list.append(preds_df)
        del preds_df, preds_ls
        gc.collect()
    # Release the graph before the next folder's model is built.
    del base, model
    gc.collect()
    K.clear_session()

In [None]:
# Predictions are left unclipped here (the competition submission capped the downside at -0.5).
# Equal-weight average over every (model folder, fold) prediction, aligned on id_seqpos.
N_PREDS = NFOLDS * len(model_folders)

final_prediction = prediction_list[0].copy() / N_PREDS
for pred_idx in range(1, N_PREDS):
    final_prediction = final_prediction + prediction_list[pred_idx] / N_PREDS
final_prediction = final_prediction.reset_index()

In [None]:
# Keep the template's row order and keys; attach the averaged predictions by id_seqpos.
submission = pd.merge(sub[['id_seqpos']], final_prediction, on=['id_seqpos'])
submission.to_csv('submission.csv', index=False)