In [None]:
# Importing basic libraries
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

from numpy import zeros, newaxis
from pathlib import Path
from collections import OrderedDict

# fix random seed for reproducibility
# The same seed value is given to NumPy's global RNG and to TensorFlow's global RNG.
SEED = 101
np.random.seed(SEED)
tf.random.set_seed(SEED)
               
# Importing the training set
# DATA_DIR is the competition input folder; BPPS_DIR is its "bpps" sub-folder of .npy files.
DATA_DIR = Path("../input/stanford-covid-vaccine/")
BPPS_DIR = DATA_DIR / "bpps"

# Both JSON files are read in JSON-lines mode (one record per line).
train = pd.read_json(DATA_DIR / "train.json", lines=True)
test = pd.read_json(DATA_DIR / "test.json", lines=True)
# Augmentation table; aug_data() filters it by 'id', merges it on ('id', 'sequence') and reads its 'cnt' column.
aug_df = pd.read_csv('/kaggle/input/how-to-generate-augmentation-data/aug_data.csv')

# NOTE(review): bppm_paths is not referenced again in the code visible here.
bppm_paths = list(BPPS_DIR.glob("*.npy"))

# settings
# debug: truncates train/test to 20 rows and reduces cluster/fold counts in later cells.
# TPU: enables the TPU initialisation cell and builds models inside the TPU strategy scope.
debug = False
TPU = False

In [None]:
# Light data exploration, to check features lengths
# Displays the string lengths of the three per-base annotations for the row labelled 0.
len(train['sequence'][0]), len(train['structure'][0]), len(train['predicted_loop_type'][0])

In [None]:
# Examine features
# Displays the raw sequence, structure and loop-type strings for the row labelled 50.
train['sequence'][50], train['structure'][50] ,train['predicted_loop_type'][50]

In [None]:
# Data Processing
# Set alphabets
# integer_encoder() lowercases an alphabet and maps each character to its position;
# when a character is repeated, its LAST position wins. So the repeated filler
# characters below shift the integer codes:
#   alphabet_struc '()XXX.'          -> '(':0, ')':1, 'x':4, '.':5
#   alphabet_loop  'MXB...I...S.H.E' -> m:0, x:1, b:2, i:6, s:10, h:12, '.':13, e:14
# build_model() sizes each Embedding as len(alphabet) + 1, so the filler also
# determines the embedding vocabulary size.
# NOTE(review): `alphabet` is not referenced again in the code visible here.
alphabet = 'AGCU().MXBISHE'
alphabet_rna = 'AGCU'
alphabet_struc = '()XXX.'
alphabet_loop = 'MXB...I...S.H.E'

# Set target_cols
# target_cols are the label columns used for training, prediction frames and the submission.
# NOTE(review): non_target_cols is an exact copy of target_cols and neither it nor
# marked_target_cols is referenced in the code visible here — confirm intent.
target_cols  = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
non_target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']
marked_target_cols  = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']

def get_bppm(id_):
    """Load the ``.npy`` file named after ``id_`` from ``BPPS_DIR``.

    Args:
        id_: Identifier used as the file stem (``<id_>.npy``).

    Returns:
        The object returned by ``np.load`` for that file.
    """
    bppm_file = BPPS_DIR / f"{id_}.npy"
    return np.load(bppm_file)

def get_bpps_nb(id_):
    """Return the standardised fraction of positive bppm entries per column.

    For the array loaded by ``get_bppm(id_)``, counts the entries greater than
    zero along axis 0, divides by the size of axis 0, then subtracts a fixed
    mean and divides by a fixed standard deviation.

    Args:
        id_: Identifier forwarded to ``get_bppm``.

    Returns:
        The normalised per-column values.
    """
    # from https://www.kaggle.com/symyksr/openvaccine-deepergcn
    nb_mean = 0.077522  # mean of bpps_nb across all training data
    nb_std = 0.08914    # std of bpps_nb across all training data
    matrix = get_bppm(id_)
    positive_fraction = (matrix > 0).sum(axis=0) / matrix.shape[0]
    return (positive_fraction - nb_mean) / nb_std

def mk_pair_map(structure, type='pm'):
    """Build a per-position pairing array from a bracket structure string.

    Every ``(`` is matched with the next unmatched ``)`` (stack order).
    Characters other than ``(`` and ``)`` are ignored.

    Args:
        structure: Structure string made of ``(``, ``)`` and other characters.
        type: ``'pm'`` to get, for each paired position, the index of its
            partner; ``'pd'`` to get the distance ``i - j`` between the closing
            index ``i`` and the opening index ``j`` (same value on both ends).

    Returns:
        Integer ``np.ndarray`` of length ``len(structure)``. Unpaired positions
        hold 0 in both modes (so in ``'pm'`` mode 0 is also what a position
        paired with index 0 holds).

    Raises:
        ValueError: if ``type`` is neither ``'pm'`` nor ``'pd'``.
        IndexError: if a ``)`` has no matching ``(``.
    """
    # Fail loudly instead of silently returning None for an unknown mode.
    if type not in ('pm', 'pd'):
        raise ValueError(f"type must be 'pm' or 'pd', got {type!r}")

    # Local names deliberately avoid `pd`, which is the pandas alias in this notebook.
    partner = np.full(len(structure), 0, dtype=int)
    distance = np.full(len(structure), 0, dtype=int)
    open_stack = []
    for i, s in enumerate(structure):
        if s == "(":
            open_stack.append(i)
        elif s == ")":
            j = open_stack.pop()
            partner[i] = j
            partner[j] = i
            distance[i] = i - j
            distance[j] = i - j
    return partner if type == 'pm' else distance

def get_structure_adj(seq_length, structure, sequence):
    """Build a symmetric adjacency matrix of the base pairs in ``structure``.

    Brackets are matched in stack order over the first ``seq_length``
    positions. For every matched pair ``(start, j)`` the entries
    ``[start, j]`` and ``[j, start]`` are set to 1.

    Args:
        seq_length: Number of positions to scan; also the matrix side length.
        structure: Bracket structure string.
        sequence: Base string, indexed at the same positions as ``structure``.

    Returns:
        ``np.ndarray`` of shape ``(seq_length, seq_length)`` with 1 at paired
        positions and 0 elsewhere (NumPy's default float dtype).

    Raises:
        KeyError: if a matched pair of bases is not one of
            A-U, U-A, C-G, G-C, G-U, U-G.
        IndexError: if a ``)`` has no matching ``(``.
    """
    # Pairs the function has always accepted; any other pair raises KeyError.
    allowed_pairs = {
        ("A", "U"), ("C", "G"), ("U", "G"),
        ("U", "A"), ("G", "C"), ("G", "U"),
    }
    # A single accumulator replaces six per-pair-type matrices that were
    # stacked and summed; each pair only ever set one entry per direction.
    adjacency = np.zeros([seq_length, seq_length])
    open_stack = []
    for j in range(seq_length):
        if structure[j] == "(":
            open_stack.append(j)
        elif structure[j] == ")":
            start = open_stack.pop()
            pair = (sequence[start], sequence[j])
            if pair not in allowed_pairs:
                raise KeyError(pair)
            adjacency[start, j] = 1
            adjacency[j, start] = 1
    return adjacency
    
def preprocess_data(data):
    """Return a copy of the rows whose ``SN_filter`` equals 1, re-indexed from 0.

    Args:
        data: DataFrame with an ``SN_filter`` column.

    Returns:
        New DataFrame holding only the matching rows; the input is not modified.
    """
    passed_filter = data['SN_filter'] == 1
    return data.loc[passed_filter].copy().reset_index(drop=True)

def step(seq_length):
    """Return the positions ``0 .. int(seq_length) - 1`` divided by ``seq_length``.

    Args:
        seq_length: Sequence length; truncated with ``int`` for the range,
            used as-is for the division.

    Returns:
        List of floats.
    """
    return [position / seq_length for position in range(int(seq_length))]

def preprocess_features(data):
    """Add derived per-base columns to ``data`` and stack seven of them.

    Side effect: ``data`` is modified in place; the columns ``seq``, ``struc``,
    ``loop``, ``step``, ``pair_dist``, ``pair_map``, ``bppm_max``, ``bppm_sum``,
    ``bppm_nb`` and ``adj_struc`` are created or overwritten.

    Args:
        data: DataFrame with ``id``, ``sequence``, ``structure``,
            ``predicted_loop_type`` and ``seq_length`` columns.

    Returns:
        Array built by giving each of seq, struc, loop, adj_struc, bppm_max,
        bppm_sum and bppm_nb a trailing axis and concatenating them, in that
        order, along axis 2.
    """
    # Integer-encode the three string annotations with their respective alphabets.
    encodings = (
        ('seq', 'sequence', alphabet_rna),
        ('struc', 'structure', alphabet_struc),
        ('loop', 'predicted_loop_type', alphabet_loop),
    )
    for target, source, symbols in encodings:
        data[target] = data.apply(lambda row: integer_encoder(row[source], symbols), axis=1)

    data['step'] = data.apply(lambda row: step(row['seq_length']), axis=1)  # Doesn't help, not used.
    data['pair_dist'] = data.structure.apply(mk_pair_map, type='pd')  # Not used.
    data['pair_map'] = data.structure.apply(mk_pair_map, type='pm')

    # Column-wise reductions of the per-id bppm array, plus its normalised positive-entry fraction.
    data['bppm_max'] = data.apply(lambda row: get_bppm(row['id']).max(0), axis=1)
    data['bppm_sum'] = data.apply(lambda row: get_bppm(row['id']).sum(0), axis=1)
    data['bppm_nb'] = data.apply(lambda row: get_bpps_nb(row['id']), axis=1)

    # Column sums of the pairing adjacency matrix.
    data['adj_struc'] = data.apply(
        lambda row: get_structure_adj(row['seq_length'], row['structure'], row['sequence']).sum(0),
        axis=1,
    )

    stacked_columns = ('seq', 'struc', 'loop', 'adj_struc', 'bppm_max', 'bppm_sum', 'bppm_nb')
    channels = [np.array(data[name].values.tolist())[:, :, newaxis] for name in stacked_columns]
    return np.concatenate(channels, axis=2)

def preprocess_labels(data):
    """Return the ``target_cols`` columns of ``data`` as a NumPy array.

    Args:
        data: DataFrame containing every column named in ``target_cols``.

    Returns:
        ``np.array`` built from the nested-list form of those columns.
    """
    target_frame = data[target_cols].copy()
    nested_values = target_frame.values.tolist()
    return np.array(nested_values)

def integer_encoder(my_string, alphabet):
    """Encode each character of ``my_string`` as its position in ``alphabet``.

    Both arguments are lowercased first. If a character occurs more than once
    in ``alphabet``, its last position is used.

    Args:
        my_string: Text to encode.
        alphabet: String whose character positions define the codes.

    Returns:
        ``np.array`` of integer codes, one per character of ``my_string``.

    Raises:
        KeyError: if ``my_string`` contains a character absent from ``alphabet``.
    """
    lookup = {}
    for index, symbol in enumerate(alphabet.lower()):
        lookup[symbol] = index  # later duplicates overwrite earlier ones

    # integer encode input data
    codes = [lookup[symbol] for symbol in my_string.lower()]
    return np.array(codes)

In [None]:
def aug_data(df):
    """Append the matching rows of the global ``aug_df`` to ``df``.

    Rows of ``aug_df`` whose ``id`` occurs in ``df`` are left-merged with ``df``
    on ``('id', 'sequence')``, after dropping ``df``'s own ``structure`` and
    ``predicted_loop_type`` so the augmented values are kept. They are then
    appended below the original rows.

    Side effect: the passed-in ``df`` gains ``cnt``, ``log_gamma`` and
    ``score`` columns (original rows get ``log_gamma = 100`` and
    ``score = 1.0``).

    Args:
        df: DataFrame with ``id``, ``sequence``, ``structure`` and
            ``predicted_loop_type`` columns.

    Returns:
        New DataFrame with the original rows followed by the augmented rows.
        Index labels are kept as-is, so they are not unique.
    """
    new_df = aug_df[aug_df['id'].isin(df['id'])]

    # Drop the original annotations so the merge keeps the augmented ones.
    target_df = df.drop(columns=['structure', 'predicted_loop_type'])
    new_df = new_df.merge(target_df, on=['id', 'sequence'], how='left')

    df['cnt'] = df['id'].map(new_df[['id', 'cnt']].set_index('id').to_dict()['cnt'])
    df['log_gamma'] = 100
    df['score'] = 1.0
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    return pd.concat([df, new_df[df.columns]])
# Rebind train/test to their augmented versions.
# NOTE(review): this cell rebinds its own inputs, so re-running it without reloading
# the data applies aug_data() to frames that were already augmented.
train = aug_data(train)
test = aug_data(test)

In [None]:
# Prepare features
# Only rows with SN_filter == 1 are used here (see preprocess_data).
# build_model() later reads train_features.shape[2] as the number of input channels.
train_features = preprocess_features(preprocess_data(train))

# Prepare labels data
# The transpose swaps the last two axes of the label array.
train_labels = preprocess_labels(preprocess_data(train)).transpose(0,2,1)

In [None]:
# Display the shapes of the feature and label arrays built above.
train_features.shape, train_labels.shape

In [None]:
def plot_structures(features: np.ndarray, labels: np.ndarray):
    """Show the transposed ``features`` and ``labels`` arrays side by side.

    Args:
        features: 2-D array drawn in the left panel (titled "Features").
        labels: 2-D array drawn in the right panel (titled "Labels").
    """
    fig, (features_ax, labels_ax) = plt.subplots(1, 2, figsize=(20, 10))
    panels = (
        (features_ax, features, "Features"),
        (labels_ax, labels, "Labels"),
    )
    for axis, image, title in panels:
        axis.imshow(image.T)
        axis.set_title(title)
    plt.show()

In [None]:
# Examine features
# Displays the raw strings stored under index label 2.
# NOTE(review): aug_data() appends rows without resetting the index, so label 2 may
# match more than one row here — confirm this shows what is intended.
train['sequence'][2], train['structure'][2] ,train['predicted_loop_type'][2]

In [None]:
# Take the sample at position 2 of the prepared arrays (slice of length 1, then drop
# the leading axis) and plot its features next to its labels.
feature = train_features[2:3,:,:]
feature = feature[0,:,:]
labels = train_labels[2:3,:,:]
labels = labels[0,:,:]

plot_structures(feature, labels)

In [None]:
# Importing ML libraries
import keras
import keras.backend as K
import tensorflow as tf

import tensorflow.keras.layers as L
from keras.models import Sequential
from keras.layers import Dropout, Embedding, LSTM, Dense, Bidirectional, Activation, Flatten, GRU
from keras.layers import BatchNormalization, SpatialDropout1D, InputLayer, Reshape, Lambda
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import SGD, Adam, Adadelta, RMSprop
from keras.layers.convolutional import Convolution1D, MaxPooling1D

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.cluster import KMeans

In [None]:
# Loss function
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The squared error is averaged over axis 1, square-rooted, and the result is
    averaged over the next remaining axis, giving one loss value per sample.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor.
            # assumes both are (batch, positions, targets), as produced in this notebook — TODO confirm

    Returns:
        Tensor with one value per leading-axis entry.
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
    return tf.reduce_mean(per_column_rmse, axis=1)

In [None]:
# Only runs when the TPU flag from the settings cell is True; it defines `tpu_strategy`,
# which train_and_predict() uses to build models inside the strategy scope.
if TPU:
    # detect and init the TPU
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    
    # instantiate a distribution strategy
    # NOTE(review): this uses the `experimental` TPUStrategy path — confirm it exists in the installed TF version.
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [None]:
# build model functions
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that outputs the full sequence.

    Args:
        hidden_dim: Number of units in each direction's GRU.
        dropout: Dropout rate passed to the GRU.
    """
    recurrent = L.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that outputs the full sequence.

    Args:
        hidden_dim: Number of units in each direction's LSTM.
        dropout: Dropout rate passed to the LSTM.
    """
    recurrent = L.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return L.Bidirectional(recurrent)

def conv1d_layer(filters, kernel_size):
    """Return a 1-D convolution layer with ``'valid'`` padding.

    Args:
        filters: Number of output filters.
        kernel_size: Width of the convolution window.
    """
    conv_options = dict(filters=filters, kernel_size=kernel_size, padding='valid')
    return L.Conv1D(**conv_options)

def build_model(seq_len=107, pred_len=68, dropout1=0.0, dropout2=0.3, embed_dim=230, hidden_dim1=220, hidden_dim2=330, 
                type=0, filters=255, kernel_size=5):
    """Build and compile the two-layer bidirectional recurrent model.

    The first three input channels (sequence, structure, loop codes) are each
    passed through their own Embedding; the remaining channels are used as-is.
    Everything is concatenated, fed through two bidirectional recurrent layers,
    truncated to the first ``pred_len`` positions and projected to 5 outputs.

    Args:
        seq_len: Input sequence length.
        pred_len: Number of leading positions that are predicted.
        dropout1: Dropout rate of the first recurrent layer.
        dropout2: Dropout rate of the second recurrent layer.
        embed_dim: Output size of each Embedding.
        hidden_dim1: Units of the first recurrent layer (per direction).
        hidden_dim2: Units of the second recurrent layer (per direction).
        type: Recurrent stack: 0 = LSTM->GRU, 1 = GRU->GRU, 2 = GRU->LSTM,
            3 = LSTM->LSTM.
        filters: Not used by this function; kept for interface compatibility.
        kernel_size: Not used by this function; kept for interface compatibility.

    Returns:
        A compiled ``tf.keras.Model`` (Adam optimizer, ``MCRMSE`` loss).

    Raises:
        ValueError: if ``type`` is not one of 0, 1, 2, 3.

    Note:
        The number of input channels is read from the global
        ``train_features``, which must exist before this is called.
    """
    # (first layer factory, second layer factory) for each architecture id.
    recurrent_stacks = {
        0: (lstm_layer, gru_layer),
        1: (gru_layer, gru_layer),
        2: (gru_layer, lstm_layer),
        3: (lstm_layer, lstm_layer),
    }
    # Reject unknown ids up front rather than failing later on an unbound `hidden`.
    if type not in recurrent_stacks:
        raise ValueError(f"type must be one of {sorted(recurrent_stacks)}, got {type!r}")

    lr = 0.0010953574938066576

    inputs = L.Input(shape=(seq_len, train_features.shape[2]))

    def _embed_channel(channel, vocab_size):
        # Embed one single-channel integer slice and merge the last two axes
        # so the result is rank 3 again.
        embedded = L.Embedding(input_dim=vocab_size + 1, output_dim=embed_dim)(channel)
        return tf.reshape(
            embedded,
            shape=(-1, embedded.shape[1], embedded.shape[2] * embedded.shape[3]),
        )

    # split integer and float features and concatenate them later.
    reshaped_seq = _embed_channel(inputs[:, :, :1], len(alphabet_rna))
    reshaped_struc = _embed_channel(inputs[:, :, 1:2], len(alphabet_struc))
    reshaped_loop = _embed_channel(inputs[:, :, 2:3], len(alphabet_loop))

    float_fea = inputs[:, :, 3:]
    concat = L.concatenate([reshaped_seq, reshaped_struc, reshaped_loop, float_fea], axis=2)

    first_layer, second_layer = recurrent_stacks[type]
    hidden = first_layer(hidden_dim1, dropout1)(concat)
    hidden = second_layer(hidden_dim2, dropout2)(hidden)

    # Only the first pred_len positions are predicted.
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='linear')(truncated)
    model = tf.keras.Model(inputs=inputs, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(tf.keras.optimizers.Adam(learning_rate=lr), loss=MCRMSE)

    return model

In [None]:
# In debug mode keep only the first 20 rows of each frame so the rest of the notebook runs quickly.
if debug:
    train = train[:20]
    test = test[:20]

In [None]:
# Build one model with the default arguments and print its layer summary.
model = build_model()
model.summary()

In [None]:
# Fewer clusters in debug mode.
if debug:
    n_clusters = 20
else:
    n_clusters = 200

# Channel 2 of the feature array is the integer-encoded loop type (third channel stacked
# by preprocess_features). Note that preprocess_features(train) also adds its derived
# columns to `train` in place. The resulting cluster ids are used as the groups for
# GroupKFold in train_and_predict().
kmeans_model = KMeans(n_clusters=n_clusters, random_state=110).fit(preprocess_features(train)[:,:,2]) #clustering with loop
train['cluster_id'] = kmeans_model.labels_

In [None]:
batch_size = 64

def train_and_predict(type = 0, FOLD_N = 5, Ver=1):
    """Train one model per GroupKFold fold and predict hold-out and test data.

    For each fold a model is fitted on the training part (weighted by
    ``np.log(signal_to_noise + 1.1) * 2``) and validated on the hold-out rows
    with ``SN_filter == 1``. The best weights (lowest ``val_loss``) are saved
    to ``model{Ver}_cv{cv}.h5`` and loaded into two additional models that
    predict every position of the length-107 and length-130 test sequences.
    Test predictions are averaged over folds.

    Reads the globals ``train``, ``test``, ``debug``, ``TPU``, ``batch_size``,
    ``target_cols`` and, when ``TPU`` is set, ``tpu_strategy``.

    Args:
        type: Architecture id forwarded to ``build_model``.
        FOLD_N: Number of folds (forced to 2 when ``debug`` is True).
        Ver: Version tag used in the checkpoint file name.

    Returns:
        Tuple ``(holdouts, holdout_preds, test_107, preds_107, test_130,
        preds_130)``: per-fold hold-out frames, per-fold predictions for all
        hold-out rows, and the two test frames with their fold-averaged
        predictions.
    """
    if debug:
        FOLD_N = 2

    gkf = GroupKFold(n_splits=FOLD_N)

    test_107 = test.query("seq_length == 107").copy()
    test_130 = test.query("seq_length == 130").copy()

    inputs_107 = preprocess_features(test_107)
    inputs_130 = preprocess_features(test_130)

    holdouts = []
    holdout_preds = []

    def _build_fold_models():
        # Training model plus two full-length prediction models with the same layers.
        return (
            build_model(type=type),
            build_model(seq_len=107, pred_len=107, type=type),
            build_model(seq_len=130, pred_len=130, type=type),
        )

    for cv, (train_index, test_index) in enumerate(gkf.split(train, train['deg_Mg_pH10'], train['cluster_id'])):

        trn = train.iloc[train_index].copy()
        X_train = preprocess_features(trn)
        y_train = np.array(trn[target_cols].values.tolist()).transpose((0, 2, 1))

        val = train.iloc[test_index].copy()
        # Features for every hold-out row (used for the returned predictions) ...
        x_val_all = preprocess_features(val)
        # ... while validation during fitting only uses rows with SN_filter == 1.
        val = val[val.SN_filter == 1]
        X_test = preprocess_features(val)
        y_test = np.array(val[target_cols].values.tolist()).transpose((0, 2, 1))
        sample_weight = np.log(trn.signal_to_noise+1.1)*2

        if TPU:
            with tpu_strategy.scope():
                model, model_107, model_130 = _build_fold_models()
        else:
            model, model_107, model_130 = _build_fold_models()

        checkpoint_path = f'model{Ver}_cv{cv}.h5'
        history = model.fit(
            X_train, y_train,
            validation_data = (X_test, y_test),
            batch_size=batch_size,
            epochs=105,
            sample_weight=sample_weight,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=7, restore_best_weights=True),
                tf.keras.callbacks.ReduceLROnPlateau(),
                # save_best_only keeps the lowest-val_loss weights on disk. Without it the
                # file holds the LAST epoch, and the load_weights calls below would replace
                # the best weights restored by EarlyStopping with worse ones.
                tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True, save_best_only=True)
            ]
        )

        model.load_weights(checkpoint_path)
        model_107.load_weights(checkpoint_path)
        model_130.load_weights(checkpoint_path)

        holdouts.append(train.iloc[test_index].copy())
        holdout_preds.append(model.predict(x_val_all))

        # Accumulate the fold-averaged test predictions.
        if cv == 0:
            preds_107 = model_107.predict(inputs_107)/FOLD_N
            preds_130 = model_130.predict(inputs_130)/FOLD_N
        else:
            preds_107 += model_107.predict(inputs_107)/FOLD_N
            preds_130 += model_130.predict(inputs_130)/FOLD_N
    return holdouts, holdout_preds, test_107, preds_107, test_130, preds_130


In [None]:
# Accumulators: hold-out frames/predictions and test frames/predictions from every model type.
val_df, val_preds, test_df, test_preds = [], [], [], []

# Number of architecture ids to train; the loop index is passed as `type` to train_and_predict.
n_model = 4

for i in range(n_model):
    holdouts, holdout_preds, test_107, preds_107, test_130, preds_130 = train_and_predict(i)
    val_df += holdouts
    val_preds += holdout_preds
    # test_df and test_preds are appended in the same order so they can be zipped later.
    test_df.append(test_107)
    test_df.append(test_130)
    test_preds.append(preds_107)
    test_preds.append(preds_130)

In [None]:
# Test predictions: one small frame per (frame row, position), keyed by "<id>_<position>".
preds_ls = []
for df, preds in zip(test_df, test_preds):
    for i, uid in enumerate(df.id):
        # preds[i] is the prediction for the i-th row of df (positional match).
        single_pred = preds[i]
        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]
        preds_ls.append(single_df)
# Rows sharing an id_seqpos are averaged.
preds_df = pd.concat(preds_ls).groupby('id_seqpos').mean().reset_index()
# .mean() is for
# 1, Predictions from multiple models
# 2, TTA (augmented test data)

# Hold-out predictions: same construction, plus the SN_filter value of the first row
# in df that has the same id.
preds_ls = []
for df, preds in zip(val_df, val_preds):
    for i, uid in enumerate(df.id):
        single_pred = preds[i]
        single_df = pd.DataFrame(single_pred, columns=target_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]
        # NOTE(review): this filters the whole frame once per row (quadratic in the frame size).
        single_df['SN_filter'] = df[df['id'] == uid].SN_filter.values[0]
        preds_ls.append(single_df)
# SN_filter is averaged together with the predictions by the groupby below.
holdouts_df = pd.concat(preds_ls).groupby('id_seqpos').mean().reset_index()

In [None]:
# Select the submission columns in a fixed order and write them without the index.
submission = preds_df[['id_seqpos', 'reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']]
submission.to_csv(f'submission.csv', index=False)
print(f'wrote to submission.csv')

In [None]:
def print_mse(prd, n_positions=68):
    """Print RMSE and MSE of predictions against the training ground truth.

    Re-reads ``train.json``, builds one ground-truth row per (id, position)
    for the first ``n_positions`` positions of ``reactivity``,
    ``deg_Mg_pH10`` and ``deg_Mg_50C``, inner-joins it with ``prd`` on
    ``id_seqpos`` and prints, per column, ``col rmse mse``. A final line
    prints the mean RMSE and mean MSE over the three columns.

    Args:
        prd: DataFrame with ``id_seqpos`` and the three prediction columns.
        n_positions: Number of leading positions to score per id (default 68).

    Returns:
        None; results are printed.
    """
    val = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)

    # One pass over the first row of each id (same rows and order as iterating
    # over val['id'].unique()), instead of re-filtering the frame for every id.
    first_rows = val.drop_duplicates(subset='id')
    val_data = []
    for mol_id, reactivity, deg_Mg_pH10, deg_Mg_50C in zip(
        first_rows['id'],
        first_rows['reactivity'],
        first_rows['deg_Mg_pH10'],
        first_rows['deg_Mg_50C'],
    ):
        for i in range(n_positions):
            val_data.append({
                'id_seqpos': mol_id + '_' + str(i),
                'reactivity_gt': reactivity[i],
                'deg_Mg_pH10_gt': deg_Mg_pH10[i],
                'deg_Mg_50C_gt': deg_Mg_50C[i],
            })
    val_data = pd.DataFrame(val_data)
    val_data = val_data.merge(prd, on='id_seqpos')

    rmses = []
    mses = []
    for col in ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']:
        # Compute the mean squared error once and derive the RMSE from it.
        mse = ((val_data[col] - val_data[col+'_gt']) ** 2).mean()
        rmse = mse ** .5
        rmses.append(rmse)
        mses.append(mse)
        print(col, rmse, mse)
    print(np.mean(rmses), np.mean(mses))

In [None]:
# Score the hold-out predictions over all rows.
print_mse(holdouts_df)

In [None]:
# Score only the hold-out rows whose (averaged) SN_filter equals 1.
print_mse(holdouts_df[holdouts_df.SN_filter == 1])