In [None]:
import pandas as pd
import numpy as np

# Both files are read as line-delimited JSON (`lines=True`): one record per line.
train = pd.read_json('../input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json('../input/stanford-covid-vaccine/test.json', lines=True)

# Quick sanity check of the (rows, columns) of each frame.
print('train shapes: ', train.shape)
print('test shapes: ', test.shape)

In [None]:
# Overview of the training frame's columns and dtypes.
train.info()

In [None]:
# First training row, kept as a worked example for the helper functions below.
sample = train.iloc[0]
sample

In [None]:
def partners(sequence, structure):
    """Return, for every position, the base it is paired with (or 'X').

    Parameters
    ----------
    sequence : str
        RNA sequence, one character per base.
    structure : str
        Dot-bracket structure string: '(' opens a pair, ')' closes the most
        recently opened one, every other character is unpaired.

    Returns
    -------
    str
        ``len(structure)`` characters: the partner's base at paired positions
        and 'X' at unpaired ones.  Brackets that have no match (for example
        in a truncated structure) are treated as unpaired.
    """
    partner_index = [-1] * len(structure)
    open_positions = []  # stack of indices of '(' that are still unmatched
    for i, symbol in enumerate(structure):
        if symbol == '(':
            open_positions.append(i)
        elif symbol == ')' and open_positions:
            # A ')' with nothing left to close is skipped, mirroring how an
            # unmatched '(' simply stays unpaired.
            first = open_positions.pop()
            partner_index[first] = i
            partner_index[i] = first
    return ''.join('X' if j == -1 else sequence[j] for j in partner_index)

In [None]:
def distance_to_loop(sequence, structure):
    """Distance from each position to the nearest paired position.

    A position counts as paired when its character in ``structure`` is
    anything other than '.'.  Paired positions therefore get 0, and an
    unpaired position gets the number of steps to the closest paired one.

    Parameters
    ----------
    sequence : str
        RNA sequence; only its length is used.
    structure : str
        Dot-bracket structure string.

    Returns
    -------
    list of int
        One distance per position of ``sequence``.  If ``structure`` contains
        no paired position at all, every entry is ``len(sequence)``.
    """
    from bisect import bisect_left

    n_positions = len(sequence)
    # Collected in increasing order, so the list can be binary-searched.
    paired = [j for j, cand in enumerate(structure) if cand != "."]
    if not paired:
        return [n_positions] * n_positions

    distances = []
    for i in range(n_positions):
        k = bisect_left(paired, i)
        candidates = []
        if k < len(paired):
            candidates.append(paired[k] - i)      # nearest paired position at or after i
        if k > 0:
            candidates.append(i - paired[k - 1])  # nearest paired position before i
        distances.append(min(candidates))
    return distances

In [None]:
def transform_features( sample , train = True):
    """Build the per-position feature table for one RNA sample.

    Parameters
    ----------
    sample : mapping (e.g. a row of the train/test frame)
        Must provide 'id', 'sequence', 'structure', 'predicted_loop_type' and
        'seq_scored' (used when ``train`` is true) or 'seq_length' (otherwise).
    train : bool, default True
        If true, only the first ``seq_scored`` positions are returned;
        otherwise every position of the sequence is returned.

    Returns
    -------
    pandas.DataFrame
        One row per position with the columns
        * 'distance_to_loop' - see ``distance_to_loop``,
        * 'id_seqpos' - ``"<id>_<position>"``,
        * one-hot columns 'base_*' (the base itself), 'pair_*' (the base it is
          paired with, 'X' if unpaired) and 'loop_*' (predicted loop type).
        ``get_dummies`` only creates a column for values that occur in this
        sample, so the set of one-hot columns can differ between samples.
    """
    sequence = sample['sequence']
    structure = sample['structure']
    loop_type = sample['predicted_loop_type']

    # Structural features are derived from the FULL molecule and truncated
    # afterwards.  Truncating first would turn every base whose partner lies
    # beyond `seq_scored` into "unpaired" and distort the distances near the
    # cut, so training features would not match the full-length test features.
    pair = partners(sequence, structure)
    distance = distance_to_loop(sequence, structure)

    if train:
        n_positions = sample['seq_scored']
        sequence = sequence[:n_positions]
        pair = pair[:n_positions]
        loop_type = loop_type[:n_positions]
        distance = distance[:n_positions]
    else:
        n_positions = sample['seq_length']

    data = pd.DataFrame({'base' : list(sequence),
                         'pair' : list(pair),
                         'loop' : list(loop_type),
                         'distance_to_loop' : distance
                        })
    data['id_seqpos'] = [sample['id'] + '_' + str(i) for i in range(n_positions)]
    return pd.get_dummies(data, columns = ['base','pair','loop'])

In [None]:
# Feature table for the example row (train=True is the default).
transform_features(sample)

In [None]:
def transform_targets( sample ):
    """Collect the five target arrays of ``sample`` into a DataFrame.

    The result has one column per target, in the order reactivity, deg_pH10,
    deg_50C, deg_Mg_pH10, deg_Mg_50C, and one row per entry of those arrays.
    """
    target_names = ('reactivity', 'deg_pH10', 'deg_50C', 'deg_Mg_pH10', 'deg_Mg_50C')
    return pd.DataFrame({name: sample[name] for name in target_names})

In [None]:
def SN_ratio( sample ):
    """Element-wise ratio reactivity / reactivity_error of ``sample`` as a Series."""
    signal = sample['reactivity']
    noise = sample['reactivity_error']
    ratio = np.divide(signal, noise)
    return pd.Series(ratio)

In [None]:
# Build one long training table: a row per (sample, position) with the five
# targets followed by the features of that position.
train_data = []
for index, row in train.iterrows():
    # Targets and features are placed side by side (axis=1).
    sample_data = pd.concat([transform_targets(row),transform_features(row)], axis = 1)
    # Keep only positions whose reactivity / reactivity_error ratio exceeds 5.
    sample_data = sample_data[SN_ratio(row) > 5]
    train_data.append(sample_data)
# Stack all samples with a fresh 0..N-1 index; values missing after the concat
# (e.g. a one-hot column that did not exist for some sample) are filled with 0.
train_data = pd.concat(train_data, ignore_index = True, axis = 0, copy = False).fillna(0)
train_data.head()

In [None]:
# Names of the five target columns.
targets = pd.Series(['reactivity', 'deg_pH10', 'deg_50C', 'deg_Mg_pH10', 'deg_Mg_50C'],dtype = object)
# Feature columns = every training column that is neither a target nor the row identifier.
features = train_data.columns.drop(targets).drop('id_seqpos')
features

In [None]:
# Same feature construction for the test set, with train=False so that every
# position of each sequence is kept.
test_data = []
for index, row in test.iterrows():
    test_data.append(transform_features(row, train = False))
# Stack all samples with a fresh index; values missing after the concat are filled with 0.
test_data = pd.concat(test_data, ignore_index = True, axis = 0, copy = False).fillna(0)

In [None]:
# Peek at the last rows of the stacked test table.
test_data.tail()

In [None]:
# Target matrix for training: the five target columns, copied so later edits do not touch train_data.
train_target = train_data[targets].copy()
train_target

In [None]:
# Feature matrix for training, with columns in the order given by `features`.
train_set = train_data[features].copy()
train_set

In [None]:
# Feature matrix for prediction, with exactly the training feature columns in
# the training order.  The one-hot columns are data-dependent, so a column seen
# in training may never occur in the test set; `reindex` adds such a column
# filled with 0 (the same convention as the `fillna(0)` used when stacking)
# instead of raising a KeyError.
test_set = test_data.reindex(columns=features, fill_value=0).copy()
test_set

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import KFold

In [None]:
def MCRMSE(y_true, y_pred):
    """Mean column-wise root mean squared error.

    Computes the RMSE of every target column separately and returns the mean
    of those per-column values.

    Parameters
    ----------
    y_true, y_pred : tensors of identical shape
        Expected to be (batch, n_targets), which is what the dense model in
        this notebook produces.

    Returns
    -------
    Scalar tensor holding the loss value.
    """
    # axis=0 averages over the batch for each target column.  Without it the
    # mean collapses every element into a single number and the result is the
    # plain RMSE over all targets, not the column-wise metric.
    colwise_mse = K.mean(K.square(y_true - y_pred), axis=0)
    return K.mean(K.sqrt(colwise_mse))

In [None]:
def create_model():
    """Create and compile a fresh fully connected regression network.

    The input width is the number of feature columns (`features.size`) and the
    output width is the number of targets (`targets.size`); both are read from
    the notebook's globals.  The model is compiled with the Adam optimizer and
    the `MCRMSE` loss.

    Returns
    -------
    tf.keras.Sequential
        A new, untrained model on every call.
    """
    model = tf.keras.Sequential([
        # `shape` is documented as a tuple that excludes the batch dimension,
        # so it is passed as a 1-tuple rather than as a bare integer.
        tf.keras.layers.Input(shape=(features.size,)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dense(500, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.6),
        tf.keras.layers.Dense(50, activation="relu"),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(targets.size, activation="elu")
    ])
    model.compile(optimizer='adam', loss=MCRMSE)
    return model

In [None]:
# Number of cross-validation folds; also the number of models whose test
# predictions are averaged.
N_SPLITS = 10

# Float accumulator for the summed test predictions.  A plain array avoids
# creating integer-zero DataFrame columns and then adding floats into them.
test_pred_sum = np.zeros((len(test_set), len(targets)))

# Out-of-fold predictions: starts as a copy of the targets and every row is
# overwritten by the model of the fold in which that row was held out.
res = train_target.copy()

folds = KFold(n_splits=N_SPLITS, random_state=666, shuffle=True)
for n, (tr, te) in enumerate(folds.split(train_target)):
    print(f'Fold {n}')

    # A fresh model per fold so that no weights carry over between folds.
    model = create_model()

    model.fit(
        train_set.values[tr],
        train_target.values[tr],
        epochs=45,
        batch_size=64
    )

    test_pred_sum += model.predict(test_set)
    # `te` holds row positions (not labels), hence positional indexing.
    res.iloc[te] = model.predict(train_set.values[te])

# Average over the folds using the configured fold count rather than the loop
# variable left over from the last iteration.
preds_df = pd.DataFrame(test_pred_sum / N_SPLITS, columns=list(targets), index=test_data.index)
preds_df.insert(0, 'id', test_data['id_seqpos'])

In [None]:
from sklearn.metrics import mean_squared_error as mse
import math

def rmse(y_true, y_pred):
    """Root mean squared error between two equally long 1-D sequences.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Observed and predicted values.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.  The value is not rescaled; an
        extra division would make the reported score look better than it is.
    """
    errors = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean(np.square(errors))))
# One score per target column: true values against the out-of-fold predictions in `res`.
metrics = [
    rmse(train_target[_target], res[_target])
    for _target in train_target.columns
]

In [None]:
# Average of the per-target values collected in `metrics`.
print(f'OOF Metric: {np.mean(metrics)}')

In [None]:
# Fold-averaged test predictions, one row per id_seqpos.
preds_df

In [None]:
# The sample submission supplies the required rows and their order; predictions
# are attached by id, and rows without a prediction are filled with 0.
sub = pd.read_csv('../input/stanford-covid-vaccine/sample_submission.csv')
sub = (
    sub[['id_seqpos']]
    .merge(preds_df, left_on='id_seqpos', right_on='id', how='left')
    .drop(columns=['id'])
    .fillna(0)
)
sub.tail(30)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('./submission.csv', index=False)