In [1]:
import pandas as pd
import numpy as np
import json
import tensorflow.keras.layers as L
import tensorflow as tf
import plotly.express as px

## Define helper functions and useful vars

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [3]:
# This will tell us the columns we are predicting
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

In [4]:
# model = Classifier((None, 11))

In [5]:
def gru_layer(hidden_dim, dropout):
    return tf.keras.layers.Bidirectional(
                                tf.keras.layers.GRU(hidden_dim,
                                dropout=dropout,
                                return_sequences=True,
                                kernel_initializer = 'orthogonal'))

def lstm_layer(hidden_dim, dropout):
    return tf.keras.layers.Bidirectional(
                                tf.keras.layers.LSTM(hidden_dim,
                                dropout=dropout,
                                return_sequences=True,
                                kernel_initializer = 'orthogonal'))

def build_model(gru=False,seq_len=107, pred_len=68, dropout=0.25,
                embed_dim=128, hidden_dim=384):
    
    inputs = tf.keras.layers.Input(shape=(seq_len, 3))
    inputs_bpps = tf.keras.layers.Input(shape=(seq_len, 1))
    
    bpps = tf.keras.layers.Dense(embed_dim, activation='relu')(inputs_bpps)
    
    
    token_embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)
    query_embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)
    
    t_embed = token_embed(inputs)
    q_embed = query_embed(inputs)
    
    t_reshaped = tf.reshape(
        t_embed, shape=(-1, t_embed.shape[1],  t_embed.shape[2] * t_embed.shape[3]))
    
    t_reshaped = tf.keras.layers.SpatialDropout1D(.2)(t_reshaped)
    
    t_reshaped = tf.concat([t_reshaped, bpps], axis=2)
    
    t_hidden = gru_layer(hidden_dim, dropout)(t_reshaped)
    t_hidden = gru_layer(hidden_dim, dropout)(t_hidden)
    t_hidden = gru_layer(hidden_dim, dropout)(t_hidden)
    
    q_reshaped = tf.reshape(
        q_embed, shape=(-1, q_embed.shape[1],  q_embed.shape[2] * q_embed.shape[3]))
    
    q_reshaped = tf.keras.layers.SpatialDropout1D(.2)(q_reshaped)
    
    q_reshaped = tf.concat([q_reshaped, bpps], axis=2)
    
    q_hidden = gru_layer(hidden_dim, dropout)(q_reshaped)
    q_hidden = gru_layer(hidden_dim, dropout)(q_hidden)
    q_hidden = gru_layer(hidden_dim, dropout)(q_hidden)
    
    query_value_attention_seq = tf.keras.layers.Attention()(
    [t_hidden, q_hidden])
    
    
    
    #query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    #    q_hidden)
    #query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    #    query_value_attention_seq)

    hidden = tf.keras.layers.Concatenate()(
        [q_hidden, query_value_attention_seq])
    
    #hidden = tf.keras.layers.Concatenate()(
    #    [q_hidden, query_value_attention_seq, bpps])
    hidden = tf.keras.layers.Dense(768, activation='relu')(hidden)
    
    
    #only making predictions on the first part of each sequence
    truncated = hidden[:, :pred_len]
    
    out = tf.keras.layers.Dense(5)(truncated)

    model = tf.keras.Model(inputs=[inputs, inputs_bpps], outputs=out)

    #some optimizers
    adam = tf.optimizers.Adam()
    
    model.compile(optimizer = adam, loss='mse')
    
    return model

In [6]:
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    return np.transpose(
        np.array(
            df[cols]
            .applymap(lambda seq: [token2int[x] for x in seq])
            .values
            .tolist()
        ),
        (0, 2, 1)
    )

## Load and preprocess data

In [7]:
train = pd.read_json('../input//train.json', lines=True)
test = pd.read_json('../input//test.json', lines=True)
sample_df = pd.read_csv('../input//sample_submission.csv')

In [8]:
train['id'][0]

'id_001f94081'

In [9]:
foo = np.load('../input/bpps/id_001f94081.npy')

In [10]:
1 - foo.sum(1)

array([0.80145771, 0.8162878 , 0.9399976 , 0.98687779, 0.98872014,
       0.8726042 , 0.64923491, 0.45909952, 0.38734203, 0.37291085,
       0.48373307, 0.89664275, 0.90951632, 0.90861451, 0.98940993,
       0.98145491, 0.81965108, 0.79170963, 0.38557835, 0.52880297,
       0.5372444 , 0.52122251, 0.90823082, 0.40080513, 0.43650824,
       0.54643881, 0.38305727, 0.53568993, 0.67795352, 0.8628183 ,
       0.78497647, 0.83994244, 0.78492865, 0.70795306, 0.73116841,
       0.82452095, 0.86113227, 0.52072924, 0.37224691, 0.20266188,
       0.26856484, 0.39608598, 0.14207184, 0.65970159, 0.65338135,
       0.80168215, 0.97622094, 0.40194761, 0.28440304, 0.09579655,
       0.23852457, 0.80521037, 0.70261301, 0.81546441, 0.94987859,
       0.92402902, 0.76050376, 0.63090152, 0.77919177, 0.73839201,
       0.61416194, 0.70487221, 0.35752507, 0.40452985, 0.65327547,
       0.58192038, 0.92482731, 0.95905864, 0.13151534, 0.09435281,
       0.0744179 , 0.05975572, 0.04247293, 0.05214685, 0.09151

In [11]:
#target columns
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

In [12]:
train_inputs = preprocess_inputs(train[train.signal_to_noise > 1])
train_labels = np.array(train[train.signal_to_noise > 1][target_cols].values.tolist()).transpose((0, 2, 1))

In [13]:
train_bpps = np.stack([1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in train['id']])
train_bpps = train_bpps[train.signal_to_noise > 1][:, :, np.newaxis]

In [14]:
from sklearn.model_selection import KFold

In [15]:
train_labels.shape

(2096, 68, 5)

In [16]:
np.zeros_like(train_labels).shape

(2096, 68, 5)

In [17]:
FOLDS = KFold(n_splits=5, random_state=815, shuffle=True)

oofs_pred = np.zeros_like(train_labels)
public_preds_array = []
public_preds_array = []

for i, (trn_idx, vld_idx) in enumerate(FOLDS.split(train_inputs)):
    trn_inputs = train_inputs[trn_idx]
    vld_inputs = train_inputs[vld_idx]
    
    trn_inputs_bpps = train_bpps[trn_idx]
    vld_inputs_bpps = train_bpps[vld_idx]

    trn_labels = train_labels[trn_idx]
    vld_labels = train_labels[vld_idx]

    model = build_model()
    model.summary()

    history = model.fit(
        [trn_inputs, trn_inputs_bpps], trn_labels, 
        validation_data=([vld_inputs, vld_inputs_bpps], vld_labels),
        batch_size=32,
        epochs=120,
        callbacks=[
            tf.keras.callbacks.ReduceLROnPlateau(),
            tf.keras.callbacks.ModelCheckpoint('bpps_large_attention_relu_dense_815.h5')
        ],
        verbose=2,
    )
    model.load_weights('./bpps_large_attention_relu_dense_815.h5')
    outputs = model.predict([vld_inputs, vld_inputs_bpps])
    oofs_pred[vld_idx] = outputs
    
    from sklearn.metrics import mean_squared_error
    errors = []
    for idx in range(5):
         errors.append(np.sqrt(mean_squared_error(vld_labels[:, idx], outputs[:, idx])))
    final_error = np.mean(errors)
    print('#'*20, final_error)

    public_df = test.query("seq_length == 107").copy()
    private_df = test.query("seq_length == 130").copy()

    public_inputs = preprocess_inputs(public_df)
    private_inputs = preprocess_inputs(private_df)
    
    public_bpps = np.stack([1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in public_df['id']])
    public_bpps = public_bpps[:, :, np.newaxis]
    
    private_bpps = np.stack([1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in private_df['id']])
    private_bpps = private_bpps[:, :, np.newaxis] 

    # Caveat: The prediction format requires the output to be the same length as the input,
    # although it's not the case for the training data.
    model_short = build_model(seq_len=107, pred_len=107)
    model_long = build_model(seq_len=130, pred_len=130)

    model_short.load_weights('bpps_large_attention_relu_dense_815.h5')
    model_long.load_weights('bpps_large_attention_relu_dense_815.h5')

    public_preds = model_short.predict([public_inputs, public_bpps])
    private_preds = model_long.predict([private_inputs,private_bpps])
    
    public_preds_array.append(public_preds)
    public_preds_array.append(private_preds)

    print(public_preds.shape, private_preds.shape)

    preds_ls = []

    for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
        for idx, uid in enumerate(df.id):
            single_pred = preds[idx]

            single_df = pd.DataFrame(single_pred, columns=pred_cols)
            single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

            preds_ls.append(single_df)

    preds_df = pd.concat(preds_ls)

    submission = sample_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
    submission.to_csv(f'submission_bpps_large_attention_relu_dense_815_{i}.csv', index=False)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 107, 3)]     0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 107, 3, 128)  1792        input_1[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 107, 3, 128)  1792        input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_Reshape_1 (TensorFl [(None, 107, 384)]   0           embedding_1[0][0]                
_______________________________________________________________________________________

Epoch 52/120
53/53 - 6s - loss: 0.0160 - val_loss: 0.0571
Epoch 53/120
53/53 - 7s - loss: 0.0159 - val_loss: 0.0570
Epoch 54/120
53/53 - 7s - loss: 0.0157 - val_loss: 0.0570
Epoch 55/120
53/53 - 7s - loss: 0.0156 - val_loss: 0.0571
Epoch 56/120
53/53 - 7s - loss: 0.0155 - val_loss: 0.0571
Epoch 57/120
53/53 - 8s - loss: 0.0153 - val_loss: 0.0570
Epoch 58/120
53/53 - 7s - loss: 0.0152 - val_loss: 0.0571
Epoch 59/120
53/53 - 7s - loss: 0.0151 - val_loss: 0.0573
Epoch 60/120
53/53 - 7s - loss: 0.0149 - val_loss: 0.0573
Epoch 61/120
53/53 - 7s - loss: 0.0149 - val_loss: 0.0573
Epoch 62/120
53/53 - 8s - loss: 0.0148 - val_loss: 0.0572
Epoch 63/120
53/53 - 8s - loss: 0.0148 - val_loss: 0.0572
Epoch 64/120
53/53 - 8s - loss: 0.0148 - val_loss: 0.0572
Epoch 65/120
53/53 - 7s - loss: 0.0148 - val_loss: 0.0572
Epoch 66/120
53/53 - 8s - loss: 0.0148 - val_loss: 0.0572
Epoch 67/120
53/53 - 8s - loss: 0.0147 - val_loss: 0.0572
Epoch 68/120
53/53 - 8s - loss: 0.0148 - val_loss: 0.0573
Epoch 69/120
5

53/53 - 10s - loss: 0.2066 - val_loss: 0.1513
Epoch 2/120
53/53 - 8s - loss: 0.1378 - val_loss: 0.1235
Epoch 3/120
53/53 - 8s - loss: 0.1227 - val_loss: 0.1156
Epoch 4/120
53/53 - 7s - loss: 0.1145 - val_loss: 0.1113
Epoch 5/120
53/53 - 8s - loss: 0.1080 - val_loss: 0.1019
Epoch 6/120
53/53 - 8s - loss: 0.0983 - val_loss: 0.0950
Epoch 7/120
53/53 - 8s - loss: 0.0917 - val_loss: 0.0840
Epoch 8/120
53/53 - 8s - loss: 0.0833 - val_loss: 0.0795
Epoch 9/120
53/53 - 8s - loss: 0.0794 - val_loss: 0.0764
Epoch 10/120
53/53 - 7s - loss: 0.0754 - val_loss: 0.0724
Epoch 11/120
53/53 - 8s - loss: 0.0697 - val_loss: 0.0716
Epoch 12/120
53/53 - 8s - loss: 0.0665 - val_loss: 0.0692
Epoch 13/120
53/53 - 8s - loss: 0.0648 - val_loss: 0.0674
Epoch 14/120
53/53 - 8s - loss: 0.0613 - val_loss: 0.0668
Epoch 15/120
53/53 - 8s - loss: 0.0586 - val_loss: 0.0673
Epoch 16/120
53/53 - 7s - loss: 0.0558 - val_loss: 0.0679
Epoch 17/120
53/53 - 8s - loss: 0.0525 - val_loss: 0.0657
Epoch 18/120
53/53 - 8s - loss: 0.

53/53 - 10s - loss: 0.2212 - val_loss: 0.1590
Epoch 2/120
53/53 - 8s - loss: 0.1421 - val_loss: 0.1284
Epoch 3/120
53/53 - 8s - loss: 0.1228 - val_loss: 0.1175
Epoch 4/120
53/53 - 8s - loss: 0.1142 - val_loss: 0.1148
Epoch 5/120
53/53 - 7s - loss: 0.1051 - val_loss: 0.0986
Epoch 6/120
53/53 - 8s - loss: 0.0940 - val_loss: 0.0910
Epoch 7/120
53/53 - 9s - loss: 0.0871 - val_loss: 0.0853
Epoch 8/120
53/53 - 7s - loss: 0.0817 - val_loss: 0.0820
Epoch 9/120
53/53 - 7s - loss: 0.0778 - val_loss: 0.0816
Epoch 10/120
53/53 - 7s - loss: 0.0741 - val_loss: 0.0772
Epoch 11/120
53/53 - 7s - loss: 0.0701 - val_loss: 0.0774
Epoch 12/120
53/53 - 7s - loss: 0.0663 - val_loss: 0.0726
Epoch 13/120
53/53 - 7s - loss: 0.0632 - val_loss: 0.0736
Epoch 14/120
53/53 - 7s - loss: 0.0618 - val_loss: 0.0722
Epoch 15/120
53/53 - 8s - loss: 0.0602 - val_loss: 0.0722
Epoch 16/120
53/53 - 8s - loss: 0.0587 - val_loss: 0.0699
Epoch 17/120
53/53 - 7s - loss: 0.0541 - val_loss: 0.0700
Epoch 18/120
53/53 - 8s - loss: 0.

53/53 - 9s - loss: 0.2904 - val_loss: 0.1604
Epoch 2/120
53/53 - 6s - loss: 0.1487 - val_loss: 0.1285
Epoch 3/120
53/53 - 6s - loss: 0.1235 - val_loss: 0.1155
Epoch 4/120
53/53 - 6s - loss: 0.1123 - val_loss: 0.1084
Epoch 5/120
53/53 - 7s - loss: 0.1070 - val_loss: 0.1045
Epoch 6/120
53/53 - 6s - loss: 0.1011 - val_loss: 0.1008
Epoch 7/120
53/53 - 7s - loss: 0.0938 - val_loss: 0.0932
Epoch 8/120
53/53 - 6s - loss: 0.0877 - val_loss: 0.0875
Epoch 9/120
53/53 - 7s - loss: 0.0831 - val_loss: 0.0813
Epoch 10/120
53/53 - 6s - loss: 0.0788 - val_loss: 0.0795
Epoch 11/120
53/53 - 7s - loss: 0.0751 - val_loss: 0.0776
Epoch 12/120
53/53 - 7s - loss: 0.0712 - val_loss: 0.0732
Epoch 13/120
53/53 - 7s - loss: 0.0676 - val_loss: 0.0728
Epoch 14/120
53/53 - 7s - loss: 0.0648 - val_loss: 0.0703
Epoch 15/120
53/53 - 7s - loss: 0.0622 - val_loss: 0.0695
Epoch 16/120
53/53 - 7s - loss: 0.0595 - val_loss: 0.0705
Epoch 17/120
53/53 - 7s - loss: 0.0563 - val_loss: 0.0686
Epoch 18/120
53/53 - 7s - loss: 0.0

53/53 - 9s - loss: 0.2692 - val_loss: 0.1601
Epoch 2/120
53/53 - 7s - loss: 0.1459 - val_loss: 0.1313
Epoch 3/120
53/53 - 6s - loss: 0.1230 - val_loss: 0.1220
Epoch 4/120
53/53 - 7s - loss: 0.1125 - val_loss: 0.1126
Epoch 5/120
53/53 - 7s - loss: 0.1054 - val_loss: 0.1042
Epoch 6/120
53/53 - 7s - loss: 0.1001 - val_loss: 0.1042
Epoch 7/120
53/53 - 7s - loss: 0.0942 - val_loss: 0.0901
Epoch 8/120
53/53 - 6s - loss: 0.0863 - val_loss: 0.0833
Epoch 9/120
53/53 - 7s - loss: 0.0812 - val_loss: 0.0835
Epoch 10/120
53/53 - 6s - loss: 0.0780 - val_loss: 0.0779
Epoch 11/120
53/53 - 6s - loss: 0.0737 - val_loss: 0.0753
Epoch 12/120
53/53 - 6s - loss: 0.0707 - val_loss: 0.0774
Epoch 13/120
53/53 - 6s - loss: 0.0674 - val_loss: 0.0728
Epoch 14/120
53/53 - 7s - loss: 0.0665 - val_loss: 0.0712
Epoch 15/120
53/53 - 7s - loss: 0.0618 - val_loss: 0.0682
Epoch 16/120
53/53 - 6s - loss: 0.0589 - val_loss: 0.0692
Epoch 17/120
53/53 - 6s - loss: 0.0567 - val_loss: 0.0685
Epoch 18/120
53/53 - 7s - loss: 0.0

In [18]:
# for i, uid in enumerate(train.id):
#     single_pred = oofs_pred[i]

#     oof_df = pd.DataFrame(single_pred, columns=pred_cols)
#     oof_df['id_seqpos'] = [f'{uid}_{x}' for x in range(oof_df.shape[0])]

In [19]:
model.roi_heads.box_head(x)

AttributeError: 'Functional' object has no attribute 'roi_heads'