In [1]:
import pandas as pd
import numpy as np
import json
import tensorflow.keras.layers as L
import tensorflow as tf
import plotly.express as px

## Define helper functions and useful vars

In [2]:
import os

# Pin the CUDA device ordering to PCI bus ids (see issue #152) and make only
# device "3" visible to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

In [3]:
# Column names given to the five model outputs when prediction frames are built.
pred_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [4]:
# model = Classifier((None, 11))

In [5]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that emits one vector per timestep.

    Args:
        hidden_dim: number of units in each direction of the GRU.
        dropout: dropout rate applied to the GRU inputs.

    Returns:
        A `tf.keras.layers.Bidirectional` wrapper around a GRU configured with
        `return_sequences=True` and an orthogonal kernel initializer.
    """
    recurrent = tf.keras.layers.GRU(
        units=hidden_dim,
        kernel_initializer='orthogonal',
        return_sequences=True,
        dropout=dropout,
    )
    return tf.keras.layers.Bidirectional(recurrent)

def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that emits one vector per timestep.

    Args:
        hidden_dim: number of units in each direction of the LSTM.
        dropout: dropout rate applied to the LSTM inputs.

    Returns:
        A `tf.keras.layers.Bidirectional` wrapper around an LSTM configured
        with `return_sequences=True` and an orthogonal kernel initializer.
    """
    recurrent = tf.keras.layers.LSTM(
        units=hidden_dim,
        kernel_initializer='orthogonal',
        return_sequences=True,
        dropout=dropout,
    )
    return tf.keras.layers.Bidirectional(recurrent)

def build_model(gru=False,seq_len=107, pred_len=68, dropout=0.25,
                embed_dim=128, hidden_dim=384):
    """Build and compile the two-branch bidirectional-GRU + attention model.

    The model takes two inputs: a `(seq_len, 3)` tensor of token ids and a
    `(seq_len, 1)` tensor holding one extra per-position feature (the "bpps"
    input). The token ids go through two independent embedding layers, giving
    two branches ("t" and "q"). Each branch is flattened per position,
    concatenated with the projected bpps feature and run through three stacked
    bidirectional GRU layers. The branches are combined with a Keras
    `Attention` layer, the sequence is cut to the first `pred_len` positions
    and a final `Dense(5)` produces five values per position.

    Args:
        gru: NOTE(review): never read in the body; the recurrent layers are
            always built with `gru_layer`, and `lstm_layer` is not used here.
        seq_len: length of the input sequences.
        pred_len: number of leading positions that receive predictions.
        dropout: dropout rate forwarded to every recurrent layer.
        embed_dim: output size of both embeddings and of the bpps projection.
        hidden_dim: units per direction in every recurrent layer.

    Returns:
        A compiled `tf.keras.Model` with inputs `[inputs, inputs_bpps]`, using
        the Adam optimizer and the `MCRMSE` loss defined below.

    Note:
        Relies on the module-level `token2int` mapping for the embedding
        vocabulary size, so that name must already exist when this is called.
    """
    
    inputs = tf.keras.layers.Input(shape=(seq_len, 3))
    inputs_bpps = tf.keras.layers.Input(shape=(seq_len, 1))
    
    # Project the single per-position bpps feature up to embed_dim channels.
    bpps = tf.keras.layers.Dense(embed_dim, activation='relu')(inputs_bpps)
    
    
    # Two separate embedding tables over the same token ids, one per branch.
    token_embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)
    query_embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)
    
    # Each embedding output is (batch, seq_len, 3, embed_dim).
    t_embed = token_embed(inputs)
    q_embed = query_embed(inputs)
    
    # Merge the last two axes: (batch, seq_len, 3 * embed_dim).
    t_reshaped = tf.reshape(
        t_embed, shape=(-1, t_embed.shape[1],  t_embed.shape[2] * t_embed.shape[3]))
    
    t_reshaped = tf.keras.layers.SpatialDropout1D(.2)(t_reshaped)
    
    # Append the projected bpps channels to every position.
    t_reshaped = tf.concat([t_reshaped, bpps], axis=2)
    
    # Three stacked bidirectional GRU layers for the "t" branch.
    t_hidden = gru_layer(hidden_dim, dropout)(t_reshaped)
    t_hidden = gru_layer(hidden_dim, dropout)(t_hidden)
    t_hidden = gru_layer(hidden_dim, dropout)(t_hidden)
    
    # Same pipeline for the "q" branch, with its own layers and weights.
    q_reshaped = tf.reshape(
        q_embed, shape=(-1, q_embed.shape[1],  q_embed.shape[2] * q_embed.shape[3]))
    
    q_reshaped = tf.keras.layers.SpatialDropout1D(.2)(q_reshaped)
    
    q_reshaped = tf.concat([q_reshaped, bpps], axis=2)
    
    q_hidden = gru_layer(hidden_dim, dropout)(q_reshaped)
    q_hidden = gru_layer(hidden_dim, dropout)(q_hidden)
    q_hidden = gru_layer(hidden_dim, dropout)(q_hidden)
    
    # Keras Attention takes its inputs as [query, value]: here t_hidden is the
    # query and q_hidden is the value, despite the "q" prefix.
    # NOTE(review): confirm this ordering is intended.
    query_value_attention_seq = tf.keras.layers.Attention()(
    [t_hidden, q_hidden])
    
    #query_encoding = tf.keras.layers.GlobalAveragePooling1D()(
    #    q_hidden)
    #query_value_attention = tf.keras.layers.GlobalAveragePooling1D()(
    #    query_value_attention_seq)

    # Per-position concat of the "q" branch states and the attention output.
    hidden = tf.keras.layers.Concatenate()(
        [q_hidden, query_value_attention_seq])
    
    #only making predictions on the first part of each sequence
    truncated = hidden[:, :pred_len]
    
    # Five outputs per position, linear activation.
    out = tf.keras.layers.Dense(5)(truncated)

    model = tf.keras.Model(inputs=[inputs, inputs_bpps], outputs=out)

    # Adam with its default settings.
    adam = tf.optimizers.Adam()
    def MCRMSE(y_true, y_pred):
        """Mean column-wise RMSE, returned as one value per sample.

        Expects tensors shaped like the model output, (batch, pred_len, 5).
        """
        # Average squared error over positions (axis 1) -> (batch, 5).
        colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
        # RMSE per column, then average across the columns -> (batch,).
        return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)
    model.compile(optimizer = adam, loss=MCRMSE)
    
    return model

In [6]:
# Maps each character of the alphabet '().ACGUBEHIMSX' to its position in it.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Encode the string columns of `df` as an integer array.

    Every character of every string in `cols` is replaced by its `token2int`
    id. The encoding uses a plain nested comprehension rather than
    `DataFrame.applymap`, which is deprecated in recent pandas releases.

    Args:
        df: DataFrame containing the columns listed in `cols`; all strings
            must have the same length for a regular array to be produced.
        cols: names of the string columns to encode, in output-channel order.
            The default list is never mutated.

    Returns:
        Integer array of shape (n_rows, seq_len, len(cols)).

    Raises:
        KeyError: if a string contains a character missing from `token2int`.
    """
    encoded = np.array([
        [[token2int[ch] for ch in seq] for seq in row]
        for row in df[cols].values.tolist()
    ])
    # (n_rows, n_cols, seq_len) -> (n_rows, seq_len, n_cols)
    return np.transpose(encoded, (0, 2, 1))

## Load and preprocess data

In [7]:
# Both JSON files are read in line-delimited mode (one record per line).
train, test = (
    pd.read_json(json_path, lines=True)
    for json_path in ('../input//train.json', '../input//test.json')
)
sample_df = pd.read_csv('../input//sample_submission.csv')

In [8]:
# Show the id stored under index label 0 of the training frame.
train.loc[0, 'id']

'id_001f94081'

In [9]:
# Scratch check: load the bpps array saved for the first training id
# (the id displayed by the previous cell).
foo = np.load('../input/bpps/id_001f94081.npy')

In [10]:
# One value per row of `foo`: 1 minus that row's sum.
1 - foo.sum(axis=1)

array([0.80145771, 0.8162878 , 0.9399976 , 0.98687779, 0.98872014,
       0.8726042 , 0.64923491, 0.45909952, 0.38734203, 0.37291085,
       0.48373307, 0.89664275, 0.90951632, 0.90861451, 0.98940993,
       0.98145491, 0.81965108, 0.79170963, 0.38557835, 0.52880297,
       0.5372444 , 0.52122251, 0.90823082, 0.40080513, 0.43650824,
       0.54643881, 0.38305727, 0.53568993, 0.67795352, 0.8628183 ,
       0.78497647, 0.83994244, 0.78492865, 0.70795306, 0.73116841,
       0.82452095, 0.86113227, 0.52072924, 0.37224691, 0.20266188,
       0.26856484, 0.39608598, 0.14207184, 0.65970159, 0.65338135,
       0.80168215, 0.97622094, 0.40194761, 0.28440304, 0.09579655,
       0.23852457, 0.80521037, 0.70261301, 0.81546441, 0.94987859,
       0.92402902, 0.76050376, 0.63090152, 0.77919177, 0.73839201,
       0.61416194, 0.70487221, 0.35752507, 0.40452985, 0.65327547,
       0.58192038, 0.92482731, 0.95905864, 0.13151534, 0.09435281,
       0.0744179 , 0.05975572, 0.04247293, 0.05214685, 0.09151

In [11]:
# Label columns pulled from the training frame to build `train_labels`.
target_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [12]:
# Train only on rows whose signal_to_noise is above 1.
train_inputs = preprocess_inputs(train.loc[train['signal_to_noise'] > 1])

# Each row holds one list per target column, i.e. (n_rows, n_targets, n_positions);
# swap the last two axes so labels are (n_rows, n_positions, n_targets).
train_labels = np.transpose(
    np.array(train.loc[train['signal_to_noise'] > 1, target_cols].values.tolist()),
    (0, 2, 1),
)

In [13]:
# Per-position feature derived from each sample's bpps array: 1 minus the row
# sums (presumably the probability that a base is unpaired; confirm against
# how the .npy files were produced).
# Only the rows kept by the signal_to_noise > 1 filter are read from disk;
# loading every id and discarding the filtered rows afterwards gave the same
# array at the cost of extra file reads.
train_bpps = np.stack([
    1 - np.load(f'../input/bpps/{ele}.npy').sum(1)
    for ele in train.loc[train.signal_to_noise > 1, 'id']
])
# Add a trailing channel axis: (n_rows, seq_len) -> (n_rows, seq_len, 1).
train_bpps = train_bpps[:, :, np.newaxis]

In [14]:
from sklearn.model_selection import KFold

In [15]:
# Sanity check: (n_samples, n_positions, n_targets).
np.shape(train_labels)

(2096, 68, 5)

In [16]:
# A zero-filled buffer built from the labels has the same shape as the labels.
np.shape(np.zeros_like(train_labels))

(2096, 68, 5)

In [17]:
from sklearn.metrics import mean_squared_error

# One checkpoint file shared by all folds; each fold overwrites the previous one.
CHECKPOINT_PATH = 'tf_simple_lstm_large_noise_more_epochs_bpps_large_attention_relu_newloss_815.h5'

FOLDS = KFold(n_splits=5, random_state=815, shuffle=True)

# Out-of-fold predictions, filled in one validation split at a time.
oofs_pred = np.zeros_like(train_labels)
# Per-fold test predictions, kept separately for the two test subsets.
public_preds_array = []
private_preds_array = []

# The test inputs do not depend on the fold, so they are prepared once here
# instead of being rebuilt (and re-read from disk) inside every fold.
public_df = test.query("seq_length == 107").copy()
private_df = test.query("seq_length == 130").copy()

public_inputs = preprocess_inputs(public_df)
private_inputs = preprocess_inputs(private_df)

public_bpps = np.stack([1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in public_df['id']])
public_bpps = public_bpps[:, :, np.newaxis]

private_bpps = np.stack([1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in private_df['id']])
private_bpps = private_bpps[:, :, np.newaxis]

for fold, (trn_idx, vld_idx) in enumerate(FOLDS.split(train_inputs)):
    trn_inputs = train_inputs[trn_idx]
    vld_inputs = train_inputs[vld_idx]

    trn_inputs_bpps = train_bpps[trn_idx]
    vld_inputs_bpps = train_bpps[vld_idx]

    trn_labels = train_labels[trn_idx]
    vld_labels = train_labels[vld_idx]

    model = build_model()
    model.summary()

    history = model.fit(
        [trn_inputs, trn_inputs_bpps], trn_labels,
        validation_data=([vld_inputs, vld_inputs_bpps], vld_labels),
        batch_size=32,
        epochs=120,
        callbacks=[
            tf.keras.callbacks.ReduceLROnPlateau(),
            # save_best_only keeps the epoch with the lowest val_loss. Without
            # it the file is rewritten every epoch and the load_weights call
            # below merely reloads the final-epoch weights.
            tf.keras.callbacks.ModelCheckpoint(
                CHECKPOINT_PATH, monitor='val_loss', save_best_only=True),
        ],
        verbose=2,
    )
    # Restore the best checkpoint of this fold before predicting.
    model.load_weights(CHECKPOINT_PATH)
    outputs = model.predict([vld_inputs, vld_inputs_bpps])
    oofs_pred[vld_idx] = outputs

    # Mean column-wise RMSE on the validation split. Labels and outputs are
    # (n_samples, n_positions, n_targets), so the target column is the LAST
    # axis; indexing with [:, idx] would score sequence positions instead.
    errors = [
        np.sqrt(mean_squared_error(vld_labels[:, :, col], outputs[:, :, col]))
        for col in range(vld_labels.shape[-1])
    ]
    final_error = np.mean(errors)
    print('#'*20, final_error)

    # Caveat: The prediction format requires the output to be the same length as the input,
    # although it's not the case for the training data.
    model_short = build_model(seq_len=107, pred_len=107)
    model_long = build_model(seq_len=130, pred_len=130)

    model_short.load_weights(CHECKPOINT_PATH)
    model_long.load_weights(CHECKPOINT_PATH)

    public_preds = model_short.predict([public_inputs, public_bpps])
    private_preds = model_long.predict([private_inputs, private_bpps])

    public_preds_array.append(public_preds)
    private_preds_array.append(private_preds)

    print(public_preds.shape, private_preds.shape)

    # One frame per test sequence, with one row per sequence position.
    preds_ls = []

    for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
        for idx, uid in enumerate(df.id):
            single_pred = preds[idx]

            single_df = pd.DataFrame(single_pred, columns=pred_cols)
            single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

            preds_ls.append(single_df)

    preds_df = pd.concat(preds_ls)

    # Align the predictions with the row order of the sample submission.
    submission = sample_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
    submission.to_csv(f'submission_tf_simple_lstm_large_noise_more_epochs_bpps_large_attention_relu_newloss_815_{fold}.csv', index=False)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 107, 3)]     0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 107, 3, 128)  1792        input_1[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 107, 3, 128)  1792        input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_Reshape_1 (TensorFl [(None, 107, 384)]   0           embedding_1[0][0]                
_______________________________________________________________________________________

Epoch 56/120
53/53 - 8s - loss: 0.1300 - val_loss: 0.2137
Epoch 57/120
53/53 - 8s - loss: 0.1251 - val_loss: 0.2138
Epoch 58/120
53/53 - 8s - loss: 0.1236 - val_loss: 0.2136
Epoch 59/120
53/53 - 7s - loss: 0.1222 - val_loss: 0.2135
Epoch 60/120
53/53 - 7s - loss: 0.1211 - val_loss: 0.2135
Epoch 61/120
53/53 - 7s - loss: 0.1204 - val_loss: 0.2136
Epoch 62/120
53/53 - 7s - loss: 0.1197 - val_loss: 0.2140
Epoch 63/120
53/53 - 7s - loss: 0.1190 - val_loss: 0.2136
Epoch 64/120
53/53 - 8s - loss: 0.1186 - val_loss: 0.2136
Epoch 65/120
53/53 - 7s - loss: 0.1182 - val_loss: 0.2137
Epoch 66/120
53/53 - 8s - loss: 0.1176 - val_loss: 0.2137
Epoch 67/120
53/53 - 7s - loss: 0.1172 - val_loss: 0.2136
Epoch 68/120
53/53 - 7s - loss: 0.1170 - val_loss: 0.2139
Epoch 69/120
53/53 - 7s - loss: 0.1160 - val_loss: 0.2136
Epoch 70/120
53/53 - 7s - loss: 0.1159 - val_loss: 0.2137
Epoch 71/120
53/53 - 7s - loss: 0.1154 - val_loss: 0.2137
Epoch 72/120
53/53 - 7s - loss: 0.1157 - val_loss: 0.2137
Epoch 73/120
5

53/53 - 10s - loss: 0.4126 - val_loss: 0.3602
Epoch 2/120
53/53 - 8s - loss: 0.3540 - val_loss: 0.3339
Epoch 3/120
53/53 - 8s - loss: 0.3290 - val_loss: 0.3163
Epoch 4/120
53/53 - 8s - loss: 0.3194 - val_loss: 0.3042
Epoch 5/120
53/53 - 8s - loss: 0.3058 - val_loss: 0.2893
Epoch 6/120
53/53 - 7s - loss: 0.2938 - val_loss: 0.2813
Epoch 7/120
53/53 - 7s - loss: 0.2802 - val_loss: 0.2667
Epoch 8/120
53/53 - 7s - loss: 0.2701 - val_loss: 0.2594
Epoch 9/120
53/53 - 7s - loss: 0.2633 - val_loss: 0.2551
Epoch 10/120
53/53 - 7s - loss: 0.2531 - val_loss: 0.2502
Epoch 11/120
53/53 - 7s - loss: 0.2468 - val_loss: 0.2401
Epoch 12/120
53/53 - 7s - loss: 0.2435 - val_loss: 0.2378
Epoch 13/120
53/53 - 8s - loss: 0.2364 - val_loss: 0.2356
Epoch 14/120
53/53 - 8s - loss: 0.2304 - val_loss: 0.2312
Epoch 15/120
53/53 - 7s - loss: 0.2253 - val_loss: 0.2290
Epoch 16/120
53/53 - 7s - loss: 0.2219 - val_loss: 0.2320
Epoch 17/120
53/53 - 7s - loss: 0.2176 - val_loss: 0.2264
Epoch 18/120
53/53 - 7s - loss: 0.

53/53 - 12s - loss: 0.4079 - val_loss: 0.3622
Epoch 2/120
53/53 - 8s - loss: 0.3457 - val_loss: 0.3352
Epoch 3/120
53/53 - 8s - loss: 0.3256 - val_loss: 0.3171
Epoch 4/120
53/53 - 8s - loss: 0.3129 - val_loss: 0.3045
Epoch 5/120
53/53 - 8s - loss: 0.3022 - val_loss: 0.2906
Epoch 6/120
53/53 - 7s - loss: 0.2902 - val_loss: 0.2805
Epoch 7/120
53/53 - 8s - loss: 0.2760 - val_loss: 0.2698
Epoch 8/120
53/53 - 7s - loss: 0.2653 - val_loss: 0.2654
Epoch 9/120
53/53 - 7s - loss: 0.2569 - val_loss: 0.2525
Epoch 10/120
53/53 - 7s - loss: 0.2476 - val_loss: 0.2484
Epoch 11/120
53/53 - 8s - loss: 0.2421 - val_loss: 0.2415
Epoch 12/120
53/53 - 8s - loss: 0.2366 - val_loss: 0.2377
Epoch 13/120
53/53 - 7s - loss: 0.2304 - val_loss: 0.2365
Epoch 14/120
53/53 - 8s - loss: 0.2273 - val_loss: 0.2383
Epoch 15/120
53/53 - 8s - loss: 0.2218 - val_loss: 0.2314
Epoch 16/120
53/53 - 7s - loss: 0.2143 - val_loss: 0.2311
Epoch 17/120
53/53 - 7s - loss: 0.2109 - val_loss: 0.2293
Epoch 18/120
53/53 - 7s - loss: 0.

53/53 - 10s - loss: 0.4101 - val_loss: 0.3704
Epoch 2/120
53/53 - 6s - loss: 0.3467 - val_loss: 0.3321
Epoch 3/120
53/53 - 6s - loss: 0.3257 - val_loss: 0.3229
Epoch 4/120
53/53 - 6s - loss: 0.3140 - val_loss: 0.3047
Epoch 5/120
53/53 - 6s - loss: 0.2979 - val_loss: 0.2913
Epoch 6/120
53/53 - 6s - loss: 0.2846 - val_loss: 0.2829
Epoch 7/120
53/53 - 6s - loss: 0.2741 - val_loss: 0.2707
Epoch 8/120
53/53 - 6s - loss: 0.2644 - val_loss: 0.2633
Epoch 9/120
53/53 - 6s - loss: 0.2537 - val_loss: 0.2627
Epoch 10/120
53/53 - 7s - loss: 0.2478 - val_loss: 0.2505
Epoch 11/120
53/53 - 6s - loss: 0.2418 - val_loss: 0.2439
Epoch 12/120
53/53 - 6s - loss: 0.2356 - val_loss: 0.2455
Epoch 13/120
53/53 - 6s - loss: 0.2300 - val_loss: 0.2361
Epoch 14/120
53/53 - 6s - loss: 0.2254 - val_loss: 0.2373
Epoch 15/120
53/53 - 6s - loss: 0.2224 - val_loss: 0.2374
Epoch 16/120
53/53 - 6s - loss: 0.2156 - val_loss: 0.2316
Epoch 17/120
53/53 - 6s - loss: 0.2103 - val_loss: 0.2295
Epoch 18/120
53/53 - 6s - loss: 0.

53/53 - 9s - loss: 0.4295 - val_loss: 0.3763
Epoch 2/120
53/53 - 6s - loss: 0.3525 - val_loss: 0.3340
Epoch 3/120
53/53 - 6s - loss: 0.3279 - val_loss: 0.3217
Epoch 4/120
53/53 - 6s - loss: 0.3177 - val_loss: 0.3144
Epoch 5/120
53/53 - 6s - loss: 0.3051 - val_loss: 0.3004
Epoch 6/120
53/53 - 6s - loss: 0.2922 - val_loss: 0.2908
Epoch 7/120
53/53 - 6s - loss: 0.2824 - val_loss: 0.2790
Epoch 8/120
53/53 - 6s - loss: 0.2748 - val_loss: 0.2651
Epoch 9/120
53/53 - 6s - loss: 0.2640 - val_loss: 0.2582
Epoch 10/120
53/53 - 6s - loss: 0.2554 - val_loss: 0.2558
Epoch 11/120
53/53 - 6s - loss: 0.2518 - val_loss: 0.2511
Epoch 12/120
53/53 - 6s - loss: 0.2436 - val_loss: 0.2482
Epoch 13/120
53/53 - 6s - loss: 0.2384 - val_loss: 0.2407
Epoch 14/120
53/53 - 6s - loss: 0.2327 - val_loss: 0.2435
Epoch 15/120
53/53 - 6s - loss: 0.2267 - val_loss: 0.2398
Epoch 16/120
53/53 - 6s - loss: 0.2230 - val_loss: 0.2365
Epoch 17/120
53/53 - 6s - loss: 0.2195 - val_loss: 0.2339
Epoch 18/120
53/53 - 6s - loss: 0.2

In [18]:
# for i, uid in enumerate(train.id):
#     single_pred = oofs_pred[i]

#     oof_df = pd.DataFrame(single_pred, columns=pred_cols)
#     oof_df['id_seqpos'] = [f'{uid}_{x}' for x in range(oof_df.shape[0])]

In [19]:
# NOTE(review): leftover scratch line. `model` is the Keras Functional model
# from the training cell, which has no `roi_heads` attribute (see the
# AttributeError recorded below), and `x` is not defined in any visible cell.
# This cell fails under Restart & Run All; remove or replace it.
model.roi_heads.box_head(x)

AttributeError: 'Functional' object has no attribute 'roi_heads'