In [1]:
import pandas as pd
import numpy as np
import json
import tensorflow.keras.layers as L
import tensorflow as tf
import plotly.express as px

## Define helper functions and useful vars

In [2]:
import os

# Enumerate GPUs by PCI bus id so the index below maps to a stable physical
# device (see issue #152), then expose only device 4 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "4",
})

In [3]:
# Column names used to label the model's five output channels when the
# predictions are turned into a submission frame.
pred_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [4]:
def gru_layer(hidden_dim, dropout):
    """Create a bidirectional GRU layer that returns the full output sequence.

    Args:
        hidden_dim: number of GRU units per direction.
        dropout: dropout rate applied to the layer inputs.

    Returns:
        A ``Bidirectional`` wrapper around a ``GRU`` with orthogonal kernel
        initialisation and ``return_sequences=True``.
    """
    recurrent = tf.keras.layers.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

def lstm_layer(hidden_dim, dropout):
    """Create a bidirectional LSTM layer that returns the full output sequence.

    Args:
        hidden_dim: number of LSTM units per direction.
        dropout: dropout rate applied to the layer inputs.

    Returns:
        A ``Bidirectional`` wrapper around an ``LSTM`` with orthogonal kernel
        initialisation and ``return_sequences=True``.
    """
    recurrent = tf.keras.layers.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

def build_model(gru=False, seq_len=107, pred_len=68, dropout=0.25,
                embed_dim=128, hidden_dim=384):
    """Assemble and compile the two-input sequence regression network.

    The model takes integer token ids of shape ``(seq_len, 3)`` and a
    per-position scalar feature of shape ``(seq_len, 1)``. The three token
    channels are embedded and flattened per position, the scalar feature is
    linearly projected to ``embed_dim`` and concatenated on, and the result is
    fed through three stacked bidirectional recurrent layers. Only the first
    ``pred_len`` positions reach the 5-unit linear output head.

    Args:
        gru: use GRU layers when truthy, LSTM layers otherwise.
        seq_len: length of the input sequences.
        pred_len: number of leading positions for which outputs are produced.
        dropout: dropout rate passed to every recurrent layer.
        embed_dim: embedding size per token channel and width of the
            projection applied to the scalar feature.
        hidden_dim: units per direction in each recurrent layer.

    Returns:
        A compiled ``tf.keras.Model`` with inputs ``[tokens, scalar_feature]``
        and an output of shape ``(batch, pred_len, 5)``.

    Note:
        The vocabulary size is read from the module-level ``token2int`` at
        call time, so that mapping must exist before this function is called.
    """
    token_input = tf.keras.layers.Input(shape=(seq_len, 3))
    bpps_input = tf.keras.layers.Input(shape=(seq_len, 1))

    embedded = tf.keras.layers.Embedding(
        input_dim=len(token2int),
        output_dim=embed_dim,
    )(token_input)

    # Fold the channel axis into the embedding axis:
    # (batch, seq, channels, embed) -> (batch, seq, channels * embed).
    flat_shape = (-1, embedded.shape[1], embedded.shape[2] * embedded.shape[3])
    features = tf.reshape(embedded, shape=flat_shape)
    features = tf.keras.layers.SpatialDropout1D(.2)(features)

    # Lift the scalar per-position feature to embed_dim before joining it on.
    bpps_projected = tf.keras.layers.Dense(embed_dim, activation='linear')(bpps_input)
    features = tf.concat([features, bpps_projected], axis=2)

    # Three stacked recurrent layers of the selected flavour.
    make_recurrent = gru_layer if gru else lstm_layer
    hidden = features
    for _ in range(3):
        hidden = make_recurrent(hidden_dim, dropout)(hidden)

    # Predictions are only made for the leading part of each sequence.
    truncated = hidden[:, :pred_len]
    out = tf.keras.layers.Dense(5, activation='linear')(truncated)

    model = tf.keras.Model(inputs=[token_input, bpps_input], outputs=out)

    def MCRMSE(y_true, y_pred):
        # Mean squared error over positions (axis 1): one value per output column.
        per_column_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
        # Root of each column's MSE, averaged across the columns.
        return tf.reduce_mean(tf.sqrt(per_column_mse), axis=1)

    model.compile(optimizer=tf.optimizers.Adam(), loss=MCRMSE)
    return model

In [5]:
# Maps every character of the vocabulary string to its position in that string.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Encode the string columns of ``df`` as an integer token array.

    Every string in the selected columns is converted character by character
    through ``token2int``. All strings within one row must have the same
    length, and all rows must share that length, for the result to be a
    regular array.

    The encoding is done with plain list comprehensions instead of
    ``DataFrame.applymap``, which is deprecated in recent pandas releases and
    emits a ``FutureWarning`` there.

    Args:
        df: DataFrame holding the string columns named in ``cols``.
        cols: iterable of column names to encode, in output-channel order.
            The default list is never mutated.

    Returns:
        Integer ``np.ndarray`` of shape ``(n_rows, seq_len, n_cols)``.

    Raises:
        KeyError: if a string contains a character missing from ``token2int``
            or a requested column does not exist.
    """
    # Row-major nested lists: rows -> columns -> token ids.
    encoded_rows = [
        [[token2int[symbol] for symbol in text] for text in row]
        for row in df[list(cols)].values.tolist()
    ]
    # (n_rows, n_cols, seq_len) -> (n_rows, seq_len, n_cols)
    return np.transpose(np.array(encoded_rows), (0, 2, 1))

## Load and preprocess data

In [6]:
# Load the raw inputs. train/test are JSON-lines files (one record per line,
# hence lines=True); the sample submission supplies the `id_seqpos` rows that
# the final submission is merged against.
train = pd.read_json('../input//train.json', lines=True)
test = pd.read_json('../input//test.json', lines=True)
sample_df = pd.read_csv('../input//sample_submission.csv')

In [7]:
# Peek at the id stored under index label 0; it names the matching bpps file.
train['id'][0]

'id_001f94081'

In [8]:
# Scratch: load the saved bpps array for the first training id to inspect it.
foo = np.load('../input/bpps/id_001f94081.npy')

In [9]:
# One minus the sum along axis 1: a single value per position. The same
# reduction is applied to every sequence when the model inputs are built.
# (Presumably the probability that each base is unpaired - TODO confirm.)
1 - foo.sum(1)

array([0.80145771, 0.8162878 , 0.9399976 , 0.98687779, 0.98872014,
       0.8726042 , 0.64923491, 0.45909952, 0.38734203, 0.37291085,
       0.48373307, 0.89664275, 0.90951632, 0.90861451, 0.98940993,
       0.98145491, 0.81965108, 0.79170963, 0.38557835, 0.52880297,
       0.5372444 , 0.52122251, 0.90823082, 0.40080513, 0.43650824,
       0.54643881, 0.38305727, 0.53568993, 0.67795352, 0.8628183 ,
       0.78497647, 0.83994244, 0.78492865, 0.70795306, 0.73116841,
       0.82452095, 0.86113227, 0.52072924, 0.37224691, 0.20266188,
       0.26856484, 0.39608598, 0.14207184, 0.65970159, 0.65338135,
       0.80168215, 0.97622094, 0.40194761, 0.28440304, 0.09579655,
       0.23852457, 0.80521037, 0.70261301, 0.81546441, 0.94987859,
       0.92402902, 0.76050376, 0.63090152, 0.77919177, 0.73839201,
       0.61416194, 0.70487221, 0.35752507, 0.40452985, 0.65327547,
       0.58192038, 0.92482731, 0.95905864, 0.13151534, 0.09435281,
       0.0744179 , 0.05975572, 0.04247293, 0.05214685, 0.09151

In [10]:
# Label columns read from the training frame to build `train_labels`.
target_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [11]:
# Keep only the rows whose signal_to_noise exceeds 1, then derive both the
# token inputs and the labels from that same filtered frame.
train_filtered = train[train.signal_to_noise > 1]
train_inputs = preprocess_inputs(train_filtered)

# Each target cell holds a per-position list, so the raw array is
# (n_samples, n_targets, n_positions); swap the last two axes so targets
# come last.
raw_labels = np.array(train_filtered[target_cols].values.tolist())
train_labels = np.transpose(raw_labels, (0, 2, 1))

In [12]:
# For every training id, load its saved bpps array and reduce it to one value
# per position (one minus the sum along axis 1).
per_id_feature = [
    1 - np.load(f'../input/bpps/{seq_id}.npy').sum(1)
    for seq_id in train['id']
]
train_bpps = np.stack(per_id_feature)

# Apply the same signal_to_noise filter used for the inputs and labels, then
# append a trailing feature axis so the array matches the model's second input.
keep_mask = train.signal_to_noise > 1
train_bpps = train_bpps[keep_mask][:, :, np.newaxis]

In [13]:
from sklearn.model_selection import KFold

In [14]:
# Sanity check: labels are (n_samples, n_positions, n_targets) after the transpose.
train_labels.shape

(2096, 68, 5)

In [15]:
# zeros_like keeps the label shape, so it can serve as the out-of-fold prediction buffer.
np.zeros_like(train_labels).shape

(2096, 68, 5)

In [16]:
# Imported once here instead of on every fold iteration.
from sklearn.metrics import mean_squared_error

# Checkpoint file shared (and overwritten) by every fold. It is also the
# vehicle for copying the trained weights into the full-length inference models.
CHECKPOINT_PATH = 'tf_simple_lstm_large_noise_more_epochs_bpps_large_new_loss_815.h5'


def _load_bpps_feature(seq_ids):
    """Return ``1 - sum(axis=1)`` of each id's saved bpps array, shaped (n, seq_len, 1)."""
    feature = np.stack([1 - np.load(f'../input/bpps/{seq_id}.npy').sum(1) for seq_id in seq_ids])
    return feature[:, :, np.newaxis]


FOLDS = KFold(n_splits=5, random_state=815, shuffle=True)

# Out-of-fold predictions; each fold fills in its own validation rows.
oofs_pred = np.zeros_like(train_labels)
# Per-fold test predictions, kept in separate lists because the two test
# subsets have different sequence lengths (107 vs 130) and cannot be mixed.
public_preds_array = []
private_preds_array = []

# Everything below up to the loop is independent of the fold, so it is
# prepared once instead of being recomputed on every iteration.
public_df = test.query("seq_length == 107").copy()
private_df = test.query("seq_length == 130").copy()

public_inputs = preprocess_inputs(public_df)
private_inputs = preprocess_inputs(private_df)

public_bpps = _load_bpps_feature(public_df['id'])
private_bpps = _load_bpps_feature(private_df['id'])

# Caveat: The prediction format requires the output to be the same length as the input,
# although it's not the case for the training data. These models share the
# training architecture, so each fold's weights can simply be loaded into them.
model_short = build_model(seq_len=107, pred_len=107)
model_long = build_model(seq_len=130, pred_len=130)

for i, (trn_idx, vld_idx) in enumerate(FOLDS.split(train_inputs)):
    trn_inputs = train_inputs[trn_idx]
    vld_inputs = train_inputs[vld_idx]

    trn_inputs_bpps = train_bpps[trn_idx]
    vld_inputs_bpps = train_bpps[vld_idx]

    trn_labels = train_labels[trn_idx]
    vld_labels = train_labels[vld_idx]

    model = build_model()
    model.summary()

    history = model.fit(
        [trn_inputs, trn_inputs_bpps], trn_labels,
        validation_data=([vld_inputs, vld_inputs_bpps], vld_labels),
        batch_size=32,
        epochs=120,
        callbacks=[
            tf.keras.callbacks.ReduceLROnPlateau(),
            # NOTE(review): with the default save_best_only=False the file is
            # rewritten after every epoch, so the reload below restores the
            # last-epoch weights, not the best ones. Pass save_best_only=True
            # if best-epoch weights are what is wanted.
            tf.keras.callbacks.ModelCheckpoint(CHECKPOINT_PATH)
        ],
        verbose=2,
    )
    model.load_weights(CHECKPOINT_PATH)
    outputs = model.predict([vld_inputs, vld_inputs_bpps])
    oofs_pred[vld_idx] = outputs

    # Column-wise RMSE on the validation fold. The target column is the LAST
    # axis of (n_samples, n_positions, n_targets); indexing axis 1 would pick
    # sequence positions instead of targets.
    errors = [
        np.sqrt(mean_squared_error(vld_labels[:, :, col], outputs[:, :, col]))
        for col in range(outputs.shape[-1])
    ]
    final_error = np.mean(errors)
    print('#'*20, final_error)

    # Copy this fold's trained weights into the full-length inference models.
    model_short.load_weights(CHECKPOINT_PATH)
    model_long.load_weights(CHECKPOINT_PATH)

    public_preds = model_short.predict([public_inputs, public_bpps])
    private_preds = model_long.predict([private_inputs, private_bpps])

    public_preds_array.append(public_preds)
    private_preds_array.append(private_preds)

    print(public_preds.shape, private_preds.shape)

    # One frame per test sequence, keyed by "<id>_<position>" to match the
    # sample submission's id_seqpos column.
    preds_ls = []

    for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
        for idx, uid in enumerate(df.id):
            single_pred = preds[idx]

            single_df = pd.DataFrame(single_pred, columns=pred_cols)
            single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

            preds_ls.append(single_df)

    preds_df = pd.concat(preds_ls)

    # Merge on the sample submission to keep its rows and row order; one file per fold.
    submission = sample_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
    submission.to_csv(f'submission_tf_simple_lstm_large_noise_more_epochs_bpps_large_new_loss_815_{i}.csv', index=False)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 107, 3)]     0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 107, 3, 128)  1792        input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_Reshape (TensorFlow [(None, 107, 384)]   0           embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 107, 1)]     0                                            
_______________________________________________________________________________________

53/53 - 4s - loss: 0.0988 - val_loss: 0.2146
Epoch 92/120
53/53 - 4s - loss: 0.0988 - val_loss: 0.2146
Epoch 93/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 94/120
53/53 - 4s - loss: 0.0986 - val_loss: 0.2146
Epoch 95/120
53/53 - 5s - loss: 0.0988 - val_loss: 0.2146
Epoch 96/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 97/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 98/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 99/120
53/53 - 4s - loss: 0.0988 - val_loss: 0.2146
Epoch 100/120
53/53 - 4s - loss: 0.0988 - val_loss: 0.2146
Epoch 101/120
53/53 - 4s - loss: 0.0988 - val_loss: 0.2146
Epoch 102/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 103/120
53/53 - 4s - loss: 0.0989 - val_loss: 0.2146
Epoch 104/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 105/120
53/53 - 4s - loss: 0.0987 - val_loss: 0.2146
Epoch 106/120
53/53 - 4s - loss: 0.0988 - val_loss: 0.2146
Epoch 107/120
53/53 - 4s - loss: 0.0989 - val_loss: 0.2146
Epoch 108/120
53/53

Epoch 60/120
53/53 - 5s - loss: 0.1042 - val_loss: 0.2118
Epoch 61/120
53/53 - 4s - loss: 0.1043 - val_loss: 0.2119
Epoch 62/120
53/53 - 4s - loss: 0.1041 - val_loss: 0.2119
Epoch 63/120
53/53 - 4s - loss: 0.1041 - val_loss: 0.2118
Epoch 64/120
53/53 - 4s - loss: 0.1039 - val_loss: 0.2118
Epoch 65/120
53/53 - 4s - loss: 0.1041 - val_loss: 0.2118
Epoch 66/120
53/53 - 4s - loss: 0.1040 - val_loss: 0.2118
Epoch 67/120
53/53 - 4s - loss: 0.1039 - val_loss: 0.2118
Epoch 68/120
53/53 - 4s - loss: 0.1041 - val_loss: 0.2118
Epoch 69/120
53/53 - 4s - loss: 0.1040 - val_loss: 0.2118
Epoch 70/120
53/53 - 4s - loss: 0.1040 - val_loss: 0.2118
Epoch 71/120
53/53 - 4s - loss: 0.1040 - val_loss: 0.2118
Epoch 72/120
53/53 - 4s - loss: 0.1039 - val_loss: 0.2118
Epoch 73/120
53/53 - 4s - loss: 0.1040 - val_loss: 0.2118
Epoch 74/120
53/53 - 4s - loss: 0.1038 - val_loss: 0.2118
Epoch 75/120
53/53 - 4s - loss: 0.1038 - val_loss: 0.2118
Epoch 76/120
53/53 - 4s - loss: 0.1039 - val_loss: 0.2118
Epoch 77/120
5

Epoch 29/120
53/53 - 4s - loss: 0.1539 - val_loss: 0.2187
Epoch 30/120
53/53 - 4s - loss: 0.1504 - val_loss: 0.2206
Epoch 31/120
53/53 - 5s - loss: 0.1469 - val_loss: 0.2196
Epoch 32/120
53/53 - 4s - loss: 0.1448 - val_loss: 0.2198
Epoch 33/120
53/53 - 4s - loss: 0.1420 - val_loss: 0.2204
Epoch 34/120
53/53 - 4s - loss: 0.1393 - val_loss: 0.2202
Epoch 35/120
53/53 - 4s - loss: 0.1369 - val_loss: 0.2197
Epoch 36/120
53/53 - 4s - loss: 0.1347 - val_loss: 0.2188
Epoch 37/120
53/53 - 4s - loss: 0.1322 - val_loss: 0.2190
Epoch 38/120
53/53 - 4s - loss: 0.1300 - val_loss: 0.2187
Epoch 39/120
53/53 - 4s - loss: 0.1278 - val_loss: 0.2205
Epoch 40/120
53/53 - 4s - loss: 0.1203 - val_loss: 0.2175
Epoch 41/120
53/53 - 4s - loss: 0.1163 - val_loss: 0.2170
Epoch 42/120
53/53 - 4s - loss: 0.1143 - val_loss: 0.2167
Epoch 43/120
53/53 - 4s - loss: 0.1130 - val_loss: 0.2169
Epoch 44/120
53/53 - 4s - loss: 0.1120 - val_loss: 0.2169
Epoch 45/120
53/53 - 4s - loss: 0.1113 - val_loss: 0.2171
Epoch 46/120
5

53/53 - 6s - loss: 0.4347 - val_loss: 0.3762
Epoch 2/120
53/53 - 4s - loss: 0.3595 - val_loss: 0.3458
Epoch 3/120
53/53 - 4s - loss: 0.3385 - val_loss: 0.3314
Epoch 4/120
53/53 - 4s - loss: 0.3241 - val_loss: 0.3170
Epoch 5/120
53/53 - 5s - loss: 0.3118 - val_loss: 0.3047
Epoch 6/120
53/53 - 4s - loss: 0.2975 - val_loss: 0.2920
Epoch 7/120
53/53 - 4s - loss: 0.2854 - val_loss: 0.2775
Epoch 8/120
53/53 - 4s - loss: 0.2697 - val_loss: 0.2701
Epoch 9/120
53/53 - 4s - loss: 0.2595 - val_loss: 0.2576
Epoch 10/120
53/53 - 4s - loss: 0.2491 - val_loss: 0.2493
Epoch 11/120
53/53 - 4s - loss: 0.2424 - val_loss: 0.2463
Epoch 12/120
53/53 - 4s - loss: 0.2348 - val_loss: 0.2375
Epoch 13/120
53/53 - 4s - loss: 0.2285 - val_loss: 0.2361
Epoch 14/120
53/53 - 5s - loss: 0.2237 - val_loss: 0.2336
Epoch 15/120
53/53 - 4s - loss: 0.2167 - val_loss: 0.2307
Epoch 16/120
53/53 - 4s - loss: 0.2123 - val_loss: 0.2299
Epoch 17/120
53/53 - 4s - loss: 0.2076 - val_loss: 0.2281
Epoch 18/120
53/53 - 4s - loss: 0.2

53/53 - 6s - loss: 0.4396 - val_loss: 0.3765
Epoch 2/120
53/53 - 4s - loss: 0.3591 - val_loss: 0.3451
Epoch 3/120
53/53 - 4s - loss: 0.3377 - val_loss: 0.3285
Epoch 4/120
53/53 - 4s - loss: 0.3213 - val_loss: 0.3204
Epoch 5/120
53/53 - 4s - loss: 0.3103 - val_loss: 0.3003
Epoch 6/120
53/53 - 4s - loss: 0.2938 - val_loss: 0.2873
Epoch 7/120
53/53 - 4s - loss: 0.2799 - val_loss: 0.2764
Epoch 8/120
53/53 - 4s - loss: 0.2665 - val_loss: 0.2631
Epoch 9/120
53/53 - 5s - loss: 0.2574 - val_loss: 0.2570
Epoch 10/120
53/53 - 4s - loss: 0.2473 - val_loss: 0.2495
Epoch 11/120
53/53 - 4s - loss: 0.2393 - val_loss: 0.2448
Epoch 12/120
53/53 - 4s - loss: 0.2336 - val_loss: 0.2395
Epoch 13/120
53/53 - 4s - loss: 0.2277 - val_loss: 0.2362
Epoch 14/120
53/53 - 4s - loss: 0.2213 - val_loss: 0.2316
Epoch 15/120
53/53 - 4s - loss: 0.2173 - val_loss: 0.2296
Epoch 16/120
53/53 - 4s - loss: 0.2104 - val_loss: 0.2251
Epoch 17/120
53/53 - 4s - loss: 0.2044 - val_loss: 0.2300
Epoch 18/120
53/53 - 4s - loss: 0.1

In [17]:
# for i, uid in enumerate(train.id):
#     single_pred = oofs_pred[i]

#     oof_df = pd.DataFrame(single_pred, columns=pred_cols)
#     oof_df['id_seqpos'] = [f'{uid}_{x}' for x in range(oof_df.shape[0])]