In [1]:
import pandas as pd
import numpy as np
import json
import tensorflow.keras.layers as L
import tensorflow as tf
import plotly.express as px

## Define helper functions and useful vars

In [2]:
import os
# Configure the CUDA device ordering and restrict the visible devices to "2".
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "2",
})

In [3]:
# Names of the columns the model predicts, in output order.
pred_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [4]:
def gru_layer(hidden_dim, dropout):
    """Return a bidirectional GRU layer that emits one vector per time step.

    Args:
        hidden_dim: Number of units of the wrapped GRU.
        dropout: Dropout rate forwarded to the wrapped GRU.
    """
    recurrent = tf.keras.layers.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

def lstm_layer(hidden_dim, dropout):
    """Return a bidirectional LSTM layer that emits one vector per time step.

    Args:
        hidden_dim: Number of units of the wrapped LSTM.
        dropout: Dropout rate forwarded to the wrapped LSTM.
    """
    recurrent = tf.keras.layers.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to encode.
        d_model: Width of the encoding (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model): sine on the even
        columns and cosine on the odd columns of the angles from `get_angles`.
    """
    row_index = np.arange(position)[:, np.newaxis]
    col_index = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_index, col_index, d_model)

    # even columns (2i) carry the sine, odd columns (2i+1) the cosine
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # leading unit axis so the table broadcasts over the batch dimension
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

def get_angles(pos, i, d_model):
    """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    Args:
        pos: Position indices (broadcast against ``i``).
        i: Encoding-dimension indices; each even/odd pair shares one rate.
        d_model: Total width of the encoding.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))


def build_model(gru=False,seq_len=107, pred_len=68, dropout=0.25,
                embed_dim=128, hidden_dim=384):
    """Build and compile the embedding -> transformer -> 3x BiGRU model.

    Args:
        gru: Currently ignored; the recurrent stack is always built with
            `gru_layer`. Kept so existing callers keep working.
        seq_len: Length of the input sequences.
        pred_len: Number of leading positions for which outputs are produced.
        dropout: Dropout rate passed to every recurrent layer.
        embed_dim: Width of each of the three token embeddings and of the
            dense projection of the bpps input.
        hidden_dim: Units per direction of each recurrent layer.

    Returns:
        A compiled `tf.keras.Model` taking ``[tokens, bpps]`` with shapes
        ``(seq_len, 3)`` and ``(seq_len, 1)`` and returning two tensors
        (`out1`, `out2`) of shape ``(pred_len, 5)``. `out1` is trained with
        MCRMSE (weight 0.7) and `out2` with MAE (weight 0.3).
    """
    # Widths derived from embed_dim instead of being hard-coded: the three
    # token embeddings are concatenated (3 * embed_dim) and the bpps projection
    # adds one more embed_dim. With the defaults these equal the previous
    # literals (384 and 512).
    token_dim = 3 * embed_dim
    model_dim = 4 * embed_dim

    inputs = tf.keras.layers.Input(shape=(seq_len, 3))
    inputs_bpps = tf.keras.layers.Input(shape=(seq_len, 1))

    # Must match the width of the concatenated token embeddings it is added to.
    pos_encoding = positional_encoding(seq_len, token_dim)

    # One embedding table per input channel.
    embed0 = tf.keras.layers.Embedding(input_dim=len(token2int0), output_dim=embed_dim)(inputs[:, :, 0])
    embed1 = tf.keras.layers.Embedding(input_dim=len(token2int1), output_dim=embed_dim)(inputs[:, :, 1])
    embed2 = tf.keras.layers.Embedding(input_dim=len(token2int2), output_dim=embed_dim)(inputs[:, :, 2])

    embed0 = tf.keras.layers.SpatialDropout1D(.2)(embed0)
    embed1 = tf.keras.layers.SpatialDropout1D(.2)(embed1)
    embed2 = tf.keras.layers.SpatialDropout1D(.2)(embed2)

    embed = tf.concat([embed0, embed1, embed2], axis=2)
    embed += pos_encoding

    embed = tf.keras.layers.SpatialDropout1D(.2)(embed)
    bpps = tf.keras.layers.Dense(embed_dim, activation='linear')(inputs_bpps)

    embed = tf.concat([embed, bpps], axis=2)

    # The block adds its input to its attention output, so its width has to
    # equal the width of `embed` (model_dim).
    transformer_block = TransformerBlock(model_dim, 8, 512)
    embed = transformer_block(embed)

    hidden = gru_layer(hidden_dim, dropout)(embed)
    hidden = gru_layer(hidden_dim, dropout)(hidden)
    hidden = gru_layer(hidden_dim, dropout)(hidden)

    # only making predictions on the first part of each sequence
    truncated = hidden[:, :pred_len]

    out1 = tf.keras.layers.Dense(5, activation='linear', name='out1')(truncated)
    out2 = tf.keras.layers.Dense(5, activation='linear', name='out2')(truncated)

    model = tf.keras.Model(inputs=[inputs, inputs_bpps], outputs=[out1, out2])

    adam = tf.optimizers.Adam()

    def MCRMSE(y_true, y_pred):
        # mean squared error over axis 1, then the mean of the per-column RMSEs
        colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
        return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

    model.compile(optimizer = adam, loss={'out1': MCRMSE, 'out2': 'mae'}, loss_weights={'out1': 0.7, 'out2': 0.3})

    return model

In [5]:
# Single shared vocabulary: every character of the string maps to its index.
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Integer-encode string columns of ``df`` with `token2int`.

    Args:
        df: DataFrame whose ``cols`` hold equal-length strings made of
            characters present in `token2int`.
        cols: Column names to encode (any iterable of names). The default
            list is never mutated.

    Returns:
        An integer array of shape ``(len(df), string length, len(cols))``.
        ``df`` itself is left unchanged.

    Raises:
        KeyError: If a character is missing from `token2int`.
    """
    # A plain comprehension replaces DataFrame.applymap, which is deprecated
    # in recent pandas releases; the produced nested list is the same.
    encoded = [
        [[token2int[ch] for ch in seq] for seq in row]
        for row in df[list(cols)].values.tolist()
    ]
    # (rows, columns, positions) -> (rows, positions, columns)
    return np.transpose(np.array(encoded), (0, 2, 1))

## Load and preprocess data

In [6]:
# Submission template first, then the two line-delimited JSON splits.
sample_df = pd.read_csv('../input//sample_submission.csv')
test = pd.read_json('../input//test.json', lines=True)
train = pd.read_json('../input//train.json', lines=True)

In [7]:
# Label columns read from the training frame, in output order.
target_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_pH10',
    'deg_Mg_50C',
    'deg_50C',
]

In [8]:
# One vocabulary per input channel.
token2int0 = {
    'G': 0, 'A': 1, 'C': 2, 'U': 3,
}
token2int1 = {
    '.': 0, '(': 1, ')': 2,
}
token2int2 = {
    'E': 0, 'S': 1, 'H': 2, 'B': 3, 'X': 4, 'I': 5, 'M': 6,
}

def convert_seq(x, tmp_dict):
    """Look every element of ``x`` up in ``tmp_dict`` and return the results as a list."""
    return [tmp_dict[symbol] for symbol in x]

# Replace each string column of `train` by its integer encoding (in place).
for column, vocab in (
    ('sequence', token2int0),
    ('structure', token2int1),
    ('predicted_loop_type', token2int2),
):
    train[column] = train[column].apply(lambda seq, table=vocab: convert_seq(seq, table))

# Stack the three channels and move the channel axis last.
train_inputs = np.array(
    train[['sequence', 'structure', 'predicted_loop_type']].values.tolist()
).transpose((0, 2, 1))

# Keep only the rows whose signal_to_noise exceeds 1, for inputs and labels alike.
train_inputs = train_inputs[train.signal_to_noise > 1]
train_labels = np.transpose(
    np.array(train[train.signal_to_noise > 1][target_cols].values.tolist()),
    (0, 2, 1),
)

In [9]:
# For every training id, load ../input/bpps/<id>.npy and keep 1 - matrix.sum(1)
# (presumably a per-position "unpaired" score; confirm against the data docs).
bpps_rows = [1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in train['id']]

# Apply the same signal_to_noise filter as the inputs and add a trailing unit axis.
train_bpps = np.expand_dims(np.stack(bpps_rows)[train.signal_to_noise > 1], axis=2)

In [10]:
from sklearn.model_selection import KFold

In [11]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    Args:
        embed_dim: Width of the input and output; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads=8):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        # Sub-layers are created in a fixed order: query, key, value, output.
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(softmax(q k^T / sqrt(d)) v, attention weights)``."""
        logits = tf.matmul(query, key, transpose_b=True)
        depth = tf.cast(tf.shape(key)[-1], tf.float32)
        attn_weights = tf.nn.softmax(logits / tf.math.sqrt(depth), axis=-1)
        return tf.matmul(attn_weights, value), attn_weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, embed) into (batch, heads, seq, projection_dim)."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs: (batch_size, seq_len, embed_dim)
        batch_size = tf.shape(inputs)[0]

        # Linear projections, each (batch_size, seq_len, embed_dim) ...
        projected = [
            dense(inputs)
            for dense in (self.query_dense, self.key_dense, self.value_dense)
        ]
        # ... then split into heads: (batch_size, num_heads, seq_len, projection_dim)
        query, key, value = [self.separate_heads(t, batch_size) for t in projected]

        context, _ = self.attention(query, key, value)
        # back to (batch_size, seq_len, num_heads, projection_dim)
        context = tf.transpose(context, perm=[0, 2, 1, 3])
        # merge the heads: (batch_size, seq_len, embed_dim)
        merged = tf.reshape(context, (batch_size, -1, self.embed_dim))
        return self.combine_heads(merged)
    
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-blocks is followed by dropout, a residual addition and
    layer normalisation.

    Args:
        embed_dim: Width of the input/output; must equal the last dimension of
            the tensor passed to `call` because of the residual additions.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden feed-forward layer.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept so that get_config() can describe how to rebuild the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        # `training` defaults to None so the layer can also be called directly
        # without the flag; the dropout layers then decide for themselves.
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base config plus the constructor arguments.

        Without the constructor arguments the config could not be passed back
        to ``__init__`` to re-create the layer.
        """
        config = super().get_config().copy()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config
    
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows of the position-embedding table.
        vocab_size: Number of rows of the token-embedding table.
        embed_dim: Width of both embeddings.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # positions 0 .. (size of the last axis of x) - 1
        position_ids = tf.range(start=0, limit=tf.shape(x)[-1], delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [12]:
# Sanity check of the filtered label array; the recorded output is (2096, 68, 5),
# i.e. the last axis has one entry per column in target_cols.
train_labels.shape

(2096, 68, 5)

In [13]:
# Confirms that np.zeros_like(train_labels), used below as the out-of-fold
# prediction buffer, has the same shape as the labels.
np.zeros_like(train_labels).shape

(2096, 68, 5)

In [14]:
from sklearn.metrics import mean_squared_error

# Every fold writes its checkpoint to, and reloads it from, this one file.
CHECKPOINT_PATH = 'tf_simple_lstm_large_noise_more_epochs_bpps_large_new_loss_transformer_threeEmbedding_gru_twoloss_posEnc_BeforeConcat_815.h5'
INPUT_COLS = ['sequence', 'structure', 'predicted_loop_type']


def _encode_test_inputs(df):
    """Integer-encode the three input columns of ``df`` in place and return them stacked with the column axis last."""
    for col, vocab in zip(INPUT_COLS, (token2int0, token2int1, token2int2)):
        df[col] = df[col].apply(lambda seq, vocab=vocab: [vocab[ele] for ele in seq])
    return np.transpose(np.array(df[INPUT_COLS].values.tolist()), (0, 2, 1))


def _load_bpps_feature(ids):
    """Load ../input/bpps/<id>.npy for every id and return 1 - matrix.sum(1) with a trailing unit axis."""
    return np.stack([1 - np.load(f'../input/bpps/{ele}.npy').sum(1) for ele in ids])[:, :, np.newaxis]


FOLDS = KFold(n_splits=5, random_state=815, shuffle=True)

oofs_pred = np.zeros_like(train_labels)
# One entry per fold in each list. Private predictions get their own list
# instead of being appended to the public one.
public_preds_array = []
private_preds_array = []

# The test-set features do not depend on the fold, so they are built once.
public_df = test.query("seq_length == 107").copy()
private_df = test.query("seq_length == 130").copy()

public_inputs = _encode_test_inputs(public_df)
private_inputs = _encode_test_inputs(private_df)

public_bpps = _load_bpps_feature(public_df['id'])
private_bpps = _load_bpps_feature(private_df['id'])

for i, (trn_idx, vld_idx) in enumerate(FOLDS.split(train_inputs)):
    trn_inputs = train_inputs[trn_idx]
    vld_inputs = train_inputs[vld_idx]

    trn_inputs_bpps = train_bpps[trn_idx]
    vld_inputs_bpps = train_bpps[vld_idx]

    trn_labels = train_labels[trn_idx]
    vld_labels = train_labels[vld_idx]

    model = build_model()
    model.summary()

    history = model.fit(
        [trn_inputs, trn_inputs_bpps], trn_labels,
        validation_data=([vld_inputs, vld_inputs_bpps], vld_labels),
        batch_size=32,
        epochs=120,
        callbacks=[
            tf.keras.callbacks.ReduceLROnPlateau(),
            # NOTE(review): no save_best_only=True is passed, so the file
            # reloaded below presumably holds the final-epoch weights rather
            # than the best val_loss ones - confirm this is intended.
            tf.keras.callbacks.ModelCheckpoint(CHECKPOINT_PATH)
        ],
        verbose=2,
    )
    model.load_weights(CHECKPOINT_PATH)
    outputs, outputs2 = model.predict([vld_inputs, vld_inputs_bpps])
    oofs_pred[vld_idx] = outputs

    # Column-wise RMSE: the target columns live on the LAST axis, so slice
    # [:, :, col] (slicing [:, col] would pick sequence positions instead).
    errors = [
        np.sqrt(mean_squared_error(vld_labels[:, :, col], outputs[:, :, col]))
        for col in range(vld_labels.shape[-1])
    ]
    final_error = np.mean(errors)
    print('#'*20, final_error)

    # Caveat: The prediction format requires the output to be the same length as the input,
    # although it's not the case for the training data.
    model_short = build_model(seq_len=107, pred_len=107)
    model_long = build_model(seq_len=130, pred_len=130)

    model_short.load_weights(CHECKPOINT_PATH)
    model_long.load_weights(CHECKPOINT_PATH)

    public_preds, outputs2 = model_short.predict([public_inputs, public_bpps])
    private_preds, outputs2 = model_long.predict([private_inputs, private_bpps])

    public_preds_array.append(public_preds)
    private_preds_array.append(private_preds)

    print(public_preds.shape, private_preds.shape)

    # One small frame per test sequence, keyed by "<id>_<position>".
    preds_ls = []

    for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
        for idx, uid in enumerate(df.id):
            single_pred = preds[idx]

            single_df = pd.DataFrame(single_pred, columns=pred_cols)
            single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

            preds_ls.append(single_df)

    preds_df = pd.concat(preds_ls)

    # Align the predictions with the row order of the sample submission.
    submission = sample_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
    submission.to_csv(f'submission_tf_simple_lstm_large_noise_more_epochs_bpps_large_new_loss_transformer_threeEmbedding_gru_twoloss_posEnc_BeforeConcat_815_{i}.csv', index=False)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 107, 3)]     0                                            
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [(None, 107)]        0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_1 (Te [(None, 107)]        0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice_2 (Te [(None, 107)]        0           input_1[0][0]                    
_______________________________________________________________________________________

Epoch 21/120
53/53 - 5s - loss: 0.2077 - out1_loss: 0.2305 - out2_loss: 0.1546 - val_loss: 0.2126 - val_out1_loss: 0.2355 - val_out2_loss: 0.1591
Epoch 22/120
53/53 - 5s - loss: 0.2051 - out1_loss: 0.2275 - out2_loss: 0.1528 - val_loss: 0.2118 - val_out1_loss: 0.2353 - val_out2_loss: 0.1570
Epoch 23/120
53/53 - 5s - loss: 0.2029 - out1_loss: 0.2252 - out2_loss: 0.1510 - val_loss: 0.2110 - val_out1_loss: 0.2342 - val_out2_loss: 0.1568
Epoch 24/120
53/53 - 5s - loss: 0.1981 - out1_loss: 0.2196 - out2_loss: 0.1479 - val_loss: 0.2131 - val_out1_loss: 0.2362 - val_out2_loss: 0.1591
Epoch 25/120
53/53 - 5s - loss: 0.1967 - out1_loss: 0.2182 - out2_loss: 0.1464 - val_loss: 0.2072 - val_out1_loss: 0.2302 - val_out2_loss: 0.1535
Epoch 26/120
53/53 - 4s - loss: 0.1940 - out1_loss: 0.2146 - out2_loss: 0.1457 - val_loss: 0.2098 - val_out1_loss: 0.2314 - val_out2_loss: 0.1595
Epoch 27/120
53/53 - 5s - loss: 0.1900 - out1_loss: 0.2101 - out2_loss: 0.1428 - val_loss: 0.2049 - val_out1_loss: 0.2262 - 

Epoch 78/120
53/53 - 5s - loss: 0.1229 - out1_loss: 0.1337 - out2_loss: 0.0977 - val_loss: 0.1908 - val_out1_loss: 0.2121 - val_out2_loss: 0.1409
Epoch 79/120
53/53 - 5s - loss: 0.1228 - out1_loss: 0.1336 - out2_loss: 0.0977 - val_loss: 0.1907 - val_out1_loss: 0.2121 - val_out2_loss: 0.1409
Epoch 80/120
53/53 - 5s - loss: 0.1229 - out1_loss: 0.1336 - out2_loss: 0.0978 - val_loss: 0.1908 - val_out1_loss: 0.2121 - val_out2_loss: 0.1409
Epoch 81/120
53/53 - 5s - loss: 0.1226 - out1_loss: 0.1334 - out2_loss: 0.0976 - val_loss: 0.1907 - val_out1_loss: 0.2120 - val_out2_loss: 0.1408
Epoch 82/120
53/53 - 5s - loss: 0.1227 - out1_loss: 0.1334 - out2_loss: 0.0977 - val_loss: 0.1908 - val_out1_loss: 0.2122 - val_out2_loss: 0.1409
Epoch 83/120
53/53 - 5s - loss: 0.1225 - out1_loss: 0.1333 - out2_loss: 0.0975 - val_loss: 0.1907 - val_out1_loss: 0.2121 - val_out2_loss: 0.1408
Epoch 84/120
53/53 - 4s - loss: 0.1224 - out1_loss: 0.1331 - out2_loss: 0.0975 - val_loss: 0.1907 - val_out1_loss: 0.2121 - 

53/53 - 7s - loss: 0.6950 - out1_loss: 0.7698 - out2_loss: 0.5204 - val_loss: 0.4030 - val_out1_loss: 0.4370 - val_out2_loss: 0.3239
Epoch 2/120
53/53 - 4s - loss: 0.3928 - out1_loss: 0.4316 - out2_loss: 0.3022 - val_loss: 0.3675 - val_out1_loss: 0.4066 - val_out2_loss: 0.2762
Epoch 3/120
53/53 - 4s - loss: 0.3531 - out1_loss: 0.3922 - out2_loss: 0.2616 - val_loss: 0.3474 - val_out1_loss: 0.3862 - val_out2_loss: 0.2570
Epoch 4/120
53/53 - 5s - loss: 0.3386 - out1_loss: 0.3751 - out2_loss: 0.2534 - val_loss: 0.3397 - val_out1_loss: 0.3766 - val_out2_loss: 0.2536
Epoch 5/120
53/53 - 4s - loss: 0.3263 - out1_loss: 0.3628 - out2_loss: 0.2411 - val_loss: 0.3151 - val_out1_loss: 0.3517 - val_out2_loss: 0.2297
Epoch 6/120
53/53 - 4s - loss: 0.3112 - out1_loss: 0.3473 - out2_loss: 0.2268 - val_loss: 0.2963 - val_out1_loss: 0.3303 - val_out2_loss: 0.2169
Epoch 7/120
53/53 - 4s - loss: 0.2973 - out1_loss: 0.3309 - out2_loss: 0.2190 - val_loss: 0.2793 - val_out1_loss: 0.3112 - val_out2_loss: 0.20

Epoch 58/120
53/53 - 4s - loss: 0.1358 - out1_loss: 0.1483 - out2_loss: 0.1066 - val_loss: 0.1889 - val_out1_loss: 0.2106 - val_out2_loss: 0.1384
Epoch 59/120
53/53 - 4s - loss: 0.1329 - out1_loss: 0.1450 - out2_loss: 0.1047 - val_loss: 0.1886 - val_out1_loss: 0.2102 - val_out2_loss: 0.1381
Epoch 60/120
53/53 - 5s - loss: 0.1320 - out1_loss: 0.1439 - out2_loss: 0.1041 - val_loss: 0.1885 - val_out1_loss: 0.2101 - val_out2_loss: 0.1379
Epoch 61/120
53/53 - 4s - loss: 0.1312 - out1_loss: 0.1430 - out2_loss: 0.1036 - val_loss: 0.1883 - val_out1_loss: 0.2099 - val_out2_loss: 0.1379
Epoch 62/120
53/53 - 5s - loss: 0.1306 - out1_loss: 0.1424 - out2_loss: 0.1033 - val_loss: 0.1882 - val_out1_loss: 0.2098 - val_out2_loss: 0.1377
Epoch 63/120
53/53 - 5s - loss: 0.1302 - out1_loss: 0.1419 - out2_loss: 0.1029 - val_loss: 0.1881 - val_out1_loss: 0.2097 - val_out2_loss: 0.1376
Epoch 64/120
53/53 - 5s - loss: 0.1298 - out1_loss: 0.1414 - out2_loss: 0.1027 - val_loss: 0.1880 - val_out1_loss: 0.2096 - 

53/53 - 5s - loss: 0.1246 - out1_loss: 0.1355 - out2_loss: 0.0992 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out2_loss: 0.1370
Epoch 115/120
53/53 - 4s - loss: 0.1244 - out1_loss: 0.1353 - out2_loss: 0.0990 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out2_loss: 0.1370
Epoch 116/120
53/53 - 5s - loss: 0.1244 - out1_loss: 0.1353 - out2_loss: 0.0990 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out2_loss: 0.1370
Epoch 117/120
53/53 - 4s - loss: 0.1246 - out1_loss: 0.1355 - out2_loss: 0.0991 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out2_loss: 0.1370
Epoch 118/120
53/53 - 4s - loss: 0.1246 - out1_loss: 0.1355 - out2_loss: 0.0991 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out2_loss: 0.1370
Epoch 119/120
53/53 - 4s - loss: 0.1245 - out1_loss: 0.1354 - out2_loss: 0.0990 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out2_loss: 0.1370
Epoch 120/120
53/53 - 4s - loss: 0.1247 - out1_loss: 0.1357 - out2_loss: 0.0990 - val_loss: 0.1874 - val_out1_loss: 0.2090 - val_out

Epoch 14/120
53/53 - 5s - loss: 0.2451 - out1_loss: 0.2725 - out2_loss: 0.1812 - val_loss: 0.2386 - val_out1_loss: 0.2655 - val_out2_loss: 0.1757
Epoch 15/120
53/53 - 5s - loss: 0.2373 - out1_loss: 0.2635 - out2_loss: 0.1761 - val_loss: 0.2322 - val_out1_loss: 0.2588 - val_out2_loss: 0.1702
Epoch 16/120
53/53 - 4s - loss: 0.2333 - out1_loss: 0.2586 - out2_loss: 0.1744 - val_loss: 0.2290 - val_out1_loss: 0.2550 - val_out2_loss: 0.1684
Epoch 17/120
53/53 - 5s - loss: 0.2267 - out1_loss: 0.2516 - out2_loss: 0.1686 - val_loss: 0.2246 - val_out1_loss: 0.2505 - val_out2_loss: 0.1641
Epoch 18/120
53/53 - 5s - loss: 0.2205 - out1_loss: 0.2447 - out2_loss: 0.1643 - val_loss: 0.2214 - val_out1_loss: 0.2463 - val_out2_loss: 0.1633
Epoch 19/120
53/53 - 4s - loss: 0.2160 - out1_loss: 0.2398 - out2_loss: 0.1605 - val_loss: 0.2212 - val_out1_loss: 0.2464 - val_out2_loss: 0.1624
Epoch 20/120
53/53 - 5s - loss: 0.2130 - out1_loss: 0.2362 - out2_loss: 0.1587 - val_loss: 0.2151 - val_out1_loss: 0.2398 - 

Epoch 71/120
53/53 - 5s - loss: 0.1292 - out1_loss: 0.1408 - out2_loss: 0.1021 - val_loss: 0.1928 - val_out1_loss: 0.2151 - val_out2_loss: 0.1407
Epoch 72/120
53/53 - 5s - loss: 0.1289 - out1_loss: 0.1404 - out2_loss: 0.1019 - val_loss: 0.1930 - val_out1_loss: 0.2154 - val_out2_loss: 0.1409
Epoch 73/120
53/53 - 5s - loss: 0.1285 - out1_loss: 0.1400 - out2_loss: 0.1018 - val_loss: 0.1931 - val_out1_loss: 0.2154 - val_out2_loss: 0.1410
Epoch 74/120
53/53 - 5s - loss: 0.1285 - out1_loss: 0.1400 - out2_loss: 0.1016 - val_loss: 0.1930 - val_out1_loss: 0.2154 - val_out2_loss: 0.1408
Epoch 75/120
53/53 - 5s - loss: 0.1282 - out1_loss: 0.1396 - out2_loss: 0.1015 - val_loss: 0.1929 - val_out1_loss: 0.2152 - val_out2_loss: 0.1407
Epoch 76/120
53/53 - 4s - loss: 0.1279 - out1_loss: 0.1393 - out2_loss: 0.1013 - val_loss: 0.1930 - val_out1_loss: 0.2153 - val_out2_loss: 0.1409
Epoch 77/120
53/53 - 5s - loss: 0.1278 - out1_loss: 0.1392 - out2_loss: 0.1012 - val_loss: 0.1927 - val_out1_loss: 0.2150 - 

53/53 - 7s - loss: 0.6916 - out1_loss: 0.7423 - out2_loss: 0.5734 - val_loss: 0.3996 - val_out1_loss: 0.4358 - val_out2_loss: 0.3150
Epoch 2/120
53/53 - 5s - loss: 0.3838 - out1_loss: 0.4239 - out2_loss: 0.2903 - val_loss: 0.3566 - val_out1_loss: 0.3972 - val_out2_loss: 0.2619
Epoch 3/120
53/53 - 4s - loss: 0.3508 - out1_loss: 0.3896 - out2_loss: 0.2603 - val_loss: 0.3502 - val_out1_loss: 0.3908 - val_out2_loss: 0.2555
Epoch 4/120
53/53 - 4s - loss: 0.3352 - out1_loss: 0.3720 - out2_loss: 0.2495 - val_loss: 0.3250 - val_out1_loss: 0.3617 - val_out2_loss: 0.2394
Epoch 5/120
53/53 - 5s - loss: 0.3228 - out1_loss: 0.3590 - out2_loss: 0.2385 - val_loss: 0.3240 - val_out1_loss: 0.3572 - val_out2_loss: 0.2467
Epoch 6/120
53/53 - 5s - loss: 0.3066 - out1_loss: 0.3420 - out2_loss: 0.2241 - val_loss: 0.3017 - val_out1_loss: 0.3404 - val_out2_loss: 0.2113
Epoch 7/120
53/53 - 5s - loss: 0.2958 - out1_loss: 0.3298 - out2_loss: 0.2164 - val_loss: 0.2862 - val_out1_loss: 0.3200 - val_out2_loss: 0.20

Epoch 58/120
53/53 - 4s - loss: 0.1411 - out1_loss: 0.1540 - out2_loss: 0.1111 - val_loss: 0.1968 - val_out1_loss: 0.2202 - val_out2_loss: 0.1422
Epoch 59/120
53/53 - 4s - loss: 0.1395 - out1_loss: 0.1525 - out2_loss: 0.1094 - val_loss: 0.1966 - val_out1_loss: 0.2199 - val_out2_loss: 0.1422
Epoch 60/120
53/53 - 5s - loss: 0.1397 - out1_loss: 0.1522 - out2_loss: 0.1104 - val_loss: 0.1967 - val_out1_loss: 0.2198 - val_out2_loss: 0.1429
Epoch 61/120
53/53 - 4s - loss: 0.1391 - out1_loss: 0.1514 - out2_loss: 0.1106 - val_loss: 0.2014 - val_out1_loss: 0.2252 - val_out2_loss: 0.1457
Epoch 62/120
53/53 - 4s - loss: 0.1378 - out1_loss: 0.1505 - out2_loss: 0.1083 - val_loss: 0.1974 - val_out1_loss: 0.2204 - val_out2_loss: 0.1438
Epoch 63/120
53/53 - 4s - loss: 0.1369 - out1_loss: 0.1493 - out2_loss: 0.1079 - val_loss: 0.1969 - val_out1_loss: 0.2200 - val_out2_loss: 0.1431
Epoch 64/120
53/53 - 5s - loss: 0.1357 - out1_loss: 0.1480 - out2_loss: 0.1070 - val_loss: 0.1969 - val_out1_loss: 0.2199 - 

53/53 - 5s - loss: 0.1187 - out1_loss: 0.1287 - out2_loss: 0.0951 - val_loss: 0.1927 - val_out1_loss: 0.2157 - val_out2_loss: 0.1391
Epoch 115/120
53/53 - 5s - loss: 0.1185 - out1_loss: 0.1286 - out2_loss: 0.0951 - val_loss: 0.1928 - val_out1_loss: 0.2157 - val_out2_loss: 0.1391
Epoch 116/120
53/53 - 5s - loss: 0.1187 - out1_loss: 0.1287 - out2_loss: 0.0951 - val_loss: 0.1928 - val_out1_loss: 0.2157 - val_out2_loss: 0.1391
Epoch 117/120
53/53 - 5s - loss: 0.1187 - out1_loss: 0.1287 - out2_loss: 0.0952 - val_loss: 0.1928 - val_out1_loss: 0.2157 - val_out2_loss: 0.1391
Epoch 118/120
53/53 - 4s - loss: 0.1186 - out1_loss: 0.1287 - out2_loss: 0.0951 - val_loss: 0.1928 - val_out1_loss: 0.2157 - val_out2_loss: 0.1391
Epoch 119/120
53/53 - 5s - loss: 0.1185 - out1_loss: 0.1285 - out2_loss: 0.0951 - val_loss: 0.1928 - val_out1_loss: 0.2157 - val_out2_loss: 0.1392
Epoch 120/120
53/53 - 4s - loss: 0.1188 - out1_loss: 0.1289 - out2_loss: 0.0952 - val_loss: 0.1928 - val_out1_loss: 0.2157 - val_out

Epoch 14/120
53/53 - 4s - loss: 0.2504 - out1_loss: 0.2784 - out2_loss: 0.1852 - val_loss: 0.2512 - val_out1_loss: 0.2804 - val_out2_loss: 0.1829
Epoch 15/120
53/53 - 5s - loss: 0.2439 - out1_loss: 0.2713 - out2_loss: 0.1799 - val_loss: 0.2480 - val_out1_loss: 0.2781 - val_out2_loss: 0.1776
Epoch 16/120
53/53 - 4s - loss: 0.2397 - out1_loss: 0.2667 - out2_loss: 0.1767 - val_loss: 0.2436 - val_out1_loss: 0.2747 - val_out2_loss: 0.1710
Epoch 17/120
53/53 - 5s - loss: 0.2350 - out1_loss: 0.2608 - out2_loss: 0.1749 - val_loss: 0.2320 - val_out1_loss: 0.2582 - val_out2_loss: 0.1709
Epoch 18/120
53/53 - 5s - loss: 0.2281 - out1_loss: 0.2529 - out2_loss: 0.1701 - val_loss: 0.2303 - val_out1_loss: 0.2573 - val_out2_loss: 0.1673
Epoch 19/120
53/53 - 4s - loss: 0.2238 - out1_loss: 0.2488 - out2_loss: 0.1653 - val_loss: 0.2253 - val_out1_loss: 0.2507 - val_out2_loss: 0.1661
Epoch 20/120
53/53 - 4s - loss: 0.2148 - out1_loss: 0.2382 - out2_loss: 0.1601 - val_loss: 0.2198 - val_out1_loss: 0.2415 - 

Epoch 71/120
53/53 - 5s - loss: 0.1267 - out1_loss: 0.1379 - out2_loss: 0.1006 - val_loss: 0.1914 - val_out1_loss: 0.2140 - val_out2_loss: 0.1384
Epoch 72/120
53/53 - 4s - loss: 0.1264 - out1_loss: 0.1376 - out2_loss: 0.1004 - val_loss: 0.1915 - val_out1_loss: 0.2142 - val_out2_loss: 0.1386
Epoch 73/120
53/53 - 4s - loss: 0.1261 - out1_loss: 0.1372 - out2_loss: 0.1002 - val_loss: 0.1917 - val_out1_loss: 0.2144 - val_out2_loss: 0.1387
Epoch 74/120
53/53 - 5s - loss: 0.1256 - out1_loss: 0.1367 - out2_loss: 0.0998 - val_loss: 0.1915 - val_out1_loss: 0.2143 - val_out2_loss: 0.1384
Epoch 75/120
53/53 - 4s - loss: 0.1255 - out1_loss: 0.1365 - out2_loss: 0.0998 - val_loss: 0.1916 - val_out1_loss: 0.2144 - val_out2_loss: 0.1384
Epoch 76/120
53/53 - 4s - loss: 0.1252 - out1_loss: 0.1362 - out2_loss: 0.0996 - val_loss: 0.1915 - val_out1_loss: 0.2143 - val_out2_loss: 0.1384
Epoch 77/120
53/53 - 4s - loss: 0.1252 - out1_loss: 0.1363 - out2_loss: 0.0995 - val_loss: 0.1914 - val_out1_loss: 0.2142 - 

In [15]:
# for i, uid in enumerate(train.id):
#     single_pred = oofs_pred[i]

#     oof_df = pd.DataFrame(single_pred, columns=pred_cols)
#     oof_df['id_seqpos'] = [f'{uid}_{x}' for x in range(oof_df.shape[0])]

- GRU가 들어가면 좋은 게 long-term correlation이 있어서가 아닐까... 꼬이고 하니까
- Positional encoding 넣으면 확 뛸 것 같은데