# Infer Transformer


The training code is [here](https://www.kaggle.com/takamichitoda/ump-train-transformer-on-tpu?scriptVersionId=86363969), and the standard scaler model is [here](https://www.kaggle.com/takamichitoda/ump-npy-dataset).

In [None]:
import gc
import pickle
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import ubiquant

# Report whether TensorFlow can see a GPU. Exactly one of the two messages is
# printed: the original code printed "Found GPU at:" unconditionally, so a
# missing GPU produced both messages, the second with an empty device name.
device_name = tf.test.gpu_device_name()
if "GPU" in device_name:
    print('Found GPU at: {}'.format(device_name))
else:
    print("GPU device not found")

In [None]:
class GCF:
    """Notebook-wide configuration constants.

    The class is used purely as a namespace; it is never instantiated in
    this notebook.
    """

    # --- Input locations ---
    # Directory that holds the trained model file.
    MODEL_ROOT = "/kaggle/input/ump-train-transformer-on-tpu"
    # Pickled scaler applied to the features before prediction.
    SCALER_PATH = "/kaggle/input/ump-npy-dataset/std_scaler.pkl"

    # --- Data layout ---
    N_FOLDS = 5
    # Feature column names "f_0" .. "f_299".
    FEAT_COLS = [f"f_{col_idx}" for col_idx in range(300)]

    # --- Transformer hyperparameters ---
    EMBED_DIM = 32  # 64 // 2
    N_HEAD = 8
    FF_DIM = 64  # 128 // 2
    DROPOUT = 0.0
    N_BLOCK = 4

In [None]:
# Module-level feature count; it is bound as the default value of the
# `feat_dim` parameter of TransformerBlock.__init__ below.
feat_dim = 300
# https://www.kaggle.com/pratikskarnik/riiid-keras-transformer-starter
class MultiHeadSelfAttention(layers.Layer):
    """Multi-head scaled dot-product self-attention.

    The input is projected to queries, keys and values by three dense layers,
    split into ``num_heads`` heads, attended per head, re-assembled and passed
    through a final dense layer.

    Args:
        embed_dim: Width of the query/key/value/output projections. Must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads.
        **kwargs: Base ``Layer`` arguments (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the parent constructor.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    # Keys that earlier revisions of get_config() wrote into the config. They
    # are not constructor arguments, so they are discarded before the
    # remaining kwargs reach the base Layer constructor (which rejects unknown
    # keywords). This keeps models saved with the old config loadable.
    _LEGACY_CONFIG_KEYS = (
        "projection_dim",
        "query_dense",
        "key_dense",
        "value_dense",
        "combine_heads",
    )

    def __init__(self, embed_dim, num_heads=8, **kwargs):
        for legacy_key in self._LEGACY_CONFIG_KEYS:
            kwargs.pop(legacy_key, None)
        # Forward name/dtype/trainable so they survive a config round-trip.
        super(MultiHeadSelfAttention, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        # Per-head width after the embedding is split across heads.
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Return ``(output, weights)`` of scaled dot-product attention.

        Scores are ``query @ key^T`` divided by the square root of the size of
        the last axis of ``key``; a softmax over the last axis turns them into
        the weights applied to ``value``.
        """
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, num_heads, seq_len, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs``.

        Args:
            inputs: Tensor of shape (batch_size, seq_len, embedding_dim).

        Returns:
            Tensor of shape (batch_size, seq_len, embed_dim).
        """
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        # Each of the three becomes (batch_size, num_heads, seq_len, projection_dim).
        query = self.separate_heads(query, batch_size)
        key = self.separate_heads(key, batch_size)
        value = self.separate_heads(value, batch_size)
        # The attention weights are computed but not returned by this layer.
        attention, _ = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        return self.combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this layer.

        Only ``embed_dim`` and ``num_heads`` are added to the base config.
        The sub-layers and ``projection_dim`` are derived from them in
        ``__init__``; they are not valid constructor arguments and live layer
        objects are not plain serializable values.
        """
        config = super(MultiHeadSelfAttention, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Both sub-blocks are followed by dropout, a residual addition and layer
    normalization.

    Args:
        embed_dim: Width of the attention projections and of the feed-forward
            output. Inputs are expected to have this size on their last axis
            so the residual additions line up.
        feat_dim: Unused by the layer itself; kept for interface
            compatibility and stored so it round-trips through the config.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Base ``Layer`` arguments (e.g. ``name``, ``dtype``,
            ``trainable``), forwarded to the parent constructor.
    """

    # Keys that earlier revisions of get_config() wrote into the config. They
    # are not constructor arguments, so they are discarded before the
    # remaining kwargs reach the base Layer constructor (which rejects unknown
    # keywords). This keeps models saved with the old config loadable; such
    # models carry no hyperparameters and are rebuilt with the defaults.
    _LEGACY_CONFIG_KEYS = (
        "att",
        "ffn",
        "layernorm1",
        "layernorm2",
        "dropout1",
        "dropout2",
    )

    def __init__(self, embed_dim=GCF.EMBED_DIM, feat_dim=feat_dim, num_heads=GCF.N_HEAD, ff_dim=GCF.FF_DIM, rate=GCF.DROPOUT, **kwargs):
        for legacy_key in self._LEGACY_CONFIG_KEYS:
            kwargs.pop(legacy_key, None)
        # Forward name/dtype/trainable so they survive a config round-trip.
        super(TransformerBlock, self).__init__(**kwargs)
        # Keep the hyperparameters so get_config() can report them.
        self.embed_dim = embed_dim
        self.feat_dim = feat_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadSelfAttention(num_heads=num_heads, embed_dim=embed_dim)
        # The feed-forward net projects back to embed_dim so its output can be
        # added to the attention branch.
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="gelu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Run the block on ``inputs``.

        Args:
            inputs: Input tensor passed to the self-attention layer.
            training: Forwarded to both dropout layers to select training or
                inference behaviour.

        Returns:
            The layer-normalized sum of the attention branch output and the
            feed-forward output.
        """
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments needed to rebuild this block.

        The sub-layers are recreated from these values in ``__init__``, so
        they are not part of the config.
        """
        config = super(TransformerBlock, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "feat_dim": self.feat_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:
# Restore the saved network from its HDF5 file. The custom layer classes are
# registered under their own class names so the loader can resolve them, and
# compile=False skips restoring any compile-time (optimizer/loss) state.
model = tf.keras.models.load_model(
    f"{GCF.MODEL_ROOT}/ump_transformer_all_train.h5",
    custom_objects={
        layer_cls.__name__: layer_cls
        for layer_cls in (MultiHeadSelfAttention, TransformerBlock)
    },
    compile=False,
)
model.summary()

In [None]:
# Load the pickled scaler. The file is opened in a `with` block so the handle
# is closed as soon as unpickling finishes (the original left it open).
# NOTE(review): unpickling can execute arbitrary code — this assumes the file
# at GCF.SCALER_PATH comes from a trusted dataset; confirm.
with open(GCF.SCALER_PATH, "rb") as scaler_file:
    scaler = pickle.load(scaler_file)
scaler

In [None]:
%%time
# Create the competition environment and the iterator that yields one
# (test frame, sample-submission frame) pair per step.
env = ubiquant.make_env()
iter_test = env.iter_test()
for test_df, sample_prediction_df in iter_test:
    # Scale the feature columns with the unpickled scaler.
    x = scaler.transform(test_df[GCF.FEAT_COLS].values)
    # Insert a length-1 axis at position 1 before feeding the model.
    batch = np.expand_dims(x, axis=1)

    with tf.device('/GPU:0'):
        # Call the model 100 times with training=True, average the stacked
        # outputs over the repetition axis, then keep index 0 of axis 1.
        passes = [model(batch, training=True).numpy() for _ in range(100)]
        pred = np.stack(passes).mean(0)[:, 0]

    # Write the predictions into the submission frame and hand it back.
    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df)

In [None]:
# Bare expression: in a notebook this displays the last submission frame
# bound by the prediction loop.
sample_prediction_df