In [114]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import json
from sklearn.model_selection import train_test_split

In [236]:
# Sequence and vocabulary configuration shared by the cells below.
MAX_PHRASE_LENGTH = 128  # label rows are cut to this many token ids
PAD_TOKEN = 59           # id that fills the tail of every phrase
MASK_TOKEN = 60          # id written over the character the model has to recover
N_UNIQUE_CHARACTERS = MASK_TOKEN + 1  # ids 0..59 plus the mask token
EMBED_DIM = 128          # width of the character / position embeddings
BATCH_SIZE = 64

In [244]:
# Character -> id table from disk, plus the inverse table used for decoding.
with open("../data/character_to_prediction_index.json", "r") as f:
    char_to_num = json.load(f)
num_to_char = {index: char for char, index in char_to_num.items()}

In [245]:
# Encoded phrases, one row per phrase, truncated to MAX_PHRASE_LENGTH columns.
data = np.load("../data/y.npy")
data = data[:, :MAX_PHRASE_LENGTH]
data.shape

(61955, 128)

In [246]:
# Peek at the first encoded phrase before any clean-up is applied.
data[0]

array([18,  0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36, 61, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59], dtype=int8)

In [247]:
# Id 61 shows up right after the last character of the sample row
# (presumably an end-of-phrase marker -- TODO confirm). Fold it into the
# padding id so that everything after the phrase is uniform padding.
data[data == 61] = PAD_TOKEN
data[0]

array([18,  0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
       59, 59, 59, 59, 59, 59, 59, 59, 59], dtype=int8)

In [96]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    """Multi-head self-attention over a ``(batch, seq, features)`` input.

    One bias-free dense layer produces the concatenated query/key/value
    projections. They are split per head, combined with scaled dot-product
    attention, merged back and projected to ``dim`` features.

    Args:
        dim: Total width shared by all heads and width of the output.
        num_heads: Number of attention heads; must divide ``dim`` evenly.
        dropout: Dropout rate applied to the attention weights.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide ``dim``.
    """

    def __init__(self, dim=256, num_heads=4, dropout=0, **kwargs):
        super().__init__(**kwargs)
        # Fail early with a clear message; an uneven split would otherwise
        # surface later as an obscure reshape/split shape error.
        if num_heads < 1 or dim % num_heads:
            raise ValueError(
                f"dim ({dim}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.dropout_rate = dropout
        # NOTE(review): scaling uses the full width rather than the per-head
        # width; kept as in the original so numerics are unchanged.
        self.scale = self.dim ** -0.5
        self.qkv = tf.keras.layers.Dense(3 * dim, use_bias=False)
        self.drop1 = tf.keras.layers.Dropout(dropout)
        self.proj = tf.keras.layers.Dense(dim, use_bias=False)
        # Stateless helpers are created once here instead of on every call.
        self.split_heads = tf.keras.layers.Reshape((-1, num_heads, 3 * self.head_dim))
        self.merge_heads = tf.keras.layers.Reshape((-1, dim))
        self.softmax = tf.keras.layers.Softmax(axis=-1)
        self.supports_masking = True

    def call(self, inputs, mask=None):
        """Apply self-attention.

        Args:
            inputs: Tensor of shape ``(batch, seq, features)``.
            mask: Optional ``(batch, seq)`` mask; it is broadcast over heads
                and query positions and passed to the softmax.

        Returns:
            Tensor of shape ``(batch, seq, dim)``.
        """
        # (batch, seq, 3*dim) -> (batch, heads, seq, 3*head_dim)
        qkv = self.split_heads(self.qkv(inputs))
        qkv = tf.transpose(qkv, perm=[0, 2, 1, 3])
        q, k, v = tf.split(qkv, [self.head_dim] * 3, axis=-1)

        attn = tf.matmul(q, k, transpose_b=True) * self.scale

        if mask is not None:
            mask = mask[:, None, None, :]

        attn = self.softmax(attn, mask=mask)
        attn = self.drop1(attn)

        # (batch, heads, seq, head_dim) -> (batch, seq, dim)
        x = tf.matmul(attn, v)
        x = self.merge_heads(tf.transpose(x, perm=[0, 2, 1, 3]))
        return self.proj(x)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update(
            {
                "dim": self.dim,
                "num_heads": self.num_heads,
                "dropout": self.dropout_rate,
            }
        )
        return config


def TransformerBlock(dim=256, num_heads=4, expand=4, attn_dropout=0.2, drop_rate=0.2, activation='swish'):
    """Return a function that applies one pre-norm transformer block.

    Args:
        dim: Feature width of the block input and output.
        num_heads: Number of heads given to ``MultiHeadSelfAttention``.
        expand: Width multiplier of the hidden feed-forward layer.
        attn_dropout: Dropout rate inside the attention layer.
        drop_rate: Dropout rate applied to each residual branch.
        activation: Activation of the hidden feed-forward layer.

    Returns:
        A callable taking a tensor and returning the tensor after the
        attention and feed-forward residual sub-blocks.
    """
    layers = tf.keras.layers

    def apply(inputs):
        # Attention sub-block: norm -> attention -> dropout -> residual add.
        normed = layers.LayerNormalization(epsilon=1e-6)(inputs)
        attended = MultiHeadSelfAttention(
            dim=dim, num_heads=num_heads, dropout=attn_dropout
        )(normed)
        # noise_shape=(None, 1, 1) shares one keep/drop decision across the
        # last two axes of each batch element.
        attended = layers.Dropout(drop_rate, noise_shape=(None, 1, 1))(attended)
        attn_out = layers.Add()([inputs, attended])

        # Feed-forward sub-block: norm -> expand -> project -> dropout -> add.
        hidden = layers.LayerNormalization(epsilon=1e-6)(attn_out)
        hidden = layers.Dense(dim * expand, use_bias=False, activation=activation)(hidden)
        hidden = layers.Dense(dim, use_bias=False)(hidden)
        hidden = layers.Dropout(drop_rate, noise_shape=(None, 1, 1))(hidden)
        return layers.Add()([attn_out, hidden])

    return apply

In [97]:
# Model input is one row of token ids. Tie it to MAX_PHRASE_LENGTH so it
# cannot drift from the position-embedding length used when building the model.
INPUT_SHAPE = [MAX_PHRASE_LENGTH]

In [98]:
def get_pos_encoding_matrix(max_len, d_emb):
    """Build a fixed sinusoidal position table of shape ``(max_len, d_emb)``.

    For position ``pos >= 1`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_emb)``; even columns store its sine and
    odd columns its cosine. Row 0 is left as all zeros.

    Args:
        max_len: Number of positions (rows).
        d_emb: Embedding width (columns).

    Returns:
        A float ``numpy.ndarray`` holding the table.
    """
    # One divisor per column; columns 2i and 2i+1 share the same frequency.
    divisors = np.array(
        [np.power(10000, 2 * (col // 2) / d_emb) for col in range(d_emb)]
    )
    positions = np.arange(max_len)[:, np.newaxis]
    pos_enc = positions / divisors  # row 0 is 0 / divisor, i.e. all zeros

    # Rows from 1 onward: sine on even columns, cosine on odd columns.
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])
    return pos_enc

In [259]:
# Per-class loss weights, all 1 except the pad class which is 0.
# NOTE(review): scce_with_ls below does not read this array; it drops pad
# positions by filtering instead.
loss_weights = np.ones(N_UNIQUE_CHARACTERS-1, dtype=np.float32)
loss_weights[PAD_TOKEN] = 0
def scce_with_ls(y_true, y_pred):
    """Label-smoothed categorical cross-entropy over non-pad positions.

    Args:
        y_true: Integer character ids; positions equal to ``PAD_TOKEN`` are
            excluded from the loss.
        y_pred: Class probabilities whose last axis has
            ``N_UNIQUE_CHARACTERS - 1`` entries (the model's classifier layer
            already applies softmax).

    Returns:
        Scalar tensor: mean loss over the kept positions.
    """
    # Keep only the positions whose target is a real character.
    idxs = tf.where(y_true != PAD_TOKEN)
    y_true = tf.gather_nd(y_true, idxs)
    y_pred = tf.gather_nd(y_pred, idxs)
    # One-hot encode the sparse targets so label smoothing can be applied.
    y_true = tf.cast(y_true, tf.int32)
    y_true = tf.one_hot(y_true, N_UNIQUE_CHARACTERS-1, axis=1)
    # The predictions are already softmax probabilities, so they must not be
    # treated as logits: doing so applies softmax twice and keeps the loss
    # stuck near log(number of classes).
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred, label_smoothing=0.25, from_logits=False)
    # NOTE(review): if every position is padding this is a mean over an empty
    # tensor.
    loss = tf.math.reduce_mean(loss)
    return loss

In [260]:
# Token ids of shape (batch, T) go in; per-position class probabilities of
# shape (batch, T, N_UNIQUE_CHARACTERS - 1) come out.
def get_model(num_blocks):
    """Build and compile the masked-character transformer.

    Args:
        num_blocks: Number of stacked ``TransformerBlock`` applications.

    Returns:
        A compiled ``tf.keras.Model`` whose classifier applies softmax over
        ``N_UNIQUE_CHARACTERS - 1`` classes (the mask token is never predicted).
    """
    layers = tf.keras.layers

    token_ids = tf.keras.Input(INPUT_SHAPE, name="ctc_masked_out")

    char_embedding = layers.Embedding(
        N_UNIQUE_CHARACTERS, EMBED_DIM, name="char_embedding"
    )(token_ids)

    # Position table initialised from the sinusoidal encoding matrix.
    position_embedding = layers.Embedding(
        input_dim=MAX_PHRASE_LENGTH,
        output_dim=EMBED_DIM,
        weights=[get_pos_encoding_matrix(MAX_PHRASE_LENGTH, EMBED_DIM)],
        name="position_embedding",
    )(tf.range(start=0, limit=MAX_PHRASE_LENGTH, delta=1))

    hidden = char_embedding + position_embedding
    for _ in range(num_blocks):
        hidden = TransformerBlock(dim=EMBED_DIM)(hidden)

    # One class fewer than the embedding vocabulary: the mask token is an
    # input-only symbol.
    probabilities = layers.Dense(
        N_UNIQUE_CHARACTERS-1, name="classifier", activation="softmax"
    )(hidden)

    model = tf.keras.Model(token_ids, probabilities)

    # Rectified Adam wrapped in Lookahead.
    optimizer = tfa.optimizers.Lookahead(
        tfa.optimizers.RectifiedAdam(sma_threshold=4),
        sync_period=5,
    )
    model.compile(loss=scce_with_ls, optimizer=optimizer)

    return model

model = get_model(6)
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 ctc_masked_out (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 char_embedding (Embedding)  (None, 128, 128)             7808      ['ctc_masked_out[0][0]']      
                                                                                                  
 tf.__operators__.add_13 (T  (None, 128, 128)             0         ['char_embedding[0][0]']      
 FOpLambda)                                                                                       
                                                                                            

## сначала восстанавливать те, в которых не уверен

In [121]:
def decode(out):
    """Convert a sequence of token ids to text.

    Ids that have no entry in ``num_to_char`` contribute nothing to the result.
    """
    return ''.join(num_to_char.get(token, '') for token in out)

In [122]:
# NOTE(review): `labels` is only assigned in a later cell, so this line fails
# on a fresh top-to-bottom run -- presumably it was executed out of order.
decode(labels[0])

'3 creekhouse'

In [248]:
# Keep an untouched copy of the token ids to serve as training targets.
labels = np.array(data, copy=True)

In [249]:
# Replace exactly one real character per phrase with MASK_TOKEN; `labels`
# keeps the unmasked targets.
# Start from the unmasked ids so re-running this cell does not mask an
# additional character each time.
data[:] = labels
mask_rng = np.random.default_rng(42)  # fixed seed: masked positions are reproducible
for i in range(len(data)):
    # Positions holding a real character (pad and mask ids are >= PAD_TOKEN).
    candidates = np.flatnonzero(data[i] < PAD_TOKEN)
    if candidates.size == 0:
        # Row with no characters: nothing to mask (a draw from an empty range
        # would raise).
        continue
    data[i, mask_rng.choice(candidates)] = MASK_TOKEN
data[0], labels[0]

(array([18,  0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 60, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59], dtype=int8),
 array([18,  0, 34, 49, 36, 36, 42, 39, 46, 52, 50, 36, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
        59, 59, 59, 59, 59, 59

In [250]:
# Hold out 10% of the phrases for validation; the fixed random_state keeps
# the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    data,
    labels,
    test_size=0.1,
    random_state=42,
)
X_train.shape, X_val.shape

((55759, 128), (6196, 128))

In [251]:
# Training pipeline: shuffle with a 1000-element buffer, then batch.
train_classifier_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_classifier_ds = train_classifier_ds.shuffle(1000).batch(BATCH_SIZE)

# Validation pipeline over the held-out split (batched, not shuffled).
test_classifier_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_classifier_ds = test_classifier_ds.batch(BATCH_SIZE)

In [261]:
# Train for five epochs, evaluating on the held-out split after each one.
model.fit(
    train_classifier_ds,
    validation_data=test_classifier_ds,
    epochs=5,
)

Epoch 1/5
 20/872 [..............................] - ETA: 8:13 - loss: 4.0918

KeyboardInterrupt: 

In [243]:
# Pull a single batch from the training pipeline to inspect shapes and dtypes.
# NOTE(review): the recorded output shows a 1-D label tensor, which does not
# match the 2-D label array built in the cells above -- presumably it comes
# from an earlier run; re-execute to refresh.
next(iter(train_classifier_ds))

(<tf.Tensor: shape=(64, 128), dtype=int8, numpy=
 array([[20, 17, 19, ..., 59, 59, 59],
        [44, 36, 51, ..., 59, 59, 59],
        [54, 54, 54, ..., 59, 59, 59],
        ...,
        [20, 24, 15, ..., 59, 59, 59],
        [49, 38, 12, ..., 59, 59, 59],
        [17, 21, 20, ..., 59, 59, 59]], dtype=int8)>,
 <tf.Tensor: shape=(64,), dtype=int8, numpy=
 array([ 0, 44, 51, 18, 34, 43, 36, 47, 45,  0, 46, 46, 15, 49, 51, 33, 46,
        43, 23, 12, 51, 40, 44, 23, 52, 32, 23, 20, 50, 21, 14, 54, 44, 21,
        19, 49, 50, 20, 51, 38, 21, 46, 15, 49, 17, 34, 12, 36, 14, 50, 36,
        12, 21, 49, 46, 23, 49, 21, 19, 54, 22, 50, 35,  0], dtype=int8)>)