In [20]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Layer
from tensorflow.keras import Input
from tensorflow.keras import Sequential
from tensorflow.keras import Model
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.layers import LayerNormalization
from tensorflow.keras.callbacks import Callback


# Read the text files under dataset/aclImdb as unlabeled batches of 16 strings,
# then replace every literal "<br />" in the text with a single space.
dataset = tf.keras.utils.text_dataset_from_directory(
    directory="dataset/aclImdb",
    label_mode=None,
    batch_size=16,
).map(lambda review: tf.strings.regex_replace(review, "<br />", " "))

Found 100006 files belonging to 1 classes.


In [21]:
# Tokenizer hyperparameters: vocabulary cap and fixed output length (in tokens).
vocab_size = 15000
sequence_length = 100

# Maps raw strings to integer token ids, padded/truncated to `sequence_length`.
text_vectorization = TextVectorization(
    output_mode="int",
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the text dataset.
text_vectorization.adapt(dataset)


In [22]:
def prepare_lm_dataset(text_batch):
    """Turn a batch of raw strings into (inputs, targets) for next-token training.

    The batch is vectorized with the module-level `text_vectorization`; the
    inputs are every token but the last, and the targets are the same
    sequences shifted left by one position.
    """
    token_ids = text_vectorization(text_batch)
    model_inputs, shifted_targets = token_ids[:, :-1], token_ids[:, 1:]
    return model_inputs, shifted_targets


# Language-modeling dataset of (inputs, targets) pairs.
lm_dataset = dataset.map(prepare_lm_dataset, num_parallel_calls=4)

In [23]:
class PositionalEmbedding(Layer):
    """Adds a learned position embedding to a learned token embedding.

    Args:
        sequence_length: number of rows in the position embedding table.
        input_dim: number of rows in the token embedding table.
        output_dim: width of both embedding tables.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = Embedding(
            input_dim=input_dim, output_dim=output_dim
        )
        self.position_embeddings = Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Return token embeddings plus the embeddings of positions 0..length-1."""
        # The last axis of `inputs` gives the number of positions to embed.
        num_positions = tf.shape(inputs)[-1]
        position_ids = tf.range(num_positions)
        return (
            self.token_embeddings(inputs)
            + self.position_embeddings(position_ids)
        )

    def compute_mask(self, inputs, mask=None):
        """Mark every entry of `inputs` that is not 0 as valid."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }


In [24]:
class TransformerDecoder(Layer):
    """Decoder block: causal self-attention, a second attention over
    `encoder_outputs`, and a two-layer dense projection, each followed by a
    residual connection and layer normalization.

    Args:
        embed_dim: width of the inputs; also used as `key_dim` of both
            attention layers and as the output width of the dense projection.
        dense_dim: width of the hidden (ReLU) layer of the dense projection.
        num_heads: number of heads in both attention layers.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = Sequential(
            [Dense(dense_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm_1 = LayerNormalization()
        self.layernorm_2 = LayerNormalization()
        self.layernorm_3 = LayerNormalization()
        # Let Keras pass the mask of the incoming tensor to `call`.
        self.supports_masking = True

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build an int32 lower-triangular mask of shape (batch, seq, seq).

        Entry [b, i, j] is 1 when i >= j (position i may attend to position j)
        and 0 otherwise. Batch and sequence sizes are taken from the first two
        dimensions of `inputs`.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, sequence_length, sequence_length))
        # Repeat the single (1, seq, seq) mask once per batch element.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: target-side sequence; used as query, key and value of the
                first attention layer.
            encoder_outputs: sequence used as key and value of the second
                attention layer.
            mask: optional padding mask for `inputs`. When given, it is
                combined with the causal mask for the second attention layer.

        Returns:
            The layer-normalized output of the dense projection sub-block.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # No padding information: minimum(all-ones, causal) == causal, so
            # use the causal mask alone. Previously this path left
            # `padding_mask` (and `attention_output_1`) unbound.
            padding_mask = causal_mask
        # Self-attention must run whether or not a padding mask was supplied.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

In [26]:
# Model hyperparameters.
embed_dim, latent_dim, num_heads = 256, 2048, 2

# Token ids -> positional embedding -> one decoder block -> per-position
# softmax over the vocabulary. The embedded sequence is passed to the decoder
# both as its inputs and as its `encoder_outputs`.
inputs = Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(
    sequence_length=sequence_length,
    input_dim=vocab_size,
    output_dim=embed_dim,
)(inputs)
x = TransformerDecoder(
    embed_dim=embed_dim,
    dense_dim=latent_dim,
    num_heads=num_heads,
)(x, x)
outputs = Dense(vocab_size, activation="softmax")(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy")

In [27]:
# Smoke test: run the inputs of a single (inputs, targets) batch through the
# model and print the predicted distributions.
# Note: `i` stays bound to this batch at notebook scope after the loop ends.
for i in lm_dataset.take(1):
    print(model(i[0]))

tf.Tensor(
[[[5.6872628e-05 7.7745863e-05 6.0421706e-05 ... 6.7597241e-05
   5.0602208e-05 6.4274202e-05]
  [8.2555176e-05 7.1145092e-05 7.0926937e-05 ... 8.4092244e-05
   5.6250094e-05 6.0516690e-05]
  [4.2434371e-05 6.0013474e-05 6.8193382e-05 ... 6.1311664e-05
   5.8396021e-05 7.2625560e-05]
  ...
  [8.9070498e-05 7.4948475e-05 7.5644319e-05 ... 7.3166972e-05
   6.5437118e-05 9.8309480e-05]
  [6.7717468e-05 7.5312761e-05 8.1382605e-05 ... 7.2559422e-05
   5.5656012e-05 5.9775815e-05]
  [8.0875354e-05 5.5514058e-05 6.4606735e-05 ... 5.3243733e-05
   7.9286787e-05 5.7814676e-05]]

 [[5.1716728e-05 7.8366524e-05 7.6930839e-05 ... 5.3837302e-05
   5.1387127e-05 5.1262221e-05]
  [8.8077264e-05 8.4596832e-05 9.1060028e-05 ... 7.7747274e-05
   7.9454992e-05 4.0142681e-05]
  [5.0106588e-05 5.0674429e-05 7.2785704e-05 ... 5.7358291e-05
   5.5386674e-05 4.9088201e-05]
  ...
  [6.8918343e-05 9.5390904e-05 7.4481286e-05 ... 6.1743151e-05
   7.7927391e-05 7.5718817e-05]
  [5.0077619e-05 6.043408

In [28]:
import numpy as np

# Lookup table from integer token id to its vocabulary string.
tokens_index = {
    token_id: token
    for token_id, token in enumerate(text_vectorization.get_vocabulary())
}
 
def sample_next(predictions, temperature=1.0):
    """Sample a token index from a probability vector, reweighted by temperature.

    Args:
        predictions: 1-D array-like of probabilities over the vocabulary.
        temperature: softmax temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it. A temperature of exactly
            0 selects the most likely token (the limit of sharpening).

    Returns:
        The index of the sampled token.
    """
    predictions = np.asarray(predictions).astype("float64")
    if temperature == 0:
        # Dividing by zero would yield NaNs; take the greedy limit instead.
        return np.argmax(predictions)
    # log(0) is a legitimate -inf here (zero-probability token), not an error.
    with np.errstate(divide="ignore"):
        scaled_logits = np.log(predictions) / temperature
    # Shift so the largest logit is 0: without this, low temperatures make
    # every exp() underflow to 0 and the normalization becomes 0/0.
    scaled_logits = scaled_logits - np.max(scaled_logits)
    exp_preds = np.exp(scaled_logits)
    probabilities = exp_preds / np.sum(exp_preds)
    # One multinomial draw gives a one-hot row; argmax recovers the index.
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)
 
class TextGenerator(Callback):
    """Callback that prints text sampled from the model at the end of epochs.

    Args:
        prompt: seed string every generated sentence starts from.
        generate_length: maximum number of tokens to append to the prompt.
        model_input_length: maximum number of tokens in the model's input
            window; generation stops once the tokenized sentence fills it.
        temperatures: sampling temperatures; one sentence is generated and
            printed per temperature.
        print_freq: generate every `print_freq` epochs.
    """

    def __init__(self,
            prompt,
            generate_length,
            model_input_length,
            temperatures=(1.,),
            print_freq=1):
        super().__init__()
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print one sentence per configured temperature."""
        if (epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            print("== Generating with temperature", temperature)
            sentence = self.prompt
            for _ in range(self.generate_length):
                tokenized_sentence = text_vectorization([sentence])
                # Count the real (non-padding, id != 0) tokens in the sentence.
                num_tokens = int((tokenized_sentence[0].numpy() != 0).sum())
                if num_tokens >= self.model_input_length:
                    # Window is full: further tokens could not be fed back in.
                    break
                predictions = self.model(tokenized_sentence)
                # The next-token distribution lives at the position of the
                # last real token, not at the loop counter.
                last_position = max(num_tokens - 1, 0)
                next_token = sample_next(
                    predictions[0, last_position, :], temperature)
                sampled_token = tokens_index[next_token]
                sentence += " " + sampled_token
            print(sentence)
# Seed text and callback used to print sample generations during training.
prompt = "This movie"
text_gen_callback = TextGenerator(
    prompt=prompt,
    generate_length=50,
    model_input_length=sequence_length,
    temperatures=(0.2, 0.5, 0.7, 1.0, 1.5),
)

In [29]:
# The model was assembled with the functional API, so it is already built.
# The former `model.build(i[0].shape)` call (which relied on a loop variable
# leaked from another cell) and the discarded `model.get_weights()` call had
# no effect and were removed.
model.summary()
# Train on the language-modeling dataset, printing sample generations through
# the text generation callback.
model.fit(lm_dataset, epochs=200, callbacks=[text_gen_callback])

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_4 (Positi  (None, None, 256)   3865600     ['input_5[0][0]']                
 onalEmbedding)                                                                                   
                                                                                                  
 transformer_decoder_4 (Transfo  (None, None, 256)   2104576     ['positional_embedding_4[0][0]', 
 rmerDecoder)                                                     'positional_embedding_4[0][0]'] 
                                                                                            

Epoch 5/200

KeyboardInterrupt: 

In [30]:
# Persist the model under the path "my_model".
model.save('my_model')



INFO:tensorflow:Assets written to: my_model\assets


INFO:tensorflow:Assets written to: my_model\assets


In [42]:
# Extend a seed sentence by up to 15 sampled tokens.
sentence = "the movie so cool"
for i in range(15):
    tokenized_sentence = text_vectorization([sentence])
    # Count the real (non-padding, id != 0) tokens currently in the sentence.
    num_tokens = int((tokenized_sentence[0].numpy() != 0).sum())
    if num_tokens >= sequence_length:
        # The vectorizer would truncate anything longer; stop generating.
        break
    predictions = model(tokenized_sentence)
    # Sample from the distribution at the last real token; indexing with the
    # loop counter read predictions for positions inside the prompt instead.
    last_position = max(num_tokens - 1, 0)
    next_token = sample_next(predictions[0, last_position, :])
    sampled_token = tokens_index[next_token]
    sentence += " " + sampled_token
print(sentence)

the movie so cool two is wonderful plot of great excellent bit women humor acting of in as in


In [43]:
# Also write the model to the file "my_model.h5".
# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 format, and
# reloading will likely need `custom_objects` for PositionalEmbedding and
# TransformerDecoder — confirm against the Keras version in use.
model.save('my_model.h5')