In [1]:
# Randomness is required while using text generation so that the answers generated by the model are less predictable and more creative
# However, there should be a balance in the randomness ( low randomness means predictable boring answers, high randomness means hypothetical creative answers which don't make any sense, so we need a intermediate randomness)
# Softmax is used to predict the next word using the probability of the words ( By introducing randomness, if a word has 0.4 , then the word will be chosen 40 percent of the time)
# To control the randomness , we use a term called "softmax temperature" - 0.0 to 1.0 range --> 0 means low entropy i.e boring answers or no randomness, 1.0 means high entropy i.e hypothetical creative answers

In [2]:
# How the softmax temperature function works

import numpy as np

def reweight_distribution(original_distribution,softmax_temperature=0.5): # Original distribution is 1D numpy array where probabilities sum up to 1 ( probability of words to be the next word in the sentence)

    new_distribution = np.log(original_distribution) / softmax_temperature
    new_distribution = np.exp(new_distribution)
    return new_distribution / np.sum(new_distribution) # Returns a reweighted version of the original distribution( The sum might not be equal to 1, so to make it equal to 1 , we divide by the sum)

In [3]:
import tensorflow as tf
import keras

In [4]:
# Using IMDB dataset

# Downloading the dataset
#dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz",
#                                  origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
#                                  extract=True)
IMDB_DATASET = r'C:\Users\arany\.keras\datasets\aclimdb'

dataset = keras.utils.text_dataset_from_directory(directory=IMDB_DATASET,label_mode=None,batch_size=256) # Since we need a huge amount of data and we dont want any classification tasks, so we dont need the labels and classes(label_mode = None)

dataset = dataset.map(lambda x : tf.strings.regex_replace(x,"<br />"," ")) # Removing the <br> html tags present in the reviews(Since we are just generating words, so not needed)

Found 100006 files belonging to 1 classes.


In [5]:
from keras.layers import TextVectorization

In [6]:
# Text Vectorization ( creating the vocabulary for text generation)

sequence_length = 100
vocab_size = 15000 # Top 15000 frequent words are used, else treated as [UNK]

text_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length
)

text_vectorization.adapt(dataset) # Using only the text reviews we extracted earlier to adapt our vocabulary

In [7]:
# We now create a layer where the input sequence will the input itself as a tensor, but the target will be offset by 1(since we are training our model for generating the next word)

def prepare_lm_dataset(text_batch):
    vectorized_sequence = text_vectorization(text_batch) # Converting words to vector
    source_sequence = vectorized_sequence[:,:-1] # Creating inputs by cutting off the last word of the sequence
    target_sequence = vectorized_sequence[:,1:] # Creating targets by offsetting the sequences by 1
    return source_sequence, target_sequence

lm_dataset = dataset.map(prepare_lm_dataset,num_parallel_calls=4)

### Why do we do offsetting and cutting off in inputs and target

### Suppose the below matrix representes the vectorization of the words in the sequence

vectorized_sequence = [
    [0.1, 0.2, 0.3, 0.4],  # The
    [0.5, 0.6, 0.7, 0.8],  # cat
    [0.9, 1.0, 1.1, 1.2],  # sat
    [1.3, 1.4, 1.5, 1.6],  # on
    [1.7, 1.8, 1.9, 2.0],  # the
    [2.1, 2.2, 2.3, 2.4],  # mat
    [0.1, 0.2, 0.3, 0.4],  # The
    [2.5, 2.6, 2.7, 2.8],  # dog
    [2.9, 3.0, 3.1, 3.2],  # barked
    [3.3, 3.4, 3.5, 3.6]   # loudly
]

source_sequence = [
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
    [1.3, 1.4, 1.5, 1.6],
    [1.7, 1.8, 1.9, 2.0],
    [0.1, 0.2, 0.3, 0.4],
    [2.5, 2.6, 2.7, 2.8],
    [2.9, 3.0, 3.1, 3.2]
]

target_sequence = [
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
    [1.3, 1.4, 1.5, 1.6],
    [1.7, 1.8, 1.9, 2.0],
    [2.1, 2.2, 2.3, 2.4],
    [2.5, 2.6, 2.7, 2.8],
    [2.9, 3.0, 3.1, 3.2],
    [3.3, 3.4, 3.5, 3.6]
]

### So, the line 1 of source_sequence is passed to model as input, the 1st line of target_sequence will be passed as output i.e the next word in the sentence
### So each line number 'X' or Xth word being passed from source sequence to model has line number 'X' in target sequence which in turn is the X + 1th word in the sentence
### So at each input and target pair, the previous word is the input and the next word is the output
### Thats why we offset the target sequence by 1
### We cut of the last word in input sequence because there is no next word to be learned by the model 

In [8]:
# We use sequence to sequence modelling with the help of encoder and decoder as used previously,, but here in case of text generation, we wont have any source sequence.
# We are just trying to predict the next tokens using the past tokens with help of decoder
# Causal padding helps in decoder only looking in the 0 to Nth token/ words to predict the N + 1 th token

In [9]:
# Since there is no source sequence , we will only be using the decoder part ( decoder only model)
# Also we will be using positional embedding from before

In [10]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,sequence_length,input_dim,output_dim,**kwargs): # The sequence length needs to be known because we need to use that as input dimension for the Positional embedding
        super().__init__()
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.token_embeddings = Embedding(input_dim=input_dim,output_dim=output_dim)
        self.positional_embeddings = Embedding(input_dim=sequence_length,output_dim=output_dim)

    def call(self,inputs):
        length = tf.shape(inputs)[-1] # Retreiving the length of the sequence
        positions = tf.range(start=0,limit=length,delta=1) # List of number positions (1,2,3,4.....length of the sentence)
        embedded_tokens = self.token_embeddings(inputs) # Word embeddings
        embedded_positions = self.positional_embeddings(positions) # Position embeddings
        return embedded_tokens + embedded_positions # Adding word and position embeddings

    def compute_mask(self,inputs,mask=None): # Creating a mask to be able to ignore the zero paddings
        return tf.math.not_equal(inputs,0)

    def get_config(self): # Created so that we can use this custom class later as a layer
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
            "output_dim": self.output_dim,
        })
        return config   

In [11]:
class TransformerDecoderClass(tf.keras.layers.Layer):
    def __init__(self,embed_dim,dense_dim,num_heads,**kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.attention_layer1 = MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
        self.attention_layer2 = MultiHeadAttention(num_heads=num_heads,key_dim=embed_dim)
        self.dense_block = Sequential([
            Dense(units=dense_dim,activation="relu"),
            Dense(units=embed_dim)
        ])
        self.layer_norm1 = LayerNormalization()
        self.layer_norm2 = LayerNormalization()
        self.layer_norm3 = LayerNormalization()
        self.supports_masking = True # Ensures that layer will propogate its input mask to its output

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads
        })
        return config
    
    # Causal padding ensures that during self-attention calculations in the transformer, each token only attends to the previous tokens in the sequence, not the future ones.
    def get_causal_attention_mask(self,inputs): # Causual padding implementation ( Since the transformer model has access to the whole sequence , so that it doesn't directly copy while predicting the N+1 token, we pad the future elements in the sequence)
        input_shape = tf.shape(inputs)
        batch_size , sequence_length = input_shape[0] , input_shape[1]
        # These lines generate two tensors i and j, where i represents a range from 0 to sequence_length - 1 along columns and j represents the same range along rows.
        i = tf.range(sequence_length)[:,tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j,dtype="int32") # This line creates a mask where each element is 1 if the corresponding element in i is greater than or equal to the corresponding element in j, and 0 otherwise. 
        # This ensures that each token only attends to itself and the previous tokens, not the future ones.
        mask = tf.reshape(mask,(1,input_shape[1],input_shape[1])) # Reshaping done so that the mask has the correct dimensions to be compatible with the subsequent tiling operation and matches the shape expected by the attention mechanism in the transformer model.
        mult = tf.concat(
            [tf.expand_dims(batch_size,-1), # This line helps us determine how many times the mask matrix will be repeated during tiling
             tf.constant([1,1],dtype="int32")],axis=0) # Here [1,1] means that the mask matrix will not be tilled in other dimensions
        return tf.tile(mask,mult) # Tiling process means copying the mask matrix for different dimensions(here only 1 dimension whose number of times to be replicated depends on the batch size)
    
    def call(self,inputs,encoder_outputs,mask=None): # inputs is the target sequence provided to decoder as input, encoder_inputs is the representation of the source sequence of the encoder
        causal_mask = self.get_causal_attention_mask(inputs) # Retreiving the causal mask
        
        # If a padding mask is provided, it's first cast to an integer type and expanded to match the shape of the causal mask. 
        #Then, a minimum operation is performed element-wise between the padding mask and the causal mask. 
        #This step ensures that the model doesn't attend to the padded elements during the attention calculation.
        if mask is not None:
            padding_mask = tf.cast(mask[:,tf.newaxis,:],dtype="int32") # Preparing the input mask which describes the padding locations in the target sequence
            padding_mask = tf.minimum(padding_mask,causal_mask) # Merging the masks together
        
        # Attention layer 1 has only the inputs sent to the decoder, so the inputs will be the query, key and value for the layer
        # Causal mask only applied here because the model only has the source sequence 
        attention_output_1 = self.attention_layer1(query=inputs,key=inputs,value=inputs,attention_mask=causal_mask) # Pass the causal mask to the first attention layer, which performs self attention over target sequence        
        attention_output_1 = self.layer_norm1(inputs + attention_output_1) # Applying layer normalization and residual connection
        
        # Attention layer 2 has the attention scores and outputs from the previous attention layer which will be the query here, and the outputs sent by the encoder will be the value and key here ( since we are using context information from the encoder as the key and corresponding values to predict the next token)
        # Padding mask is used since the model has both target and source sequence here
        attention_output_2 = self.attention_layer2(query=attention_output_1,key=encoder_outputs,value=encoder_outputs,attention_mask=padding_mask) # Pass the padding mask to the second attention layer, which relates the source sequence to the target sequence
        attention_output_2 = self.layer_norm2(attention_output_1 + attention_output_2) # Applying layer normalization and residual connection
        
        proj_output = self.dense_block(attention_output_2) # Dense layer block
        return self.layer_norm3(attention_output_2 + proj_output) # Apply layer normalization and residual connection


In [12]:
from keras.layers import Input,Dense,Embedding,MultiHeadAttention,LayerNormalization
from keras.models import Model
from keras import Sequential

In [13]:
embed_dim = 256
dense_dim = 1024
num_heads = 2

input_layer = Input(shape=(None,),dtype="int64")
positional_embedding_layer = PositionalEmbedding(sequence_length=sequence_length,input_dim=vocab_size,output_dim=embed_dim)(input_layer)
decoder_layer = TransformerDecoderClass(embed_dim=embed_dim,dense_dim=dense_dim,num_heads=num_heads)(positional_embedding_layer,positional_embedding_layer) # Passing the positional embedding layer twice in the decoder layer is necessary for allowing the self-attention mechanism to consider both the input tokens and their positional information when generating the output sequence in your text generation model
# Since we don't have a source sequence and an encoder, so we need to get both the positional information and the previous tokens both from positional embedding
output_layer = Dense(vocab_size,activation="softmax")(decoder_layer) # Vocab_size and softmax activation so that we get a probability distribution of each word present in the vocabulary that it can be the next token

decoder_model = Model(input_layer,output_layer)
decoder_model.compile(optimizer="rmsprop",loss="sparse_categorical_crossentropy")

In [14]:
def get_vocabulary(self): # Function created to return the dictionary ( Facing 'utf-8' decode issue while using get_vocabulary method for text vectorization)
        # _layer.get_vocabulary
        keys, values = self._lookup_layer.lookup_table.export()
        # print(self._lookup_layer.lookup_table.export())
        vocab = []
        for i in keys : 
            try :
                vocab.append(i.numpy().decode('utf-8'))
            except :
                vocab.append(i.numpy().decode('ISO-8859-1'))
        return vocab

In [15]:
tokens_index = dict(enumerate(get_vocabulary(text_vectorization))) # We are creating a dictionary that maps the word indices to strings(used for text decoding)
def sample_next(predictions, temperature=1.0): # Function for implementing variable temperature sampling from a probability distribution ( the first code block in the notebook)
    predictions = np.asarray(predictions).astype("float64")
    predictions = np.log(predictions) / temperature
    exp_preds = np.exp(predictions)
    predictions = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, predictions, 1)
    return np.argmax(probas)

class TextGenerator(keras.callbacks.Callback): # We will use this as a callback to fit our model
    def __init__(self,
        prompt, # prompt that will be model input for text generation
        generate_length, # number of words to generate
        model_input_length, # length of the inputs we used to train the model
        temperatures=(1.,), # range of temperatures
        print_freq=1,
        model=decoder_model): # Our decoder model mentioned here
        self.prompt = prompt
        self.generate_length = generate_length
        self.model_input_length = model_input_length
        self.temperatures = temperatures
        self.print_freq = print_freq

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.print_freq != 0:
            return
        for temperature in self.temperatures:
            print("== Generating with temperature", temperature)
            sentence = self.prompt # our prompt(input for text generation) will be the initial part of the final output sequence 
            for i in range(self.generate_length):
                tokenized_sentence = text_vectorization([sentence])
                predictions = self.model(tokenized_sentence) # next word prediction probability distribution by the model
                next_token = sample_next(predictions[0, i, :]) # using the prediction of the model with the temperature range to get the next token
                sampled_token = tokens_index[next_token]
                sentence += " " + sampled_token # Adding the next word in the sentence ( which acts as the input prompt for the next epoch)
            print(sentence)

prompt = "This movie" # Initial prompt
text_gen_callback = TextGenerator( #Callback function will call this Text Generator class
prompt,
generate_length=50,
model_input_length=sequence_length,
temperatures=(0.2, 0.5, 0.7, 1., 1.5)) # Range of temperatures

In [17]:
decoder_model.fit(lm_dataset,epochs=2,callbacks=[text_gen_callback])

Epoch 1/2
This movie defy criteria glamorous consequences preaching construct nightmare iron thornton illegal gigantic awe warrant consequences locke annoys proved fantastically captive jacqueline door scheming portraits warrant nightmare locke maiden warrant paintings locke locke confronts demand blonde thwarted warrant suffered glamorous locke meg suffered certain futuristic ole corridors somethings skin witherspoon warrant groan
== Generating with temperature 0.5
This movie loud dutch shapes stalking notices duplicate beggars 16mm evie confined unreal consequences of pretensions wade eager updated consequences skin 16mm fuss siege ran suffered drawback misfire former locke misfortune criteria finland thwarted spencer suffered destined bandits consequences handsome skin skin former politician climate bettany biko wakes addresses sorority david agents
== Generating with temperature 0.7
This movie bimbos criteria india criteria forwarded thwarted wholly bye peer magnetic reflections ma

<keras.callbacks.History at 0x1df5a96d5d0>