Predict the next set of characters using the previous characters
---------------------------------------------------------------

In [None]:
pip install trax

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import trax
import trax.fastmath.numpy as np
import pickle
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl

# set random seed
rnd.seed(32)

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
%ls

[0m[01;34mgdrive[0m/  [01;34msample_data[0m/


In [None]:
%pwd

'/content'

In [None]:
%cd gdrive/MyDrive/GRU

/content/gdrive/MyDrive/GRU


In [None]:
%ls

[0m[01;34mdata[0m/


In [None]:
dirname = 'data/'
lines = []  # every non-empty line of the corpus, stripped and lowercased

# os.listdir returns names in arbitrary order; sort them so the order of
# `lines` (and therefore anything selected from it by position, such as a
# holdout slice) is the same on every run and every filesystem.
for filename in sorted(os.listdir(dirname)):
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(os.path.join(dirname, filename), encoding='utf-8') as corpus_file:
        for line in corpus_file:
            # remove leading and trailing whitespace
            pure_line = line.strip()

            # keep only non-empty lines, normalised to lowercase
            if pure_line:
                lines.append(pure_line.lower())

In [None]:
# Sanity-check the loaded corpus: total number of lines plus two sample lines.
# Indexing position 999 assumes at least 1000 lines were read.
n_lines = len(lines)
print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 the comedy of errors
Sample line at position 999 an ell and three quarters, will not measure her from


In [None]:
# Lowercase every stored line in place. The loading cell already appends
# lowercased text, so this pass leaves the data unchanged; `n_lines` is the
# count computed in the previous cell.
for idx in range(len(lines)):
    lines[idx] = lines[idx].lower()

print(f"Number of lines: {n_lines}")
print(f"Sample line at position 0 {lines[0]}")
print(f"Sample line at position 999 {lines[999]}")

Number of lines: 125097
Sample line at position 0 the comedy of errors
Sample line at position 999 an ell and three quarters, will not measure her from


In [None]:
# Hold out the last 1000 lines for validation and train on everything before them.
# NOTE: `lines` is rebound to the shortened list, so this cell is not idempotent:
# running it a second time strips another 1000 lines from the training set and
# replaces `eval_lines` with a different slice.
eval_lines = lines[-1000:] # Create a holdout validation set
lines = lines[:-1000] # Leave the rest for training

print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 124097
Number of lines for validation: 1000


### Convert a Line to Tensor

In [None]:
def line_to_tensor(line, EOS_int=1):
    """Encode a line of text as a list of integers terminated by an end marker.

    Args:
        line (str): A single line of text.
        EOS_int (int, optional): End-of-sentence integer appended after the
            characters. Defaults to 1.
    Returns:
        list: the unicode code point of every character in `line`, in order,
        followed by `EOS_int`.
    """
    # ord() maps each character to its unicode integer; the end-of-sentence
    # marker is always the final element, even for an empty line.
    return [ord(ch) for ch in line] + [EOS_int]

In [None]:
# Testing the output
line_to_tensor('abc xyz')

[97, 98, 99, 32, 120, 121, 122, 1]

### Batch Generator

In [None]:
def data_generator(batch_size, max_length, data_lines, line_to_tensor=line_to_tensor, shuffle=True):
    """Generator function that yields padded batches of data, forever.

    The generator walks over `data_lines` (in shuffled or original order),
    skips every line that is not strictly shorter than `max_length`, and emits
    a batch each time `batch_size` usable lines have been collected. When the
    end of `data_lines` is reached it starts over (reshuffling if requested),
    so it never terminates on its own. A partially filled batch is carried
    across that restart, so a batch may contain the same line more than once
    when there are few usable lines.

    Args:
        batch_size (int): number of examples (in this case, sentences) per batch.
        max_length (int): length of every row of the output tensor.
            NOTE: max_length includes the end-of-sentence integer that the
            default `line_to_tensor` appends, which is why only lines with
            len(line) < max_length are used.
        data_lines (list): list of the sentences to group into batches.
        line_to_tensor (function, optional): function that converts line to tensor. Defaults to line_to_tensor.
        shuffle (bool, optional): True if the generator should generate random batches of data. Defaults to True.
    Yields:
        tuple: (batch, batch, mask). The first two entries are the same array
        of padded integer rows; mask has the same shape and is 1 wherever the
        padded row is non-zero and 0 elsewhere.
    Raises:
        ValueError: when the first batch is requested and either batch_size is
            smaller than 1 or no line in data_lines is shorter than max_length.
            Without this check the loop below could never fill a batch and
            would spin forever (or fail with a bare IndexError on empty input).
    """
    # Fail fast on inputs for which no batch can ever be produced.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")
    if not any(len(line) < max_length for line in data_lines):
        raise ValueError(
            f"no line in data_lines is shorter than max_length={max_length}; "
            "a batch could never be filled"
        )

    # count the number of lines in data_lines
    num_lines = len(data_lines)

    # indexes into data_lines; shuffling these leaves data_lines itself untouched
    lines_index = list(range(num_lines))
    if shuffle:
        rnd.shuffle(lines_index)

    # position inside lines_index, and the lines collected for the next batch
    index = 0
    cur_batch = []

    while True:
        # wrapped past the last line: start a new pass (reshuffled if requested)
        if index >= num_lines:
            index = 0
            if shuffle:
                rnd.shuffle(lines_index)

        line = data_lines[lines_index[index]]

        # only lines that leave room for the end-of-sentence integer are kept
        if len(line) < max_length:
            cur_batch.append(line)

        index += 1

        # once enough lines are collected, encode, pad and emit them
        if len(cur_batch) == batch_size:
            batch = []
            mask = []

            for li in cur_batch:
                # convert the line to a list of integers
                tensor = line_to_tensor(li)

                # right-pad with zeros up to max_length
                pad = [0] * (max_length - len(tensor))
                tensor_pad = tensor + pad
                batch.append(tensor_pad)

                # mask is 1 wherever tensor_pad is not 0, otherwise 0
                example_mask = [0 if t == 0 else 1 for t in tensor_pad]
                mask.append(example_mask)

            # convert the batch (data type list) to a trax numpy array
            batch_np_arr = np.array(batch)
            mask_np_arr = np.array(mask)

            # Yield two copies of the batch and mask.
            yield batch_np_arr, batch_np_arr, mask_np_arr

            # start collecting the next batch
            cur_batch = []

In [None]:
# Testing the data generator
# With max_length=10 only lines strictly shorter than 10 characters are used,
# so the 11-character line is skipped.
tmp_lines = ['12345678901', # length 11
             '1234567', # length 7
             '234567890', # length 9
             '34567', # length 5
             '234366775'] # length 9

# Get a batch size of 3, max length 10
tmp_data_gen = data_generator(batch_size=3, 
                              max_length=10, 
                              data_lines=tmp_lines,
                              shuffle=False)

# get one batch
tmp_batch = next(tmp_data_gen)

# view the batch
tmp_batch



(DeviceArray([[49, 50, 51, 52, 53, 54, 55,  1,  0,  0],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1],
              [51, 52, 53, 54, 55,  1,  0,  0,  0,  0]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55,  1,  0,  0],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1],
              [51, 52, 53, 54, 55,  1,  0,  0,  0,  0]], dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], dtype=int32))

Lines whose length is equal to or greater than 'max_length' are eliminated.

### To cycle over the dataset multiple times during training (i.e. train for multiple epochs)

In [None]:
import itertools

# Wrap the batch generator so it can be drawn from indefinitely.
# NOTE(review): data_generator never terminates on its own (it runs inside
# `while True` and restarts from index 0 when it runs out of lines), so
# itertools.cycle never gets to replay anything here. Per the itertools
# documentation, cycle also saves a copy of every item it yields, so on a long
# run the wrapper only adds memory growth.
infinite_data_generator = itertools.cycle(
    data_generator(batch_size=2, max_length=10, data_lines=tmp_lines))

In [None]:
ten_lines = [next(infinite_data_generator) for _ in range(10)]
print(len(ten_lines))

10


In [None]:
[tup[0] for tup in ten_lines]

[DeviceArray([[50, 51, 52, 53, 54, 55, 56, 57, 48,  1],
              [51, 52, 53, 54, 55,  1,  0,  0,  0,  0]], dtype=int32),
 DeviceArray([[50, 51, 52, 51, 54, 54, 55, 55, 53,  1],
              [49, 50, 51, 52, 53, 54, 55,  1,  0,  0]], dtype=int32),
 DeviceArray([[50, 51, 52, 51, 54, 54, 55, 55, 53,  1],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55,  1,  0,  0],
              [51, 52, 53, 54, 55,  1,  0,  0,  0,  0]], dtype=int32),
 DeviceArray([[51, 52, 53, 54, 55,  1,  0,  0,  0,  0],
              [50, 51, 52, 51, 54, 54, 55, 55, 53,  1]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55,  1,  0,  0],
              [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 DeviceArray([[49, 50, 51, 52, 53, 54, 55,  1,  0,  0],
              [50, 51, 52, 51, 54, 54, 55, 55, 53,  1]], dtype=int32),
 DeviceArray([[50, 51, 52, 53, 54, 55, 56, 57, 48,  1],
              [51, 52, 53, 54, 55,  1,  0,  0,  0,  0]]

Defining the GRU model
----------------------

In [None]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Build a GRU language model as a Trax Serial layer.

    Args:
        vocab_size (int, optional): Size of the vocabulary; used for both the
            embedding table and the width of the output Dense layer. Defaults to 256.
        d_model (int, optional): Depth of embedding, also used as n_units of
            every GRU layer. Defaults to 512.
        n_layers (int, optional): Number of GRU layers. Defaults to 2.
        mode (str, optional): 'train', 'eval' or 'predict'; forwarded to the
            ShiftRight layer. Defaults to "train".
    Returns:
        trax.layers.combinators.Serial: ShiftRight, Embedding, `n_layers` GRU
        layers, Dense and LogSoftmax, stacked in that order.
    """
    # Input side: shift the tokens, then embed them.
    shift = tl.ShiftRight(mode=mode)
    embedding = tl.Embedding(vocab_size=vocab_size, d_feature=d_model)

    # Recurrent core: n_layers GRU layers of d_model units each.
    gru_stack = [tl.GRU(n_units=d_model) for _ in range(n_layers)]

    # Output side: project to vocabulary size and take log-softmax.
    projection = tl.Dense(n_units=vocab_size)
    log_softmax = tl.LogSoftmax()

    # The GRU layers are handed over as one list, exactly like the other
    # layers are handed over individually.
    return tl.Serial(shift, embedding, gru_stack, projection, log_softmax)

In [None]:
# testing your model
model = GRULM()
print(model)

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


Training
--------

In [None]:
batch_size = 32
max_length = 64

In [None]:
def n_used_lines(lines, max_length):
    '''
    Count the lines that are short enough to be considered.

    Args:
    lines: all lines of text
    max_length: largest length (inclusive) a line may have and still be counted
    Return:
    number of lines whose length is at most max_length
    '''
    # Each comparison is a bool, so summing them counts the lines that pass.
    return sum(len(text) <= max_length for text in lines)

In [None]:
# Estimate how many training steps make up one pass over the data.
# The count must use the same length limit as training (max_length), not a
# separate hard-coded value. data_generator only keeps lines with
# len(line) < max_length (one slot is reserved for the end-of-sentence
# integer), which is len(line) <= max_length - 1 in n_used_lines' terms.
num_used_lines = n_used_lines(lines, max_length - 1)
print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
# Whole batches only: a trailing partial batch does not count as a step.
steps_per_epoch = num_used_lines // batch_size
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 25797
Batch size (a power of 2): 32
Number of steps to cover one epoch: 806


### Training the Model

In [None]:
from trax.supervised import training

def train_model(model, data_generator, batch_size=32, max_length=64, lines=lines, eval_lines=eval_lines, n_steps=1, output_dir='model/'):
    """Function that trains the model

    Args:
        model (trax.layers.combinators.Serial): GRU model.
        data_generator (function): Data generator function. Called as
            data_generator(batch_size, max_length, data_lines=...).
        batch_size (int, optional): Number of lines per batch. Defaults to 32.
        max_length (int, optional): Maximum length allowed for a line to be processed. Defaults to 64.
        lines (list, optional): List of lines to use for training. Defaults to lines.
        eval_lines (list, optional): List of lines to use for evaluation. Defaults to eval_lines.
        n_steps (int, optional): Number of steps to train. Defaults to 1.
        output_dir (str, optional): Relative path of directory to save model. Defaults to "model/".
    Returns:
        trax.supervised.training.Loop: Training loop for the model.
    """

    def _endless_batches(data_lines):
        """Yield batches forever without keeping any of them in memory."""
        # itertools.cycle saves a copy of every item it yields. Wrapped around
        # a generator that never ends, it would hold on to every batch ever
        # produced. Re-creating the generator when (if) it runs dry gives the
        # same endless stream without that cache.
        while True:
            produced_any = False
            for batch in data_generator(batch_size, max_length, data_lines=data_lines):
                produced_any = True
                yield batch
            if not produced_any:
                # An empty generator would otherwise make this loop spin forever.
                return

    train_stream = _endless_batches(lines)
    eval_stream = _endless_batches(eval_lines)

    # Training: cross-entropy loss optimised with Adam (learning rate 0.005).
    train_task = training.TrainTask(
        labeled_data = train_stream,
        loss_layer = tl.CrossEntropyLoss(),
        optimizer = trax.optimizers.Adam(0.005)
    )

    # Evaluation: cross-entropy and accuracy over 3 batches per evaluation.
    eval_task = training.EvalTask(
        labeled_data = eval_stream,
        metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches = 3
    )

    training_loop = training.Loop(
                                  model,
                                  train_task,
                                  eval_tasks = eval_task,
                                  output_dir = output_dir
                                  )

    training_loop.run(n_steps = n_steps)

    return training_loop

In [None]:
training_loop = train_model(GRULM(), data_generator, n_steps = 1000)


Step      1: Total number of trainable weights: 3411200
Step      1: Ran 1 train steps in 5.65 secs
Step      1: train CrossEntropyLoss |  5.54514122
Step      1: eval  CrossEntropyLoss |  5.50072543
Step      1: eval          Accuracy |  0.16009497

Step    100: Ran 99 train steps in 143.85 secs
Step    100: train CrossEntropyLoss |  2.75114369
Step    100: eval  CrossEntropyLoss |  2.26311048
Step    100: eval          Accuracy |  0.33529158

Step    200: Ran 100 train steps in 152.19 secs
Step    200: train CrossEntropyLoss |  2.05339527
Step    200: eval  CrossEntropyLoss |  1.94884515
Step    200: eval          Accuracy |  0.42267636

Step    300: Ran 100 train steps in 153.53 secs
Step    300: train CrossEntropyLoss |  1.85381770
Step    300: eval  CrossEntropyLoss |  1.79082656
Step    300: eval          Accuracy |  0.46654411

Step    400: Ran 100 train steps in 151.47 secs
Step    400: train CrossEntropyLoss |  1.73168683
Step    400: eval  CrossEntropyLoss |  1.74559045
Step

Evaluation
----------

In [None]:
def test_model(preds, target):
    """Compute the log perplexity of predictions against their targets.

    Args:
        preds (jax.interpreters.xla.DeviceArray): Predictions of a list of batches of tensors corresponding to lines of text.
            The size of the last axis is used as the one-hot depth.
        target (jax.interpreters.xla.DeviceArray): Actual list of batches of tensors corresponding to lines of text.
            Entries equal to 0 are treated as padding and ignored.
    Returns:
        float: log_perplexity of the model.
    """
    vocab_depth = preds.shape[-1]

    # Pick out, for every position, the predicted value at the target index.
    target_log_probs = np.sum(preds * tl.one_hot(target, vocab_depth), axis=-1)

    # 1.0 for real tokens, 0.0 where the target is the padding value 0.
    keep = 1.0 - np.equal(target, 0)

    # Average over the non-padding positions only, then flip the sign.
    masked_total = np.sum(target_log_probs * keep)
    mean_log_prob = masked_total / np.sum(keep)

    return -mean_log_prob

In [None]:
# Testing: measure log perplexity of the saved model on a single batch.
# `model` is a module-level name; the `predict` function defined later in the
# notebook reads it as a global.
model = GRULM()
# Presumably the checkpoint written by the training loop into its default
# output_dir ('model/') -- confirm the file exists before running this cell.
model.init_from_file('model/model.pkl.gz')
# NOTE(review): the batch is the first un-shuffled batch drawn from `lines`
# (the training list), not from `eval_lines`, so this figure is measured on
# training data -- confirm that is intended.
batch = next(data_generator(batch_size, max_length, lines, shuffle=False))
# batch[0] holds the inputs and batch[1] the targets (the generator yields the same array twice).
preds = model(batch[0])
log_ppx = test_model(preds, batch[1])
print('The log perplexity and perplexity of your model are respectively', log_ppx, np.exp(log_ppx))

The log perplexity and perplexity of your model are respectively 1.7787589 5.9225016


Generating language with the model
-----------------------------------------

In [None]:
# Run this cell to generate some new sentences
def gumbel_sample(log_probs, temperature=1.0):
    """Draw an index per row by adding scaled Gumbel noise and taking the argmax.

    Args:
        log_probs: array of scores; sampling is done over the last axis.
        temperature (float, optional): multiplier applied to the noise. With 0
            the noise vanishes and the result is the plain argmax. Defaults to 1.0.
    Returns:
        The argmax over the last axis of `log_probs` plus the scaled noise.
    """
    # Uniform noise kept strictly inside (0, 1) so both logarithms stay finite.
    uniform_noise = numpy.random.uniform(low=1e-6, high=1.0 - 1e-6, size=log_probs.shape)
    gumbel_noise = -np.log(-np.log(uniform_noise))
    perturbed = log_probs + gumbel_noise * temperature
    return np.argmax(perturbed, axis=-1)

def predict(num_chars, prefix):
    """Generate up to `num_chars` characters that continue `prefix`.

    Uses the module-level `model` and `gumbel_sample`.

    Args:
        num_chars (int): maximum number of characters to generate.
        prefix (str): text to start from; may be empty.
    Returns:
        str: `prefix` followed by the generated characters. Generation stops
        early when the sampled integer is 1 (the end-of-sentence value), which
        is not added to the returned text.
    """
    token_ids = [ord(ch) for ch in prefix]
    generated = list(prefix)
    total_len = len(prefix) + num_chars

    for _ in range(num_chars):
        # Zero-pad the tokens seen so far to the full target length and add
        # a leading batch axis before running the model.
        padding = [0] * (total_len - len(token_ids))
        padded = np.array(token_ids + padding)
        log_probs = model(padded[None, :])

        # Sample the character for the first position that is still padding.
        sampled = int(gumbel_sample(log_probs[0, len(token_ids)]))
        token_ids.append(sampled)

        if sampled == 1:
            break  # EOS
        generated.append(chr(sampled))

    return "".join(generated)

print(predict(32, ""))

produre is any courages and


In [None]:
print(predict(32, ""))
print(predict(32, ""))
print(predict(32, ""))

pyrcy is moves my loved me oweve
you lothing it word and my duty,
that if farshipy chrosses and na


In [None]:
print(predict(32, "I love"))

I love of york's word of my near; him 


The model generates text that makes sense, capturing dependencies between words, even when it is given no input prefix.