# Deep N-grams

1. How to convert a line of text into a tensor
2. Create an iterator to feed data to the model
3. Define a GRU model using trax
4. Train the model using trax
5. Compute the accuracy of your model using the perplexity
6. Predict using your own model

In [4]:
import os
import shutil
import trax
import trax.fastmath.numpy as np
import pickle
import numpy
import random as rnd
from trax import fastmath
from trax import layers as tl

Matplotlib is building the font cache; this may take a moment.


In [12]:
lines = []
with open('./data/shakespeare.txt','r') as f:
    files = f.read()
    for line in files:
        striped_line = line.strip()
        if striped_line:
            lines.append(striped_line)

In [15]:
len(lines)

230397

In [18]:
#convert everything in lower case
for i, line in enumerate(lines):
    lines[i] = line.lower()

In [39]:
eval_lines = lines[-1000:] # Create a holdout validation set
lines = lines[:-1000] # Leave the rest for training

print(f"Number of lines for training: {len(lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 229397
Number of lines for validation: 1000


### Convert a line to tensor

In [21]:
def line_to_tensor(line, EOS_int = 1):
    res = []
    for c in line:
        res.append(ord(c))
    res.append(EOS_int)
    return res

In [22]:
line_to_tensor('I am learning NLP')

[73, 32, 97, 109, 32, 108, 101, 97, 114, 110, 105, 110, 103, 32, 78, 76, 80, 1]

In [30]:
def data_generator(batch_size, max_length,data_lines, line_to_tensor = line_to_tensor, shuffle=True):
    index = 0
    num_of_lines = len(data_lines)
    line_index = [*range(num_of_lines)]
    curr_batch = []
    if shuffle:
        rnd.shuffle(line_index)

    while True:

        if index >= num_of_lines:
            index = 0 
            if shuffle:
                rnd.shuffle(line_index)

        line = data_lines[line_index[index]]

        if len(line) < max_length:
            curr_batch.append(line)

        index += 1

        if len(curr_batch) == batch_size:
            batch = []
            mask = []
            for l in curr_batch:
                tensor = line_to_tensor(l)
                pad = [0]*(max_length - len(tensor))
                batch.append(tensor+pad)
                mask.append(len(tensor)*[1] + len(pad)*[0])
            batch_np_arr = np.array(batch)
            mask_np_arr = np.array(mask)

            yield batch_np_arr, batch_np_arr, mask_np_arr

            cur_batch = []
            

                
        
    

In [31]:
# Try out your data generator
tmp_lines = ['12345678901', #length 11
             '123456789', # length 9
             '234567890', # length 9
             '345678901'] # length 9

# Get a batch size of 2, max length 10
tmp_data_gen = data_generator(batch_size=2, 
                              max_length=10, 
                              data_lines=tmp_lines,
                              shuffle=False)

# get one batch
tmp_batch = next(tmp_data_gen)

# view the batch
tmp_batch

(Array([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
        [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 Array([[49, 50, 51, 52, 53, 54, 55, 56, 57,  1],
        [50, 51, 52, 53, 54, 55, 56, 57, 48,  1]], dtype=int32),
 Array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32))

The way the iterator is currently defined, it will keep providing batches forever.
Although it is not needed, we want to show you the itertools.cycle function which is really useful when the generator eventually stops
Notice that it is expected to use this function within the training function further below
Usually we want to cycle over the dataset multiple times during training (i.e. train for multiple epochs).
For small datasets we can use itertools.cycle to achieve this easily.

In [32]:
import itertools
infinite_data_generator = itertools.cycle(
    data_generator(batch_size=2, max_length=10, data_lines=tmp_lines))

### GRU Model

In [33]:
def GRULM(vocab_size=256, d_model=512, n_layers=2, mode='train'):
    """Returns a GRU language model.

    Args:
        vocab_size (int, optional): Size of the vocabulary. Defaults to 256.
        d_model (int, optional): Depth of embedding (n_units in the GRU cell). Defaults to 512.
        n_layers (int, optional): Number of GRU layers. Defaults to 2.
        mode (str, optional): 'train', 'eval' or 'predict', predict mode is for fast inference. Defaults to "train".

    Returns:
        trax.layers.combinators.Serial: A GRU language model as a layer that maps from a tensor of tokens to activations over a vocab set.
    """
    model = tl.Serial( 
      tl.ShiftRight(mode=mode), # Stack the ShiftRight layer
      tl.Embedding(vocab_size=vocab_size,d_feature=d_model), # Stack the embedding layer
      *(tl.GRU(n_units=d_model) for i in range(n_layers)), # Stack GRU layers of d_model units keeping n_layer parameter in mind (use list comprehension syntax)
      tl.Dense(n_units=vocab_size), # Dense layer
      tl.LogSoftmax(), # Log Softmax
    ) 
    
    return model

In [34]:
model = GRULM()
print(model)

Serial[
  Serial[
    ShiftRight(1)
  ]
  Embedding_256_512
  GRU_512
  GRU_512
  Dense_256
  LogSoftmax
]


### Training

In [36]:
batch_size = 32
max_length = 64
def n_used_lines(lines, max_length):
    '''
    Args: 
    lines: all lines of text an array of lines
    max_length - max_length of a line in order to be considered an int
    output_dir - folder to save your file an int
    Return:
    number of efective examples
    '''

    n_lines = 0
    for l in lines:
        if len(l) <= max_length:
            n_lines += 1
    return n_lines

num_used_lines = n_used_lines(lines, 32)
print('Number of used lines from the dataset:', num_used_lines)
print('Batch size (a power of 2):', int(batch_size))
steps_per_epoch = int(num_used_lines/batch_size)
print('Number of steps to cover one epoch:', steps_per_epoch)

Number of used lines from the dataset: 230397
Batch size (a power of 2): 32
Number of steps to cover one epoch: 7199


In [41]:
from trax.supervised import training

In [45]:
def train_model(model, data_generator, lines, eval_lines, batch_size=32, max_length=64, n_steps=1, output_dir='model/'): 
    bare_train_generator = data_generator(batch_size,max_length,lines)
    infinite_train_generator = itertools.cycle(bare_train_generator)
    
    bare_eval_generator = data_generator(batch_size,max_length,eval_lines)
    infinite_eval_generator = itertools.cycle(bare_eval_generator)

    train_task = training.TrainTask(
        labeled_data = infinite_train_generator,
        loss_layer = tl.CrossEntropyLoss(),
        optimizer = trax.optimizers.Adam(learning_rate = 0.0005)
    )
    eval_task = training.EvalTask(
        labeled_data = infinite_eval_generator,
        metrics = [tl.CrossEntropyLoss(),tl.Accuracy()],
        n_eval_batches = 3
    )
    training_loop = training.Loop(
        model,
        train_task,
        eval_tasks = [eval_task],
        output_dir = output_dir
    )
    return training_loop

In [46]:
# Train the model 1 step and keep the `trax.supervised.training.Loop` object.
output_dir = './model/'

try:
    shutil.rmtree(output_dir)
except OSError as e:
    pass

training_loop = train_model(GRULM(), data_generator, lines=lines, eval_lines=eval_lines)



### Evaluation

In [52]:
def test_model(preds, target):

    log_p = np.sum(preds* tl.one_hot(target, pred.shape[-1]), axis = -1)
    non_pad = 1.0 - np.equal(target,0)
    log_p = log_p * non_pad 
    log_ppx = np.sum(log_p, axis = 1) / np.sum(non_pad, axis = 1)
    log_ppx = np.mean(log_ppx)

    return -log_ppx