# Basic NLP with LSTM

First let's get the modules needed to build the model

In [6]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import re

The text that is going to be used to train the network is the Old testament of the bible

In [11]:
# open text file and read in data as `text`
def read_The_Old_Testament():
    with open('./data/Book/the_old_testament.txt', 'r') as f:
        text = f.read()
    return re.sub('[^A-Za-z.,;\n]+', ' ', text).strip().lower()

In [13]:
#Now let's look at the first 200 characters of the file.
text = read_The_Old_Testament()
text[:200]


'the old testament \n\n genesis\n\n in the beginning god created the heaven and the earth. and the earth was without form, and void; and darkness was upon the face of the deep. and the spirit of god moved '

The first step to built out learning model is to tokenize the data (Divide it in characters and and build look up tables to a numeric representation of the characters to feed the network). to do this we define two dictionaries:
1. int2char, which maps integers to characters
2. char2int, which maps characters to unique integers

In [19]:
chars = tuple(set(text)) # set of all unique characters
# print(len(chars))
int2char = dict(enumerate(chars)) # dictionary mapping integers to characters
char2int = {ch: ii for ii, ch in int2char.items()} # dictionary mapping characters to integers

# encode the text
encoded = np.array([char2int[ch] for ch in text])
print(encoded[:100])


[ 2  6 13 23 18  9 21 23  2 13  8  2 15 14 13 27  2 23  5  5 23 10 13 27
 13  8  4  8  5  5 23  4 27 23  2  6 13 23  3 13 10  4 27 27  4 27 10 23
 10 18 21 23 26 24 13 15  2 13 21 23  2  6 13 23  6 13 15 28 13 27 23 15
 27 21 23  2  6 13 23 13 15 24  2  6 29 23 15 27 21 23  2  6 13 23 13 15
 24  2  6 23]


Now we turn every encoded character into a one-hot vector that is the expected input to the LSTM Net

In [26]:
def one_hot_encode(arr, n_labels):
    
    # Initialize the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.

    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot

Now we create mini-batches of data

In [34]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.
       
       Arguments
       ---------
       arr: Array you want to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
    '''
    
    batch_size_total = batch_size * seq_length
    # total number of batches we can make
    n_batches = len(arr)//batch_size_total
    
    # Keep only enough characters to make full batches
    arr = arr[:n_batches * batch_size_total]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))
    
    # iterate through the array, one sequence at a time
    for n in range(0, arr.shape[1], seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        try: # Slides the window one step forward
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, n+seq_length]
        except IndexError: # Definition of the end of data case for the targets
            y[:, :-1], y[:, -1] = x[:, 1:], arr[:, 0]
        yield x, y

Definition of the Network

In [37]:
# check if GPU is available
train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    print('Training on GPU!')
else: 
    print('No GPU available, training on CPU; consider making n_epochs very small.')

Training on GPU!


In [41]:
class CharRNN(nn.Module):
    
    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr
        
        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        
        # define the LSTM
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # define a dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # define the final, fully-connected output layer
        self.fc = nn.Linear(n_hidden, len(self.chars))
      
    
    def forward(self, x, hidden):
        ''' Forward pass through the network. 
            These inputs are x, and the hidden/cell state `hidden`. '''
                
        # Get the outputs and the new hidden state from the lstm
        r_output, hidden = self.lstm(x, hidden)
        
        # pass through a dropout layer
        out = self.dropout(r_output)

        # Stack up LSTM outputs using view
        # you may need to use contiguous to reshape the output
        out = out.contiguous().view(-1, self.n_hidden)
        
        # put x through the fully-connected layer
        out = self.fc(out)
        
        # return the final output and the hidden state
        return out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x n_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        
        return hidden

Now let's define the training loop

In [42]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()
    
    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    
    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]
    
    if(train_on_gpu):
        net.cuda()
    
    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)
        
        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            
            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)
            
            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()
            
            # get the output from the model
            output, h = net(inputs, h)
            
            # calculate the loss and perform backprop
            loss = criterion(output, targets.view(batch_size*seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()
            
            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)
                    
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])
                    
                    inputs, targets = x, y
                    if(train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(output, targets.view(batch_size*seq_length).long())
                
                    val_losses.append(val_loss.item())
                
                net.train() # reset to train mode after iterationg through validation data
                
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

Now we create a instance of the model and trainit

In [43]:
# define and print the net
n_hidden=512
n_layers=2

net = CharRNN(chars, n_hidden, n_layers)
print(net)

CharRNN(
  (lstm): LSTM(31, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=31, bias=True)
)


In [44]:
batch_size = 128
seq_length = 100
n_epochs = 20 # start smaller if you are just testing initial behavior

# train the model
train(net, encoded, epochs=n_epochs, batch_size=batch_size, seq_length=seq_length, lr=0.001, print_every=10)

Epoch: 1/20... Step: 10... Loss: 2.9083... Val Loss: 2.8767
Epoch: 1/20... Step: 20... Loss: 2.8767... Val Loss: 2.8644
Epoch: 1/20... Step: 30... Loss: 2.8750... Val Loss: 2.8617
Epoch: 1/20... Step: 40... Loss: 2.8676... Val Loss: 2.8533
Epoch: 1/20... Step: 50... Loss: 2.8656... Val Loss: 2.8362
Epoch: 1/20... Step: 60... Loss: 2.7984... Val Loss: 2.7642
Epoch: 1/20... Step: 70... Loss: 2.6666... Val Loss: 2.6243
Epoch: 1/20... Step: 80... Loss: 2.5156... Val Loss: 2.4797
Epoch: 1/20... Step: 90... Loss: 2.3441... Val Loss: 2.3122
Epoch: 1/20... Step: 100... Loss: 2.2387... Val Loss: 2.2154
Epoch: 1/20... Step: 110... Loss: 2.2108... Val Loss: 2.1475
Epoch: 1/20... Step: 120... Loss: 2.1531... Val Loss: 2.1044
Epoch: 1/20... Step: 130... Loss: 2.0999... Val Loss: 2.0525
Epoch: 1/20... Step: 140... Loss: 2.0901... Val Loss: 2.0112
Epoch: 1/20... Step: 150... Loss: 2.0261... Val Loss: 1.9720
Epoch: 1/20... Step: 160... Loss: 2.0080... Val Loss: 1.9313
Epoch: 1/20... Step: 170... Loss:

Let's save a checkpoint of the training we have done to this point

In [46]:
# change the name, for saving multiple files
model_name = './Data/save/rnn_20_epoch.net'

checkpoint = {'n_hidden': net.n_hidden,
              'n_layers': net.n_layers,
              'state_dict': net.state_dict(),
              'tokens': net.chars}

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

Now let's make predictions, we must take into account that:
> Our predictions come from a categorical probability distribution over all the possible characters. We can make the sample text and make it more reasonable to handle (with less variables) by only considering some  𝐾
  most probable characters. This will prevent the network from giving us completely absurd characters while allowing it to introduce some noise and randomness into the sampled text.

In [47]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''
        
        # tensor inputs
        x = np.array([[net.char2int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)
        
        if(train_on_gpu):
            inputs = inputs.cuda()
        
        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu
        
        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        
        # return the encoded value of the predicted char and the hidden state
        return net.int2char[char], h

In [48]:
def sample(net, size, prime='The', top_k=None):
        
    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()
    
    net.eval() # eval mode
    
    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)
    
    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [50]:
print(sample(net, 1000, prime='moses', top_k=5))

moses there is no more any that shall come to perform the land of egypt, that i may see his houses, and they shall say, that which is in the land which i have commanded you from the land which ye shall not dwelt in my hand and when they shall dividing all the workers of iniquity. and it shall be. wherefore the king said to see, the words of the priest shall be a feast toward him. but the lord hath been said, becouse he i any work it with thyer hist all the dreases of all thou, that i may save him as the lord thy god, they shall save thee and i will give thee the son of the lions and the less of the house. and they shall not bring to thee that which they have done and the people of the city, whom the lord shall send them up, and the still within thy selvents, with their flocks they will be wounded and i will say to thee, they sheep and he shall die; if the workers of israel said unto them, teach me i have cast thy father s charge in thy salvation. and when thou shalt say unto them, why 