Import libraries

In [15]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer

Load a dataset of sentences

In [16]:
# Read the whole corpus in one go and flatten it onto a single line
# (every newline becomes a space) so sentences can span original line breaks.
with open("dataset.txt", 'r') as file:
    raw_text = file.read().replace("\n", " ")

The vocabulary is built using the words in the given dataset

The dataset is modified such that start (<span style="color:red">"\<s\>"</span>) and end (<span style="color:red">"\</s\>"</span>) tokens are wrapped around each sentence.

The tokens are also added to the vocabulary.


In [17]:
def add_sentence_tokens(tokens, start_token, stop_token, punctuations):
    """ Adds start and end tokens to the beginning and end of each sentence.

    A sentence ends at every token found in ``punctuations``. A sentence is only
    opened when a token actually belongs to it, so input that ends with a
    terminator does not get an extra, empty ``start_token, stop_token`` pair
    appended after the last real sentence.

    Args:
        tokens (list): tokenized dataset
        start_token (str): marker inserted before the first token of a sentence
        stop_token (str): marker inserted after the last token of a sentence
        punctuations (list): tokens that terminate a sentence (e.g. '.', '!', '?')
    Returns:
        list: modified tokenized dataset. For empty input a single empty
            sentence ``[start_token, stop_token]`` is returned so that both
            markers are always present in the result.
    """
    result = []
    sentence_open = False
    for token in tokens:
        if not sentence_open:
            # Open the sentence lazily, only once there is a token to put in it.
            result.append(start_token)
            sentence_open = True
        result.append(token)
        if token in punctuations:
            result.append(stop_token)
            sentence_open = False
    if sentence_open:
        # The text ended without a terminator: close the trailing sentence.
        result.append(stop_token)
    elif not result:
        # No tokens at all: still emit the two markers.
        result.extend([start_token, stop_token])
    return result

In [18]:
# Sentence boundary markers and the punctuation tokens that close a sentence.
start_token = "<s>"
stop_token = "</s>"
punctuations = ['.', '!', '?']

# Tokenize the corpus, wrap every sentence in the boundary markers, and collect
# the distinct tokens (markers included) into an alphabetically sorted vocabulary.
tokenizer = get_tokenizer('basic_english')
dataset = add_sentence_tokens(
    tokenizer(raw_text), start_token, stop_token, punctuations
)
vocab = sorted(set(dataset))

The TokenLevelRNN class includes the implementation of the language model together with methods used in training the model and making predictions.

In [19]:
class TokenLevelRNN(nn.Module):
    """ RNN-based token level language model

    The developed model generates sentences one token at a time: tokens are
    embedded, passed through a (possibly stacked) ``nn.RNN`` and projected onto
    the vocabulary by a linear layer.
    """
    def __init__(self, vocab, embedding_size, num_hidden_layers, hidden_layer_size, start_token, stop_token, tokenizer, punctuations):
        """ Builds the network layers and the token <-> index lookup tables
        Args:
            vocab (list): unique tokens; the position of a token is its index
            embedding_size (int): dimension of the token embeddings
            num_hidden_layers (int): number of stacked RNN layers
            hidden_layer_size (int): number of features of the RNN hidden state
            start_token (str): token marking the beginning of a sentence
            stop_token (str): token marking the end of a sentence
            tokenizer (callable): converts a string to a list of tokens
            punctuations (str or list): tokens written without a leading space
                when generated tokens are joined back into text
        """
        super().__init__()
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.embedding = nn.Embedding(self.vocab_size, embedding_size)
        # nn.RNN only applies dropout between stacked layers and warns when a
        # non-zero value is given for a single layer, so disable it in that case.
        rnn_dropout = 0.2 if num_hidden_layers > 1 else 0.0
        self.rnn = nn.RNN(embedding_size, hidden_layer_size, batch_first=True, num_layers=num_hidden_layers, dropout=rnn_dropout)
        self.fully_connected = nn.Linear(hidden_layer_size, self.vocab_size)
        self.token_to_index_map = {token: index for index, token in enumerate(vocab)}
        self.index_to_token_map = vocab
        self.tokenizer = tokenizer
        self.start_token = start_token
        self.stop_token = stop_token
        # Honour the constructor argument instead of a hard-coded constant.
        self.punctuations = punctuations


    def forward(self, x, hidden = None):
        """ Neural network forward pass
        Args:
            x(torch.Tensor): token indices (integer tensor), batch-first when batched
            hidden(torch.Tensor): initial hidden state; None lets nn.RNN use its
                default initial state
        Returns:
            tuple: logits over the vocabulary for every input position and the
                last calculated hidden state
        """
        x = self.embedding(x)
        output, hidden = self.rnn(x, hidden)
        logits = self.fully_connected(output)
        return logits, hidden


    def encode(self, text):
        """ Converts a sequence of tokens to their indices in the vocabulary
        Args:
            text (list): tokens to be converted (already tokenized text)
        Returns:
            list: list of indices
        Raises:
            KeyError: if a token is not part of the vocabulary
        """
        return [self.token_to_index_map[token] for token in text]


    def decode(self, indices):
        """ Converts indices to corresponding tokens
        Args:
            indices(list): list of indices
        Returns:
            list: decoded tokens, in the same order as the indices
        """
        return [self.index_to_token_map[index] for index in indices]


    def __generate_batch(self, data, seq_length, batch_size):
        """ Randomly samples data and generate a batch
        Args:
            data(torch.Tensor): 1-D tensor of token indices to be sampled
            seq_length(int): length of each sampled sequence
            batch_size(int): batch size
        Returns:
            tuple: a batch of input sequences and the labels, which are the same
                sequences shifted one position to the right (next-token targets)
        """
        # The upper bound leaves room for the one-step-shifted label window.
        batch_start_indices = torch.randint(len(data) - seq_length, (batch_size,))
        x = torch.stack([data[i:i+seq_length] for i in batch_start_indices])
        y = torch.stack([data[i+1:i+seq_length+1] for i in batch_start_indices])
        return x, y


    def train_model(self, data, epochs, optimizer, criterion, seq_length = 50, batch_size = 32, loss_print_interval = 1000):
        """ Trains the weights of the model
        Args:
            data(list): tokenized training data (every token must be in the vocabulary)
            epochs: number of epochs; each epoch is one optimisation step on a
                single randomly sampled batch
            optimizer: e.g., torch.optim.Adam
            criterion: e.g., torch.nn.CrossEntropyLoss
            seq_length(int): length of each randomly sampled data sequence
            batch_size(int): size of randomly sampled batch of data per each epoch
            loss_print_interval (int): the interval to print the loss during training
        Returns:
            None
        """
        data = torch.tensor(self.encode(data), dtype = torch.long)
        self.train()
        for epoch in range(1, epochs+1):
            x_batch, y_batch = self.__generate_batch(data, seq_length, batch_size)
            logits, _ = self(x_batch)
            # Flatten (batch, position) so every position is one classification sample.
            loss = criterion(logits.view(-1, self.vocab_size), y_batch.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if epoch == 1 or epoch % loss_print_interval == 0:
                print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


    def __calculate_hidden_state(self, input_seq):
        """ Calculates hidden state (which is used to generate the next token given a sequence)
        Args:
            input_seq(torch.Tensor): sequence of input indices with a leading
                batch dimension of size 1
        Returns:
            torch.Tensor: last calculated hidden state with the batch dimension
                removed, or None when the sequence is empty
        """
        if input_seq.nelement() == 0:
            return None
        _, hidden = self(input_seq, None)
        # Drop the batch dimension: generation feeds single, unbatched tokens.
        return hidden.squeeze(1)


    def __convert_tokens_to_sentences(self, tokens):
        """ Converts list of tokens to sentences
        Args:
            tokens (list): list of tokens
        Returns:
            str: the sentences formed by input tokens, with start/stop tokens
                removed and punctuation attached to the preceding word
        """
        ans = []
        for token in tokens:
            if token in self.punctuations:
                ans.append(token)
            elif token in [self.start_token, self.stop_token]:
                continue
            else:
                ans.append(" " + token)
        # Re-join contractions that the tokenizer split around the apostrophe.
        return "".join(ans).strip().replace("' ", "'")


    def generate_sentence(self, start_text = "", temperature = 1.0, expected_sentences_count = 5):
        """ Generates sentences word by word given a start text. Uses softmax, and samples a word randomly
        Args:
            start_text(str): start_text of the expected paragraph
            temperature(float): temperature scaling applied to logits (must be > 0)
                temperature<1 -> more confident sampling (peaky distribution)
                temperature=1 -> normal sampling
                temperature>1 -> more creative sampling (flatter distribution)
            expected_sentences_count(int): number of stop tokens to sample
                before generation ends
        Returns:
            str: generated sentences
        Raises:
            ValueError: if temperature is not positive
            KeyError: if start_text contains a token that is not in the vocabulary
        """
        if temperature <= 0:
            raise ValueError("temperature must be a positive number")
        generated_sentences_count = 0
        tokenized_start_text = self.tokenizer(start_text)
        if not tokenized_start_text or tokenized_start_text[0] != self.start_token:
            tokenized_start_text = [self.start_token] + tokenized_start_text
        generated_indices = self.encode(tokenized_start_text)
        input_seq = torch.tensor(generated_indices, dtype=torch.long).unsqueeze(0)
        # Switch to evaluation mode *before* priming the hidden state so that
        # dropout does not perturb the state computed from the start text.
        self.eval()
        # Sampling needs no gradients; without no_grad the autograd graph would
        # keep growing through the hidden state on every generated token.
        with torch.no_grad():
            hidden = self.__calculate_hidden_state(input_seq[:, :-1])
            while True:
                current_index = torch.tensor([generated_indices[-1]], dtype=torch.long)
                logits, hidden = self(current_index, hidden)
                probs = torch.softmax(logits / temperature, dim=-1)
                predicted_index = torch.multinomial(probs, num_samples=1).item()
                if self.index_to_token_map[predicted_index] == self.stop_token:
                    generated_sentences_count += 1
                    # ">=" also terminates for non-positive requested counts.
                    if generated_sentences_count >= expected_sentences_count:
                        break
                generated_indices.append(predicted_index)
        # A plain Python list is kept for the generated indices, so a sequence
        # holding only the start token still decodes as a one-element list.
        tokens = self.decode(generated_indices)
        return self.__convert_tokens_to_sentences(tokens)

Creating an object of the TokenLevelRNN class (embedding size = 200, hidden layer size = 1024, number of hidden layers = 2)

In [20]:
# Instantiate the language model on the vocabulary built above.
token_level_model = TokenLevelRNN(
    vocab = vocab,
    embedding_size = 200,
    num_hidden_layers = 2,
    hidden_layer_size = 1024,
    start_token = start_token,
    stop_token = stop_token,
    tokenizer = tokenizer,
    punctuations = ",.?!:'",
)

Defining the optimizer and the loss function, then training the model

In [21]:
# Adam optimizer (learning rate 0.002) with cross-entropy loss over the vocabulary.
optimizer = optim.Adam(token_level_model.parameters(), lr = 0.002)
criterion = nn.CrossEntropyLoss()

# Train on randomly sampled batches and report the loss every 100 epochs.
token_level_model.train_model(
    data = dataset,
    epochs = 1000,
    optimizer = optimizer,
    criterion = criterion,
    seq_length = 50,
    batch_size = 256,
    loss_print_interval = 100,
)

Epoch 1, Loss: 8.6943
Epoch 100, Loss: 3.2874
Epoch 200, Loss: 2.0825
Epoch 300, Loss: 1.2689
Epoch 400, Loss: 0.7497
Epoch 500, Loss: 0.4755
Epoch 600, Loss: 0.3710
Epoch 700, Loss: 0.3199
Epoch 800, Loss: 0.3100
Epoch 900, Loss: 0.2862
Epoch 1000, Loss: 0.2826


<span style = "color:blue">Example 1:</span> Generating sentences without any starting text

In [25]:
# No prompt: generation starts from the sentence start token alone.
ans = token_level_model.generate_sentence(
    start_text = "",
    expected_sentences_count = 5,
)
print(ans)

i dreamed a giant called hagrid came to tell me i was going to a school for wizards. when i open my eyes i'll be at home in, as if they're not wanted at home. he was looking over at harry as he spoke. crabbe and goyle chuckled. it, as if snape had started handing out sweets.


<span style = "color:blue">Example 2:</span> Generating sentences, the first of which starts with <span style = "color:blue">"Harry was sad because he could not remember"</span>

In [23]:
# Prompted generation: the model continues the given opening words.
ans = token_level_model.generate_sentence(
    start_text = "Harry was sad because he could not remember",
    expected_sentences_count = 5,
)
print(ans)

harry was sad because he could not remember. finally he said slowly, so me, now. a braver man than vernon dursley would have quailed under the furious look hagrid now wouldn't be allowed to. bill norbert was about quirrell sometimes there was something else to see what he had done in the walls of books. it was only then that harry realized what was standing behind quirrell.


<span style = "color:blue">Example 3:</span> Generating sentences, the first of which starts with <span style = "color:blue">"Ron was excited about"</span>

In [27]:
# Prompted generation: the model continues the given opening words.
ans = token_level_model.generate_sentence(
    start_text = "Ron was excited about",
    expected_sentences_count = 5,
)
print(ans)

ron was excited about the owls or the bludgers unless they crack my head open. don't worry, i'm going to drag harry away. sometimes, he'd never been more nervous, never, not even when he'd had to take a school report home to the dursleys saying that he'd somehow turned his teacher's wig blue. he kept his eyes fixed on the door. any second now, professor mcgonagall would come back and lead him to his doom.
