# Advanced Methods in Text Analytics
# Exercise 4: Language Models - Part 2
### Daniel Ruffinelli
## FSS 2025

## 1. N-Gram Language Models

In [1]:
# we define a simple whitespace tokenizer that removes punctuation
def tokenize(text):
    """ 
    Given text, returns all words separated by white space after separating all
    punctuation.

    Args:
        text: string with text to tokenize

    Returns:
        list of tokens
    """

    import string

    # separate punctuation symbols with whitespaces
    for symbol in string.punctuation:
        text = text.replace(symbol, " " + symbol + " ")
    text_split = text.split()

    return text_split

In [2]:
# we test the tokenizer
text = "This is a phrase, with some punctuation, that we want to tokenize!"
print(tokenize(text))
# output should be:
# ['This', 'is', 'a', 'phrase', ',', 'with', 'some', 'punctuation', ',', 'that', 'we', 'want', 'to', 'tokenize', '!']

['This', 'is', 'a', 'phrase', ',', 'with', 'some', 'punctuation', ',', 'that', 'we', 'want', 'to', 'tokenize', '!']


### Question (a)

In [4]:
# function for counting n-grams
def compute_ngrams(n, tokenized_text):
    """ 
    Compute n-grams in given list of tokens.

    Args:
        n: size of n-grams
        tokens: list of tokens

    Returns
        list of n-grams of the form (context, next_word), where context is a 
        tuple of n -1 previous words.
    """

    # we add left-side padding (i.e. start of sentence symbols)
    tokens = (n-1)*["<s>"] + tokenized_text

    # list to store ngrams
    ngrams = []

    ### WRITE YOUR CODE HERE ###

    # iterate over sequence of tokens
    for i in range(n-1, len(tokens)):
        # iterate over context (n-1 previous words for current token)
        context = []
        for j in reversed(range(n-1)):
            context.append((tokens[i-j-1]))
        ngrams.append((tuple(context), tokens[i]))

    return ngrams

In [5]:
# test your ngram computation
text = "This is a phrase, with some punctuation, that we want to tokenize!"
print(compute_ngrams(3, tokenize(text)))
# output should be:
# [(('<s>', '<s>'), 'This'), (('<s>', 'This'), 'is'), (('This', 'is'), 'a'), (('is', 'a'), 'phrase'), (('a', 'phrase'), ','), (('phrase', ','), 'with'), ((',', 'with'), 'some'), (('with', 'some'), 'punctuation'), (('some', 'punctuation'), ','), (('punctuation', ','), 'that'), ((',', 'that'), 'we'), (('that', 'we'), 'want'), (('we', 'want'), 'to'), (('want', 'to'), 'tokenize'), (('to', 'tokenize'), '!')]

[(('<s>', '<s>'), 'This'), (('<s>', 'This'), 'is'), (('This', 'is'), 'a'), (('is', 'a'), 'phrase'), (('a', 'phrase'), ','), (('phrase', ','), 'with'), ((',', 'with'), 'some'), (('with', 'some'), 'punctuation'), (('some', 'punctuation'), ','), (('punctuation', ','), 'that'), ((',', 'that'), 'we'), (('that', 'we'), 'want'), (('we', 'want'), 'to'), (('want', 'to'), 'tokenize'), (('to', 'tokenize'), '!')]


### Question (b)

In [6]:
# define an n-gram model that keeps track of "context to next word" pairs
# and an ngram counter
from collections import defaultdict as ddict 

class NGramModel:
    """
    NGramModel class, keeps track of (context, next_word) pairs and an ngram
    counter.
    """

    def __init__(self, n):

        self._n = n

        # set to store vocabulary
        self.vocabulary = set()

        # dict to store ngram counters
        self.ngram_counter = ddict(int)

        # dict to track (context, next_word) pairs
        self.context = ddict(list)

    def extend_vocabulary(self, tokenized_text):
        """ 
        Adds tokens to vocabulary. Useful for smoothing for unseen data.
        """
        for token in tokenized_text:
            self.vocabulary.add(token)

    def update(self, tokenized_text):
        """ 
        Updates counts of ngram model by computing ngrams in given tokenized 
        text and adding them to ngram counter and context tracker.

        Args:
            text (str): text to tokenize and compute ngrams
        """

        # compute ngrams
        ngrams = compute_ngrams(self._n, tokenized_text)

        ### WRITE YOUR CODE HERE ###

        # update vocabulary
        self.extend_vocabulary(tokenized_text)

        # update model parameters
        for ngram in ngrams:
            # update ngram counts
            self.ngram_counter[ngram] += 1

            # update context tracking
            context, next_word = ngram
            self.context[context].append(next_word)

    def next_word_prob(self, context, next_word):
        """
        Returns probability of predicting next_word after given context.
        We use add-1 smoothing to process unseen words.

        Args:
            context (list of tokens): context words
            next_word (str): next word

        Returns:
            prob (float): probability of next_word following context.
        """

        # probability value to return
        prob = 0.

        ### WRITE YOUR CODE HERE ###

        try:
            # number of times we saw next_word follow context
            numerator = self.ngram_counter[(context, next_word)] + 1
        except KeyError:
            numerator = 1

        # number of times we saw context
        denominator = len(self.context[context]) + len(self.vocabulary)

        # compute prob
        prob = numerator/denominator

        return prob

In [7]:
# test your NGram model
bigram_model = NGramModel(2)
print("Size on n-gram:", bigram_model._n)
# output should be: Size on n-gram: 2 

Size on n-gram: 2


In [8]:
# test your update function
bigram_model.update(tokenize("We now need text! Let's add some text to our bigram model"))
print("Vocabulary:", bigram_model.vocabulary)
print("N-Gram Counter:", bigram_model.ngram_counter)
print("Context Dict:", bigram_model.context)
# output should be:
# Vocabulary: {'s', '!', 'model', 'to', 'Let', 'add', "'", 'bigram', 'need', 'now', 'We', 'text', 'our', 'some'}
# N-Gram Counter: defaultdict(<class 'int'>, {(('<s>',), 'We'): 1, (('We',), 'now'): 1, (('now',), 'need'): 1, (('need',), 'text'): 1, (('text',), '!'): 1, (('!',), 'Let'): 1, (('Let',), "'"): 1, (("'",), 's'): 1, (('s',), 'add'): 1, (('add',), 'some'): 1, (('some',), 'text'): 1, (('text',), 'to'): 1, (('to',), 'our'): 1, (('our',), 'bigram'): 1, (('bigram',), 'model'): 1})
# Context Dict: defaultdict(<class 'list'>, {('<s>',): ['We'], ('We',): ['now'], ('now',): ['need'], ('need',): ['text'], ('text',): ['!', 'to'], ('!',): ['Let'], ('Let',): ["'"], ("'",): ['s'], ('s',): ['add'], ('add',): ['some'], ('some',): ['text'], ('to',): ['our'], ('our',): ['bigram'], ('bigram',): ['model']})

Vocabulary: {'!', 'bigram', 'our', 'model', 'text', 'add', 'some', 'to', 's', 'now', 'Let', 'We', 'need', "'"}
N-Gram Counter: defaultdict(<class 'int'>, {(('<s>',), 'We'): 1, (('We',), 'now'): 1, (('now',), 'need'): 1, (('need',), 'text'): 1, (('text',), '!'): 1, (('!',), 'Let'): 1, (('Let',), "'"): 1, (("'",), 's'): 1, (('s',), 'add'): 1, (('add',), 'some'): 1, (('some',), 'text'): 1, (('text',), 'to'): 1, (('to',), 'our'): 1, (('our',), 'bigram'): 1, (('bigram',), 'model'): 1})
Context Dict: defaultdict(<class 'list'>, {('<s>',): ['We'], ('We',): ['now'], ('now',): ['need'], ('need',): ['text'], ('text',): ['!', 'to'], ('!',): ['Let'], ('Let',): ["'"], ("'",): ['s'], ('s',): ['add'], ('add',): ['some'], ('some',): ['text'], ('to',): ['our'], ('our',): ['bigram'], ('bigram',): ['model']})


In [9]:
# test your next_word_prob function
print("Probability that word 'to' follows word 'text':", bigram_model.next_word_prob(("text",), "to"))
# output should be:
# Probability that word 'to' follows word 'text': 0.125

Probability that word 'to' follows word 'text': 0.125


### Question (c)

In [13]:
# define function to predict next word given context
def predict_next_word(context, model):
    """
    Uses given model to predict the next word that could follow the given 
    context. Each possible next word is chosen based on its probability given
    by the model.
    """

    import random

    # get all possible next words
    next_words = model.context[context]
    next_words_probs = []

    ### WRITE YOUR CODE HERE ###

    # get probabilities of each next_word
    for next_word in next_words:
        next_words_probs.append(model.next_word_prob(context, next_word))
    # normalize probs
    prob_mass = 0
    for prob in next_words_probs:
        prob_mass += prob
    for i, prob in enumerate(next_words_probs):
        next_words_probs[i] = next_words_probs[i] / prob_mass

    # select one of possible next word to predict
    predicted_word = random.choices(next_words, next_words_probs, k=1)

    return predicted_word[0]


In [21]:
# test your function predict_next_word
predict_next_word(("text",), bigram_model)
# output should be either '!' or 'to'

'!'

### Question (d) 

In [23]:
# load shakespeare's entire works
with open("shakespeare_train.txt") as f:
    corpus = f.read()

# break it into sentences
corpus_sentences = corpus.split(".")

In [24]:
# let's create a trigram model
trigram_model = NGramModel(3)

# feed corpus to model
for sentence in corpus_sentences:
        trigram_model.update(tokenize(sentence))


In [26]:
# check most common ngrams
top = 100
sorted(trigram_model.ngram_counter.items(), key=lambda x:x[1], reverse=True)[:top]

[((('<s>', '<s>'), 'I'), 4293),
 ((('I', "'"), 'll'), 1634),
 ((('<s>', '<s>'), 'The'), 1547),
 ((('<s>', '<s>'), 'What'), 1510),
 ((('<s>', '<s>'), '['), 1500),
 ((('<s>', '<s>'), 'O'), 1342),
 ((('<s>', '<s>'), "'"), 1214),
 ((('<s>', '<s>'), 'And'), 1087),
 ((('<s>', '<s>'), 'But'), 1041),
 ((('<s>', '<s>'), 'Enter'), 1018),
 ((("'", 'd'), ','), 1002),
 ((('<s>', '<s>'), 'You'), 923),
 (((',', 'my'), 'lord'), 878),
 ((('<s>', '<s>'), 'My'), 873),
 ((('<s>', '<s>'), 'If'), 869),
 ((('<s>', '<s>'), 'He'), 809),
 ((('<s>', '<s>'), 'A'), 807),
 ((('<s>', '<s>'), 'Exeunt'), 806),
 ((('<s>', '<s>'), 'Why'), 778),
 ((('<s>', '<s>'), 'KING'), 765),
 (((',', 'sir'), ','), 757),
 ((('<s>', '<s>'), 'Exit'), 688),
 ((('<s>', '<s>'), 'No'), 665),
 ((('<s>', '<s>'), 'Come'), 662),
 ((('<s>', 'O'), ','), 640),
 ((('<s>', '<s>'), 'This'), 625),
 ((('<s>', '<s>'), 'How'), 610),
 ((('<s>', '<s>'), 'Ay'), 559),
 ((('<s>', '<s>'), 'Now'), 556),
 ((('<s>', '<s>'), 'It'), 550),
 ((('<s>', 'Why'), ','), 5

### Question (e)

In [27]:
# we define function to generate n tokens given starting token
def generate_text(num_tokens, context, model):
    """
    Generate num_tokens words using given model

    Args:
        num_tokens (int): number of tokens to generate
        context (tuple of previous words): starting prompt
        model (NGramModel): ngram model
    """

    output = []

    ### WRITE YOUR CODE HERE ###

    for word in context:
        output.append(word)
    for i in range(num_tokens):
        output.append(predict_next_word(context, model))
        context = tuple(list(context[1:]) + [output[-1]])

    return " ".join(output)

In [30]:
# predict next words
context = ("My", "lord")

# test text generation
# this may fail at times because our simple model does not have an entry for the
# given context in its {context: next_word} dict, which in turn happens because
# the generated context (prompt) is a combination the model has not seen
# In short, it's a problem of our implementation that uses a dict
# In practice, it's not often done like this (we'll see in the next tasks)
print(generate_text(40, context, trigram_model))

My lord , I ' ll speak with her , and that ' s eyes , And in the advantage of the city , and I ' ll give thee all my heart , to the King , Nor thou with thy


### Question (f)

In [31]:
# let's try stronger models
fourgram_model = NGramModel(4)

# feed corpus to model
# fourgram_model.update(tokenize(corpus))
for sentence in corpus_sentences:
        fourgram_model.update(tokenize(sentence))

# check most common ngrams
top = 10
sorted(fourgram_model.ngram_counter.items(), key=lambda x:x[1], reverse=True)[:top]

[((('<s>', '<s>', '<s>'), 'I'), 4293),
 ((('<s>', '<s>', '<s>'), 'The'), 1547),
 ((('<s>', '<s>', '<s>'), 'What'), 1510),
 ((('<s>', '<s>', '<s>'), '['), 1500),
 ((('<s>', '<s>', '<s>'), 'O'), 1342),
 ((('<s>', '<s>', '<s>'), "'"), 1214),
 ((('<s>', '<s>', '<s>'), 'And'), 1087),
 ((('<s>', '<s>', '<s>'), 'But'), 1041),
 ((('<s>', '<s>', '<s>'), 'Enter'), 1018),
 ((('<s>', '<s>', '<s>'), 'You'), 923)]

In [32]:
# predict next words
context = ("I", "'", "ll")

# test text generation
print(generate_text(20, context, fourgram_model))

I ' ll be with you , sir , to speak more properly , stays me at home ; For by this knot


In [33]:
# and a 5-gram models
fivegram_model = NGramModel(5)

# feed corpus to model
# fivegram_model.update(tokenize(corpus))
for sentence in corpus_sentences:
        fivegram_model.update(tokenize(sentence))

# check most common ngrams
top = 10
sorted(fivegram_model.ngram_counter.items(), key=lambda x:x[1], reverse=True)[:top]


[((('<s>', '<s>', '<s>', '<s>'), 'I'), 4293),
 ((('<s>', '<s>', '<s>', '<s>'), 'The'), 1547),
 ((('<s>', '<s>', '<s>', '<s>'), 'What'), 1510),
 ((('<s>', '<s>', '<s>', '<s>'), '['), 1500),
 ((('<s>', '<s>', '<s>', '<s>'), 'O'), 1342),
 ((('<s>', '<s>', '<s>', '<s>'), "'"), 1214),
 ((('<s>', '<s>', '<s>', '<s>'), 'And'), 1087),
 ((('<s>', '<s>', '<s>', '<s>'), 'But'), 1041),
 ((('<s>', '<s>', '<s>', '<s>'), 'Enter'), 1018),
 ((('<s>', '<s>', '<s>', '<s>'), 'You'), 923)]

In [43]:
# predict next words
context = ("<s>", "<s>", "<s>", "<s>")

# test text generation
print(generate_text(20, context, fivegram_model))

<s> <s> <s> <s> I ' ll give you a pottle of burnt sack to give me recourse to him , and will make


### Question (g)

In [44]:
# compute likelihood of validation data
def compute_likelihood(tokenized_text, model, n):
    """
    Compute log_likelihood of given tokenized text using given model.
    We compute likelihood in log space to prevent numerical underflow.

    Args:
        tokenized_text (list of tokens): text to compute likelihood of
        model (NGramModel): model used to compute likelihood
        n (int): size of n-grams of givem model
    """

    import math

    log_likelihood = 0.0

    ### WRITE YOUR CODE HERE ###

    for i, token in enumerate(tokenized_text[n:], start=n):
        context = tuple(tokenized_text[i-n+1:i])
        log_likelihood += math.log(model.next_word_prob(context, token))

    return log_likelihood, math.exp(-log_likelihood / i)

In [45]:
# load shakespeare's validation data
with open("shakespeare_valid.txt") as f:
    corpus_valid = f.read()

# tokenize it
tokenized_validation = tokenize(corpus_valid)

print(tokenized_validation[:100])

['1605', 'THE', 'TRAGEDY', 'OF', 'OTHELLO', ',', 'MOOR', 'OF', 'VENICE', 'by', 'William', 'Shakespeare', 'Dramatis', 'Personae', 'OTHELLO', ',', 'the', 'Moor', ',', 'general', 'of', 'the', 'Venetian', 'forces', 'DESDEMONA', ',', 'his', 'wife', 'IAGO', ',', 'ensign', 'to', 'Othello', 'EMILIA', ',', 'his', 'wife', ',', 'lady', '-', 'in', '-', 'waiting', 'to', 'Desdemona', 'CASSIO', ',', 'lieutenant', 'to', 'Othello', 'THE', 'DUKE', 'OF', 'VENICE', 'BRABANTIO', ',', 'Venetian', 'Senator', ',', 'father', 'of', 'Desdemona', 'GRATIANO', ',', 'nobleman', 'of', 'Venice', ',', 'brother', 'of', 'Brabantio', 'LODOVICO', ',', 'nobleman', 'of', 'Venice', ',', 'kinsman', 'of', 'Brabantio', 'RODERIGO', ',', 'rejected', 'suitor', 'of', 'Desdemona', 'BIANCA', ',', 'mistress', 'of', 'Cassio', 'MONTANO', ',', 'a', 'Cypriot', 'official', 'A', 'Clown', 'in', 'service']


In [46]:
# extend trigram model vocabulary for proper smoothing
trigram_model.extend_vocabulary(tokenized_validation)

#  compute validation likelihood
valid_likelihood, perplexity = compute_likelihood(tokenize(corpus_valid), trigram_model, 3)
print("Validation likelihood:", valid_likelihood)
print("Validation perplexity:", perplexity)

Validation likelihood: -354329.59077216004
Validation perplexity: 16012.682762755616


In [52]:
# vocabulary size
print("Vocabulary size:", len(trigram_model.vocabulary))

Vocabulary size: 28858


## 2. Language Models with Fully-Connected Neural Networks

In [3]:
import torch

# set device to "cpu" if you don't have a GPU
DEVICE="cpu"

# test it
a = torch.rand(3,3).to(DEVICE)
print(a)

tensor([[0.7885, 0.5821, 0.4990],
        [0.9641, 0.7604, 0.6491],
        [0.1664, 0.5281, 0.4575]])


In [4]:
# let's preprocess our text (we work with embeddings now, not just strings)
from collections import defaultdict as ddict

# these are our splits
shakespeare_splits = {
    "train": "shakespeare_train.txt", 
    "valid": "shakespeare_valid.txt", 
    "text": "shakespeare_test.txt"
}

# we create a vocabulary dict of the form {token: ID}
shakespeare_vocab = {}
for text_file in shakespeare_splits.values():
    with open(text_file) as f:
        split_text = f.read()
        tokenized_split = tokenize(split_text)
        for token in tokenized_split:
            if token not in shakespeare_vocab:
                shakespeare_vocab[token] = len(shakespeare_vocab)
# we add the padding symbol to our vocabulary
shakespeare_vocab["<s>"] = len(shakespeare_vocab)
print("Size of vocabulary:", len(shakespeare_vocab))

# we turn our splits into sequences of token IDs
shakespeare_splits_ids = ddict(list)
for split_id, split_file in shakespeare_splits.items():
    with open(split_file) as f:
            tokenized_split = tokenize(f.read())
    for token in tokenized_split:
        shakespeare_splits_ids[split_id].append(shakespeare_vocab[token])


Size of vocabulary: 29245


### Question (a)

In [5]:
from torch import nn

class NeuralLM(nn.Module):

    def __init__(self, 
                 vocabulary_size, 
                 embedding_size,
                 max_input_length,
                 padding_token_id,
                 hidden_layer_sizes,
                 hidden_layer_activation="tanh"):
        """
        Neural language model.

        Args:
            vocabulary_size (int): size of vocabulary
            embedding size (int): size of input tokens
            max_input_length (int): max. number of input tokens
            padding_token_id (int): id of token to use for left padding
            hidden_layer_sizes (list): list of hidden layer sizes, e.g. [10, 5]
            hidden_layer_activation (string): activation, e.g. sigmoid, tanh
        """
        super().__init__()

        # set hyperparameters
        self._max_input_length = max_input_length
        self._pad_token_id = torch.tensor(padding_token_id).to(DEVICE)
        if hasattr(torch, hidden_layer_activation):
            self._activation = getattr(torch, hidden_layer_activation)
        else:
            raise ValueError("Activation must be a torch-supported function.")

        # create embedding matrix
        self._embeddings = nn.Embedding(vocabulary_size, embedding_size)

        # create hidden layers    
        hidden_layers = []
        input_size = embedding_size * max_input_length
        for output_size in hidden_layer_sizes:
            hidden_layers.append(
                nn.Linear(input_size, output_size).to(DEVICE)
                )
            input_size = output_size
        # see why we require a ModuleList: 
        # https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html
        self._hidden_layers = nn.ModuleList(hidden_layers)
        
        # create output layer (no need for softmax, we'll do that with the loss)
        self._output_layer = nn.Linear(output_size, vocabulary_size)

    def forward(self, input_ids):
        """
        Forward pass.

        Args:
            seq_indices (tensor): tensor of token IDs of size 
            (batch_size, max_input_length)

        Return:
            out (tensor): tensor of size [len(input_ids), vocabulary_size]
        """

        # pad to the left
        num_pads = self._max_input_length - input_ids.size()[1]
        padding_tensor = self._pad_token_id.expand(input_ids.size()[0])
        padding_tensor = torch.unsqueeze(padding_tensor, 1).expand(-1, num_pads)
        input_ids = torch.cat(
            [padding_tensor, input_ids],
            dim=1
        )

        ### WRITE YOUR CODE HERE ###

        # embed input sequence  
        # input_ids should be batch_size * max_input_length, but we need to
        # get the embeddings for each token in every sequence in the batch.
        # So, we first flatten the input_ids, use the flattened indices to get
        # the corresponding embeddings for every word in every example in the 
        # batch, then reshape back to batch_size * (emb_size * input_length)
        embs = self._embeddings(
                input_ids.view(-1)
            ).view(input_ids.size()[0], -1)
        out = embs
        # compute hidden layers
        for hidden_layer in self._hidden_layers:
            out = hidden_layer(out)
            out = self._activation(out)
        # compute output layer
        out = self._output_layer(out)

        return out
        

In [6]:
# test your NeuralLM
embedding_size = 16
hidden_layer_size = 32
max_input_length = 64
neural_lm_1 = NeuralLM(
    len(shakespeare_vocab),
    embedding_size=embedding_size,
    max_input_length=max_input_length,
    padding_token_id=shakespeare_vocab["<s>"],
    hidden_layer_sizes=[hidden_layer_size]
).to(DEVICE)
print(neural_lm_1)

# output should be:
# NeuralLM(
#   (_embeddings): Embedding(29245, 16)
#   (_output_layer): Linear(in_features=32, out_features=29245, bias=True)
# )

NeuralLM(
  (_embeddings): Embedding(29245, 16)
  (_hidden_layers): ModuleList(
    (0): Linear(in_features=1024, out_features=32, bias=True)
  )
  (_output_layer): Linear(in_features=32, out_features=29245, bias=True)
)


### Question (b)

In [7]:
# create torch dataset class
from torch.utils.data import Dataset

class SelfSupervisedTextDataset(Dataset):

    def __init__(self, tokenized_text, example_length):
        """
        Dataset to process text examples constructed with self-supervision.

        Args:
            tokenized_text (string): list of tokens to construct examples
            example_length (int): length of inputs strings for model
        """

        # we divide tokenized text into subsequences of (equal) example_length
        # we ignore leftover tokens at the end
        self._examples = []

        ### WRITE YOUR CODE HERE ###

        for i in range(0, len(tokenized_text), example_length):
            self._examples.append(tokenized_text[i:i + example_length])
        if len(self._examples[-1]) < example_length:
               self._examples.pop()

    def __len__(self):
        return len(self._examples)
    
    def __getitem__(self, idx):
        return self._examples[idx]


In [8]:
# create shakespeare dataset
training_dataset = SelfSupervisedTextDataset(shakespeare_splits_ids["train"], 
                                             max_input_length)
print("Num examples:", len(training_dataset))
print("Example length:", len(training_dataset[0]))
# output should be:
# Num examples: 16824
# Example length: 64

Num examples: 16824
Example length: 64


In [9]:
# create data loader
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    Function to construct labeled examples from given batch.

    Args:
        batch (tensor): tensor of size batch_size x sequence_length with tokens
    """

    # we create two lists for our training examples: inputs and corresponding 
    # targets    
    inputs = []
    targets = []

    ### WRITE YOUR CODE HERE ###

    for example in batch:
        inputs.append(torch.tensor(example[:-1]))
        targets.append(torch.tensor(example[-1]))

    return torch.stack(inputs, dim=0), torch.stack(targets, dim=0)

In [10]:
# test your collate_fn
# we create a toy batch of sequence_length = 4
toy_batch = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
print(collate_fn(toy_batch))
# output should be: 
# (tensor([[1, 2, 3],
#          [5, 6, 7]]), tensor([4, 8]))

(tensor([[1, 2, 3],
        [5, 6, 7]]), tensor([4, 8]))


  inputs.append(torch.tensor(example[:-1]))
  targets.append(torch.tensor(example[-1]))


In [11]:
# test your dataloader
batch_size = 128
training_dataloader = DataLoader(training_dataset, 
                                 collate_fn=collate_fn,
                                 batch_size=batch_size, 
                                 shuffle=True, 
                                 num_workers=0)
print(training_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x73d3a495ae20>


### Question (c)

In [12]:
# write training loop
import time, math

def train(model, dataloader, num_epochs=10, print_batch_stats=False, rnn=False):
    """
    Training loop

    Args:
        model: some LM implemented in PyTorch
        dataloader: dataloader that returns sentences as examples
        num_epochs (int): number of epochs to train 
    """

    # set training hyperparameters
    loss_fn = nn.CrossEntropyLoss()
    learning_rate = 0.1  # warm up and scheduler are tricks for this
    optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)

    # set model to train mode
    model.train()

    ### WRITE YOUR CODE HERE ###

    # we iterate over epochs
    num_batches = len(dataloader)
    for epoch in range(num_epochs):
        total_loss = 0.
        start_time = time.time()

        # we iterate over batches
        for batch_num, batch in enumerate(dataloader):

            # get inputs and targets from batch
            inputs = batch[0].to(DEVICE)
            targets = batch[1].to(DEVICE)

            # forward pass
            if not rnn:
                output = model(inputs)
                loss_value = loss_fn(output, targets)
            else:
                hidden = model.init_hidden(len(inputs))
                output, hidden = model(inputs, hidden)
                # output is [batch_size, input_length, embedding_size)
                # targets is [batch_size, input_length]
                # we need to match both to compute the loss
                loss_value = loss_fn(
                    output.view(targets.size()[0] * targets.size()[1], -1), 
                    targets.view(-1)
                )

            # backward pass
            # PyTorch sums up gradients that are computed in sequence
            # so unless we "erase" those gradients after every update, 
            # we will backpropagate through different batches
            # so we set them to zero every time
            # higher level libraries will allow you to control how often this
            # zero_grad function is used with the parameters accumulate_grads
            # e.g. accumulate_grads = 3 means that we zero_grad every 3 batches
            optimizer.zero_grad()
            # here we compute gradients
            loss_value.backward()
            # here we update model weights
            optimizer.step()

            total_loss += loss_value.item()

            # log batch stats    
            if print_batch_stats:
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                      f"| Loss {loss_value:6.4f} "
                      f"| Batch PPL {math.exp(loss_value):8.2f}")
        
        # compute avg loss per batch
        avg_loss = total_loss / num_batches
        
        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute epoch time
        epoch_time = time.time() - start_time

        # log epoch stats
        print(
            f"| Epoch {epoch+1:2d}/{num_epochs:2d} | Epoch Time {epoch_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} | PPL {ppl:8.2f}"
        )

        # reset total loss and timer
        total_loss = 0.
        start_time = time.time()


In [13]:
# test your training loop (loss and ppl should go down)
train(neural_lm_1, training_dataloader, num_epochs=20)

| Epoch  1/20 | Epoch Time 4.976456 | Avg Loss 7.3138 | PPL  1500.85
| Epoch  2/20 | Epoch Time 4.854805 | Avg Loss 6.0851 | PPL   439.25
| Epoch  3/20 | Epoch Time 4.461584 | Avg Loss 5.5492 | PPL   257.04
| Epoch  4/20 | Epoch Time 4.515143 | Avg Loss 5.1215 | PPL   167.59
| Epoch  5/20 | Epoch Time 4.641646 | Avg Loss 4.7727 | PPL   118.24
| Epoch  6/20 | Epoch Time 4.664999 | Avg Loss 4.4762 | PPL    87.90
| Epoch  7/20 | Epoch Time 4.635690 | Avg Loss 4.2159 | PPL    67.76
| Epoch  8/20 | Epoch Time 4.476098 | Avg Loss 3.9929 | PPL    54.21
| Epoch  9/20 | Epoch Time 4.640388 | Avg Loss 3.8011 | PPL    44.75
| Epoch 10/20 | Epoch Time 4.678479 | Avg Loss 3.6323 | PPL    37.80
| Epoch 11/20 | Epoch Time 4.674920 | Avg Loss 3.4835 | PPL    32.57
| Epoch 12/20 | Epoch Time 4.509139 | Avg Loss 3.3527 | PPL    28.58
| Epoch 13/20 | Epoch Time 4.645756 | Avg Loss 3.2357 | PPL    25.42
| Epoch 14/20 | Epoch Time 4.699732 | Avg Loss 3.1299 | PPL    22.87
| Epoch 15/20 | Epoch Time 4.64532

### Question (d)

In [14]:
# evaluate 
def evaluate(model, dataloader, print_batch_stats=False, rnn=False):
    """
    Evaluate model on given dataset.

    Args:
        model: some LM implemented in PyTorch
        dataloader: dataloader that returns sentences as examples
    """

    # we use cross entropy so we can compute perplexity from this
    # we sum loss up, to then divide by number of examples
    loss_fn = nn.CrossEntropyLoss(reduction="sum")

    # set model to eval mode (turns off dropout, etc.)
    model.eval()

    num_batches = len(dataloader)
    num_examples = 0
    total_loss = 0.

    ### WRITE YOUR CODE HERE ###

    with torch.no_grad():
        # we iterate over batches
        start_time = time.time()
        for batch_num, (inputs, targets) in enumerate(dataloader):

            # move inputs and targets to device
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)

            # add up number of examples
            num_examples += len(inputs)

            # compute loss
            if not rnn:
                output = model(inputs)
                loss_value = loss_fn(output, targets)
            else:
                hidden = model.init_hidden(len(inputs))
                output, hidden = model(inputs, hidden)
                # output is [batch_size, input_length, embedding_size)
                # targets is [batch_size]
                # we want only predictions for last word
                loss_value = loss_fn(
                    output[:, -1, :],
                    targets.view(-1)
                )

            # add up loss
            total_loss += loss_value.item()

            # log batch stats    
            if print_batch_stats:
                print(f"| Batch {batch_num+1:6d}/{num_batches:6d} "
                        f"| Loss {loss_value:6.4f} "
                        f"| Batch PPL {math.exp(loss_value):8.2f}")
        
        # compute avg loss per batch
        avg_loss = total_loss / num_examples
        
        # compute perplexity where avg loss is likelihood (empirical risk)
        ppl = math.exp(avg_loss)

        # compute epoch time
        total_time = time.time() - start_time

        # log epoch stats
        print(
            f"| Run Time {total_time:5f} "
            f"| Avg Loss {avg_loss:6.4f} "
            f"| PPL {ppl:8.2f}"
        )


In [15]:
# create validation dataset
validation_dataset = SelfSupervisedTextDataset(shakespeare_splits_ids["valid"], 
                                               max_input_length)
print(validation_dataset)

<__main__.SelfSupervisedTextDataset object at 0x73d360cd7f40>


In [16]:
# create data loader for validation
batch_size = 128 
validation_dataloader = DataLoader(validation_dataset, 
                                 collate_fn=collate_fn,
                                 batch_size=batch_size, 
                                 shuffle=True, 
                                 num_workers=0)

In [17]:
# evaluate FNN
evaluate(neural_lm_1, validation_dataloader, rnn=False)

| Run Time 0.159164 | Avg Loss 8.7676 | PPL  6422.75


## 3. Language Models with RNNs

### Question (a)

In [19]:
# now an RNN-based language model

class RNNLM(nn.Module):

    def __init__(self,
                 vocabulary_size, 
                 embedding_size,
                 hidden_layer_size,
                 tie_weights=True,
                 num_hidden_layers=1,
                 hidden_layer_activation="tanh",
                 dropout_rate=0.0):
        """
        RNN-based language model.
        """

        super().__init__()
        self._num_layers = num_hidden_layers
        self._hidden_layer_size = hidden_layer_size
        self._embedding_size = embedding_size

        self._embeddings = nn.Embedding(vocabulary_size, embedding_size)
        self._rnn = nn.RNN(embedding_size, 
                           hidden_layer_size,
                           num_layers=num_hidden_layers, 
                           dropout=dropout_rate, 
                           nonlinearity=hidden_layer_activation,
                           batch_first=True)
        self.output_layer = nn.Linear(hidden_layer_size, vocabulary_size)

        if tie_weights:
            self._embeddings.weight = self.output_layer.weight

    def init_hidden(self, batch_size):
        """
        Initialize hidden states.

        Args:
            batch_size (int): batch size
        """

        hidden = torch.zeros(self._num_layers, 
                             batch_size, 
                             self._hidden_layer_size)

        return hidden.to(DEVICE)

    def forward(self, input_ids, hidden):
        """
        Forward pass.

        Args:
            input_ids (tensor): 3D tensors with batched data, or 2D tensor
            hidden (tensor): tensor with initial hidden states

        Return:
            out (tensor): tensor of size [batch size, seq length, vocab_size]
        """

        ### WRITE YOUR CODE HERE ###

        # embed input sequence (same as forward pass in NeuralLM)        
        embs = self._embeddings(
            input_ids.view(-1)).view(input_ids.size()[0], 
                                     input_ids.size()[1], 
                                     -1)
        # compute hidden states
        out, hidden_state = self._rnn(embs, hidden)          
        # compute output
        out = self.output_layer(out)
        
        return out, hidden_state
    

In [20]:
# test your RNN
embedding_size = 16
hidden_layer_size = 16
rnn_lm_1 = RNNLM(
    len(shakespeare_vocab),
    embedding_size=embedding_size,
    hidden_layer_size=hidden_layer_size,
).to(DEVICE)
print(rnn_lm_1)

# output should be:
# RNNLM(
#   (_embeddings): Embedding(29245, 16)
#   (_rnn): RNN(16, 16, batch_first=True)
#   (output_layer): Linear(in_features=16, out_features=29245, bias=True)
# )

RNNLM(
  (_embeddings): Embedding(29245, 16)
  (_rnn): RNN(16, 16, batch_first=True)
  (output_layer): Linear(in_features=16, out_features=29245, bias=True)
)


In [21]:
# we need a slightly different collate function for the RNN, if we want to train 
# with teacher forcing
def rnn_collate_fn(batch):
    """
    Function to construct labeled example from given batch.

    Args:
        batch (tensor): tensor of size batch_size x sentence_length with tokens
    """

    # we create two lists for our training examples: inputsand corresponding 
    # targets    
    inputs = []
    targets = []

    ### WRITE YOUR CODE HERE ###

    for example in batch:
        # RNN
        inputs.append(torch.tensor(example[:-1]))
        targets.append(torch.tensor(example[1:]))

    return torch.stack(inputs, dim=0), torch.stack(targets, dim=0)

In [22]:
# test your collate_fn
# we create a toy batch of sequence_length = 4
toy_batch = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])
print(rnn_collate_fn(toy_batch))

# output should be:
# (tensor([[1, 2, 3],
#          [5, 6, 7]]), 
#  tensor([[2, 3, 4],
#          [6, 7, 8]]))

(tensor([[1, 2, 3],
        [5, 6, 7]]), tensor([[2, 3, 4],
        [6, 7, 8]]))


  inputs.append(torch.tensor(example[:-1]))
  targets.append(torch.tensor(example[1:]))


In [23]:
# create a new dataloader with this new collate function
rnn_training_dataloader = DataLoader(training_dataset, 
                                 collate_fn=rnn_collate_fn,
                                 batch_size=batch_size, 
                                 shuffle=True, 
                                 num_workers=0)
print(rnn_training_dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x73d36038b310>


In [24]:
# train your RNN (DO NOT RUN THIS DURING CLASS!)
train(rnn_lm_1, rnn_training_dataloader, num_epochs=1, rnn=True)

| Epoch  1/ 1 | Epoch Time 343.757031 | Avg Loss 6.1271 | PPL   458.09


### Question (b) 

In [27]:
# evaluate RNN
# we use the same dataloader as with our NeuralLM model, so performance is 
# comparable, but the evaluate function must be able to handle both models
evaluate(rnn_lm_1, validation_dataloader, rnn=True)

| Run Time 3.508857 | Avg Loss 5.8659 | PPL   352.81


### Question (c)

In [28]:
# construct reverse dict to decode text generated with our RNN
reverse_shapeskeare_vocab = {}
for k, v in shakespeare_vocab.items():
    reverse_shapeskeare_vocab[v] = k

In [29]:
# generate text with RNN
def generate_text_with_rnn(num_tokens, 
                           context, 
                           model, 
                           temperature, 
                           decoding_dict):
    """
    Generate num_tokens given context and model.

    Args:
        num_tokens (int): number of tokens to be generated
        context (tensor): sequence of n token IDs in tensor of size [1, n]
        model (RNNLM): RNN model
        temperature (float): softmax temperature
        decoding_dict (dict): dict of the form {token_id: token}
    """

    predictions = []
    context = torch.unsqueeze(context, 0)
    with torch.no_grad():

        ### WRITE YOUR CODE HERE ###

        for _ in range(num_tokens):
            # batch size = 1 in this case
            hidden = model.init_hidden(1)
            output, hidden = model(context, hidden)
            # output is [batch_size, input_length, embedding_size)
            # we want only predictions for last word
            # so we apply softmax to logits of output corresponding to last word
            probs = torch.softmax(output[:, -1, :] / temperature, dim=-1)
            # get next word prediction from model's softmax distribution
            next_word = torch.multinomial(probs, num_samples=1).item()
            predictions.append(next_word)
            # to ensure autoregressive generation, we add the predicted word
            # to the context
            context = torch.cat(
                (context, torch.tensor([next_word]).unsqueeze(0)), 
                dim=1
                )

    # decode predictions
    output_tokens = []
    for pred in predictions:
        output_tokens.append(decoding_dict[pred])

    return " ".join(output_tokens)

In [40]:
# test text generation

# construct context
context_ids = [10, 11]
# context_ids = [110]
context = torch.tensor(context_ids)
# print context
print("PROMPT:")
for id in context_ids:
    print(reverse_shapeskeare_vocab[id])
print()

# generate!
# play around with temperature, higher values make distribution more uniform
# lower values puts more mass on already probably events
num_tokens = 20
temperature = .2
print("GENERATED TEXT:")
generate_text_with_rnn(num_tokens, 
                       context, 
                       rnn_lm_1, 
                       temperature, 
                       reverse_shapeskeare_vocab)


PROMPT:
we
desire

GENERATED TEXT:


", and , I will be the ' d , and a good , and a lord . I have"

### Question (d)

In [41]:
# here's a modified version of the generated_text_with_rnn function. It now
# takes as input a sample function and a value for k.
def generate_text_with_rnn_2(num_tokens, 
                           context, 
                           model, 
                           temperature, 
                           decoding_dict,
                           sampling_fn,
                           k):
    """
    Generate num_tokens given context and model.

    Args:
        num_tokens (int): number of tokens to be generated
        context (tensor): sequence of n token IDs in tensor of size [1, n]
        model (RNNLM): RNN model
        temperature (float): softmax temperature
        decoding_dict (dict): dict of the form {token_id: token}
        sampling_fn (callable): function to produce single sample given 
                                distribution
        k (int): number of top-k elements in distribution to sample from
    """

    predictions = []
    context = torch.unsqueeze(context, 0)
    with torch.no_grad():

        ### WRITE YOUR CODE HERE ###

        for _ in range(num_tokens):
            # batch size = 1 in this case
            hidden = model.init_hidden(1)
            output, hidden = model(context, hidden)
            # output is [batch_size, input_length, embedding_size)
            # we want only predictions for last word
            # we don't apply softmax here, as the sample function will do that
            logits = output[:, -1, :]
            # get next word prediction using given sample function
            next_word = sampling_fn(logits, k, temperature)
            predictions.append(next_word)
            # to ensure autoregressive generation, we add the predicted word
            # to the context
            context = torch.cat(
                (context, torch.tensor([next_word]).unsqueeze(0)), 
                dim=1
                )

    # decode predictions
    output_tokens = []
    for pred in predictions:
        output_tokens.append(decoding_dict[pred])

    return " ".join(output_tokens)

In [42]:
# top-k sampling (gives us greedy with k = 1 and random with k = |V|)
def topk_sampling(logits, k, temperature):
    """
    Top-k sampling, we get greedy sampling with k = 1 and random sampling with 
    k = |V|.

    Args:
        logits (tensor): tensor of unnormalized probabilities
        k (int): number of top-k elements in distribution to sample from
        temperature (float): softmax temperature
    """

    ### WRITE YOUR CODE HERE ###

    # get top-k elements
    topk = torch.topk(logits, k)
    # turn top-k elements into normalized distribution
    probs = torch.softmax(topk.values / temperature, dim=-1)

    # retun sample from top-k distribution
    return torch.multinomial(probs, num_samples=1).item()

In [52]:
# test text generation now with different sampling approaches

# construct context
context_ids = [10, 11]
# context_ids = [10, 11, 110]
# context_ids = [11]
# context_ids = [110]
context = torch.tensor(context_ids)
# print context
print("PROMPT:")
for id in context_ids:
    print(reverse_shapeskeare_vocab[id])
print()

# generate!
# in addition to temperature, play around with different values of k
num_tokens = 20
temperature = 0.9
k = 100
print("GENERATED TEXT:")
generate_text_with_rnn_2(num_tokens, 
                       context, 
                       rnn_lm_1, 
                       temperature, 
                       reverse_shapeskeare_vocab,
                       topk_sampling,
                       k
                       )


PROMPT:
we
desire

GENERATED TEXT:


"should SONNETS 1609 where 1609 thy thy Thou From 1 1609 fairest contracted ' s fairest SONNETS 1 1609 THE"

### Question (e)

In [46]:
# test your RNN
embedding_size = 16
hidden_layer_size = 16
rnn1 = RNNLM(
    len(shakespeare_vocab),
    embedding_size=embedding_size,
    hidden_layer_size=hidden_layer_size,
).to(DEVICE)
print(rnn1)

# output should be:
# RNNLM(
#   (_embeddings): Embedding(29245, 16)
#   (_rnn): RNN(16, 16, batch_first=True)
#   (output_layer): Linear(in_features=16, out_features=29245, bias=True)
# )

RNNLM(
  (_embeddings): Embedding(29245, 16)
  (_rnn): RNN(16, 16, batch_first=True)
  (output_layer): Linear(in_features=16, out_features=29245, bias=True)
)


In [48]:
# let's train longer to check convergence
train(rnn1, rnn_training_dataloader, num_epochs=20, rnn=True)

| Epoch  1/20 | Epoch Time 5.486800 | Avg Loss 6.1755 | PPL   480.84
| Epoch  2/20 | Epoch Time 5.453655 | Avg Loss 5.6731 | PPL   290.92
| Epoch  3/20 | Epoch Time 5.457234 | Avg Loss 5.5263 | PPL   251.22
| Epoch  4/20 | Epoch Time 5.454179 | Avg Loss 5.4471 | PPL   232.09
| Epoch  5/20 | Epoch Time 5.429858 | Avg Loss 5.3950 | PPL   220.31
| Epoch  6/20 | Epoch Time 5.437742 | Avg Loss 5.3579 | PPL   212.28
| Epoch  7/20 | Epoch Time 5.460291 | Avg Loss 5.3294 | PPL   206.32
| Epoch  8/20 | Epoch Time 5.425132 | Avg Loss 5.3069 | PPL   201.72
| Epoch  9/20 | Epoch Time 5.442552 | Avg Loss 5.2886 | PPL   198.06
| Epoch 10/20 | Epoch Time 5.456953 | Avg Loss 5.2739 | PPL   195.18
| Epoch 11/20 | Epoch Time 5.462910 | Avg Loss 5.2597 | PPL   192.42
| Epoch 12/20 | Epoch Time 5.552658 | Avg Loss 5.2494 | PPL   190.46
| Epoch 13/20 | Epoch Time 5.555686 | Avg Loss 5.2393 | PPL   188.54
| Epoch 14/20 | Epoch Time 5.550479 | Avg Loss 5.2301 | PPL   186.81
| Epoch 15/20 | Epoch Time 5.53464

In [49]:
# evaluate RNN
evaluate(rnn1, validation_dataloader, rnn=True)

| Run Time 0.056986 | Avg Loss 5.8776 | PPL   356.96
