## Neural and N-Gram Language Model for CPSC 503 Assignment 2
#### The neural model notebook is modified from Yunjey Choi's Github repository - pytorch-tutorial.
#### Familiarize yourself with pytorch, start with: https://pytorch.org/tutorials/beginner/basics/intro.html

#### The N-gram model notebook is from Josh Loehr's Github repository - ngram-language-model.
#### https://github.com/joshualoehr/ngram-language-model

#### ========================================================================================

### Let's load a number of dependencies:

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import numpy as np
import os
import math
from collections import defaultdict
from tqdm import tqdm

# Select the device that tensors and the model are moved to:
# CUDA when PyTorch reports an available GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Here are two classes needed for the data loading and formatting:

In [2]:
class Dictionary(object):
    """Two-way vocabulary: word -> integer id and integer id -> word.

    Ids are handed out in first-seen order, starting from 0.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        # id that the next previously-unseen word will receive
        self.idx = 0

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def return_dict(self):
        """Return the id -> word mapping (the live dict, not a copy)."""
        return self.idx2word

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.word2idx)


class Corpus(object):
    """Load a text corpus and convert it into the n-gram input format of the language models."""

    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, n_gram=2):
        """Read ``path`` (one sentence per line) and return its n-gram index tensors.

        Each line is whitespace-tokenized, prefixed with ``n_gram - 1`` ``<start>``
        tokens and suffixed with one ``<end>`` token. Every token (including those
        of skipped lines) is registered in ``self.dictionary``. Lines without any
        word are skipped, so they never produce an entry in the result.

        Args:
            path: Path of the text file to read.
            n_gram: Size of the sliding window (2 = bigram, 3 = trigram, ...).

        Returns:
            A list with one ``torch.LongTensor`` per kept line, of shape
            ``(num_windows, n_gram)``; row ``i`` holds the ids of tokens
            ``i .. i + n_gram - 1`` of the padded sentence.
        """
        sample_list = []
        with open(path, 'r') as f:
            for line in f:
                # add <start> tokens based on the number of n-grams.
                words = ['<start>'] * (n_gram - 1) + line.split() + ['<end>']
                for word in words:
                    self.dictionary.add_word(word)
                # A padded line of length n_gram contains only padding (blank line).
                # The same test decides both whether a sentence is kept and whether
                # it is encoded, so the two can never get out of step.
                if len(words) > n_gram:
                    sample_list.append(words)

        # Map every kept sentence to its id sequence, then slide a window of
        # n_gram ids over it to build the tensor consumed by the models.
        word2idx = self.dictionary.word2idx
        ids_list = []
        for words in sample_list:
            flat_ids = [word2idx[word] for word in words]
            windows = [flat_ids[i:i + n_gram] for i in range(len(flat_ids) - (n_gram - 1))]
            ids_list.append(torch.LongTensor(windows))
        return ids_list

### Hyper-parameters for both Language Models: 

In [3]:
# Bigram model (n_gram = 2); set n_gram = 3 for a trigram model
n_gram = 2

# m_gram is the number of preceding/conditioning words
m_gram = n_gram - 1

### Load the "Penn Treebank" dataset and split it into train/dev/test

In [4]:
corpus = Corpus()
#ids = corpus.get_data('data/train_mini.txt', n_gram)
ids = corpus.get_data('data/train.txt', n_gram)


# Use 75% for training, 15% for development, and the remaining ~10% for testing.
# The split is by sentence, in file order (no shuffling).
n_train = round(len(ids) * .75)
n_dev = round(len(ids) * .15)

train_ids = ids[:n_train]
dev_ids = ids[n_train:n_train + n_dev]
test_ids = ids[n_train + n_dev:]

print(f"Number of sentences: {len(train_ids)} train, {len(dev_ids)} dev, {len(test_ids)} test")
vocab_size = len(corpus.dictionary)
print(f"Vocab size: {vocab_size}")

Number of sentences: 31551 train, 6310 dev, 4207 test
Vocab size: 10001


### The class of count-based language model:

In [5]:
class CountLM(object):
    """Count-based n-gram language model with add-k (Laplace) smoothing."""

    def __init__(self, vocab_size, x_n, x_m, laplace=1):
        """
        Args:
            vocab_size: Number of word types; the V of the smoothed denominator.
            x_n: Iterable of per-sentence tensors whose rows are n-grams of word ids.
            x_m: Iterable of per-sentence tensors whose rows are the conditioning
                (n-1)-grams of word ids.
            laplace: Pseudo-count added to every n-gram count (1 = add-one).
        """
        self.vocab_size = vocab_size
        self.laplace = laplace

        # Dictionaries for tracking the count of n-grams
        # NOTE(review): context counts come solely from x_m. A context that never
        # appears as a row of x_m (e.g. one made only of padding tokens, if x_m was
        # built with less padding than x_n) is treated as count 0 -- verify against
        # the caller.
        self.n_gram_count = self.count_ngrams(x_n)
        self.m_gram_count = self.count_ngrams(x_m)

    def count_ngrams(self, x):
        """
        Return a dict mapping each n-gram (as a tuple of ids) to its number of
        occurrences over all rows of all tensors in ``x``.
        """
        count_list = defaultdict(int)
        for example in x:
            for n_gram in example.tolist():
                count_list[tuple(n_gram)] += 1
        return count_list

    def compute_mle(self, n_gram):
        """
        Compute the estimate of P(w_n|w_{n-1}, ...) with add-k Laplacian smoothing:
        (count(n_gram) + k) / (count(context) + k * V).

        Please see chapter 3.5.1 of J&M 3rd Ed. for more information

        Args:
            n_gram: Tuple of word ids; the last id is the predicted word and the
                preceding ids are its context.

        Returns:
            The smoothed conditional probability as a float.
        """
        # Use .get() rather than indexing: the count tables are defaultdicts, and
        # indexing would insert a zero-count entry for every unseen n-gram queried,
        # silently growing (and altering) the trained model during evaluation.
        n_count = self.n_gram_count.get(n_gram, 0)
        m_count = self.m_gram_count.get(n_gram[:-1], 0)
        return (n_count + self.laplace) / (m_count + self.laplace * self.vocab_size)


### Train the n-gram model based on MLE

In [6]:
#m_gram_train_ids = corpus.get_data('data/train_mini.txt', m_gram)[:-200]
# Context (m-gram) windows of the same training sentences, paired with the
# n-gram windows already held in train_ids.
m_gram_train_ids = corpus.get_data('data/train.txt', m_gram)[:n_train]
n_gram_train_ids = train_ids

# Fit the count tables on the training split
count_model = CountLM(vocab_size, n_gram_train_ids, m_gram_train_ids)

# Per-sentence perplexity = exp(mean negative log-probability of its n-grams);
# the reported number is the average of these over all training sentences.
train_ppl = 0
for sentence in train_ids:
    probabilities = [count_model.compute_mle(tuple(gram)) for gram in sentence.tolist()]
    perplexity = np.exp(sum(-np.log(probabilities)) / len(sentence))
    train_ppl += perplexity

print('The average training perplexity for count-based LM: '+str(train_ppl/len(train_ids)))

The average training perplexity for count-based LM: 872.2299472899585


In [7]:
# Per-sentence perplexity of the count-based LM on the test split, followed by
# the average over all test sentences.
test_ppl = 0
for i, sentence in enumerate(test_ids):
    probabilities = [count_model.compute_mle(tuple(gram)) for gram in sentence.tolist()]
    perplexity = np.exp(sum(-np.log(probabilities)) / len(sentence))
    print('Perplexity for test sample '+str(i)+' :', perplexity)
    test_ppl += perplexity

print('The average testing perplexity for count-based LM: '+str(test_ppl/len(test_ids)))

Perplexity for test sample 0 : 481.93397924150236
Perplexity for test sample 1 : 1791.0646372289625
Perplexity for test sample 2 : 295.4281859753384
Perplexity for test sample 3 : 336.8972086530059
Perplexity for test sample 4 : 188.3063912073697
Perplexity for test sample 5 : 1580.4694541715269
Perplexity for test sample 6 : 908.3518812063347
Perplexity for test sample 7 : 587.567516576945
Perplexity for test sample 8 : 466.1199712311848
Perplexity for test sample 9 : 710.2569626653177
Perplexity for test sample 10 : 3201.2726581592956
Perplexity for test sample 11 : 731.7403780164503
Perplexity for test sample 12 : 902.777634164664
Perplexity for test sample 13 : 2107.650302714663
Perplexity for test sample 14 : 867.2708592238805
Perplexity for test sample 15 : 1282.4375182196761
Perplexity for test sample 16 : 516.5791519391194
Perplexity for test sample 17 : 1637.3482160822728
Perplexity for test sample 18 : 891.4693914511101
Perplexity for test sample 19 : 1841.6775228827773
Perpl

Perplexity for test sample 1532 : 299.9255184957937
Perplexity for test sample 1533 : 827.8076804668558
Perplexity for test sample 1534 : 705.3064289852103
Perplexity for test sample 1535 : 663.849079561899
Perplexity for test sample 1536 : 711.7755184372891
Perplexity for test sample 1537 : 2288.6995492379706
Perplexity for test sample 1538 : 402.1920779200347
Perplexity for test sample 1539 : 374.4837679885483
Perplexity for test sample 1540 : 301.62385188074904
Perplexity for test sample 1541 : 268.5909645429137
Perplexity for test sample 1542 : 480.63808049208467
Perplexity for test sample 1543 : 1405.4494167726066
Perplexity for test sample 1544 : 1233.6836836396255
Perplexity for test sample 1545 : 964.158641688546
Perplexity for test sample 1546 : 2632.0574769360564
Perplexity for test sample 1547 : 736.6576720700164
Perplexity for test sample 1548 : 768.7408603227124
Perplexity for test sample 1549 : 1089.5486884190204
Perplexity for test sample 1550 : 940.3460227573968
Perplex

Perplexity for test sample 1719 : 1610.5562179811188
Perplexity for test sample 1720 : 976.6789786030192
Perplexity for test sample 1721 : 1264.777146984313
Perplexity for test sample 1722 : 1054.3261279681315
Perplexity for test sample 1723 : 2075.2127262142326
Perplexity for test sample 1724 : 2172.520245880961
Perplexity for test sample 1725 : 1736.3325040703128
Perplexity for test sample 1726 : 1043.0389655890112
Perplexity for test sample 1727 : 1215.3594752405668
Perplexity for test sample 1728 : 2867.1898601133594
Perplexity for test sample 1729 : 1132.6420371681181
Perplexity for test sample 1730 : 1155.963543866034
Perplexity for test sample 1731 : 3626.5397264805056
Perplexity for test sample 1732 : 915.0132519441463
Perplexity for test sample 1733 : 1577.6316500799273
Perplexity for test sample 1734 : 1456.797368349581
Perplexity for test sample 1735 : 296.7899207543032
Perplexity for test sample 1736 : 389.04008265227964
Perplexity for test sample 1737 : 2002.0387874570877


Perplexity for test sample 3203 : 1066.9051709594128
Perplexity for test sample 3204 : 213.03255375713894
Perplexity for test sample 3205 : 188.68127805103896
Perplexity for test sample 3206 : 668.5470958926691
Perplexity for test sample 3207 : 49.5250665601804
Perplexity for test sample 3208 : 551.8774206459285
Perplexity for test sample 3209 : 1273.9607351336326
Perplexity for test sample 3210 : 349.93082810695375
Perplexity for test sample 3211 : 94.15180533040544
Perplexity for test sample 3212 : 722.0812189465628
Perplexity for test sample 3213 : 61.70886482940158
Perplexity for test sample 3214 : 1275.3574016951318
Perplexity for test sample 3215 : 249.5157749759525
Perplexity for test sample 3216 : 650.2217474816422
Perplexity for test sample 3217 : 333.3065764268792
Perplexity for test sample 3218 : 1745.8192584724918
Perplexity for test sample 3219 : 2341.2912998599363
Perplexity for test sample 3220 : 819.823846339079
Perplexity for test sample 3221 : 3316.222490281745
Perple

### Hyper-parameters for the neural language model

In [8]:
# FOR THE NEURAL MODEL
# Dimensionality of each word embedding vector
embed_size = 128
# Width of the hidden (intermediate) linear layer
intermediate_size = 1024
# Number of full passes over the training n-grams
num_epochs = 2
# Learning rate handed to the optimizer
learning_rate = 2e-3

### The class of neural language model:

In [9]:
class NeuralLM(nn.Module):
    """Feed-forward n-gram language model.

    The context word ids are embedded, the embeddings are concatenated, passed
    through one hidden linear layer with ReLU, and projected to one score per
    vocabulary entry.
    """

    def __init__(self, vocab_size, embed_size, intermediate_size, m_gram):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.intermediate = nn.Linear(m_gram * embed_size, intermediate_size)
        self.final = nn.Linear(intermediate_size, vocab_size)

    def forward(self, x):
        """Map a 2-D tensor of context word ids to unnormalized vocabulary scores."""
        embedded = self.embed(x)
        # Concatenate the embeddings of all context positions of each example
        batch, context_len, dim = embedded.shape
        flat = embedded.view(batch, context_len * dim)
        hidden = F.relu(self.intermediate(flat))
        return self.final(hidden)

In [None]:
# Build the model and move it to the selected device. The loss is cross-entropy
# over the vocabulary scores; the optimizer is SGD with momentum 0.9.
neural_model = NeuralLM(vocab_size, embed_size, intermediate_size, m_gram).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(neural_model.parameters(), lr=learning_rate, momentum=0.9)

### Train the neural model

In [None]:
# Reduce batch size if you are running out of memory
batch_size = 64
training_data = torch.cat(train_ids, dim=0)
neural_model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for i in tqdm(range(0, len(training_data), batch_size)):
        batch = training_data[i:i + batch_size]
        inputs = batch[:, 0:n_gram-1].to(device)
        targets = batch[:, n_gram-1:].to(device)
        
        # Forward pass
        outputs = neural_model(inputs)
        loss = criterion(outputs, targets.reshape(-1))
        total_loss += loss.item();
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    
    # Calculate the performance (perplexity) of the current trained model on dev set.
    total_ppl = 0
    for i in range(0, len(dev_ids)):
        dev_inputs = dev_ids[i][:, 0:n_gram-1].to(device)
        dev_targets = dev_ids[i][:, n_gram-1:].to(device)
        dev_outputs = neural_model(dev_inputs)
        ce = criterion(dev_outputs, dev_targets.reshape(-1))
        total_ppl += np.exp(ce.item());
    
    print ('Epoch [{}/{}], Training Loss: {:.4f}, Dev Perplexity: {:5.2f}'
        .format(epoch + 1, num_epochs, total_loss/len(train_ids), total_ppl/len(dev_ids)))

### Save the trained model

In [None]:
# Serializes the module object itself (not only its weights) to 'model.ckpt'
torch.save(neural_model, 'model.ckpt')

### Neural Model testing

In [None]:
# 'model.ckpt' holds a fully pickled module written by this notebook (trusted).
# - map_location=device lets a checkpoint saved on GPU be loaded on a CPU-only machine.
# - weights_only=False is required because recent PyTorch versions refuse to
#   unpickle arbitrary objects (such as a whole nn.Module) by default.
neural_model = torch.load('model.ckpt', map_location=device, weights_only=False)
neural_model.eval()
test_ppl = 0
with torch.no_grad():
    for i in range(0, len(test_ids)):
        # First n_gram-1 columns are the context, the last column is the gold word
        inputs = test_ids[i][:, 0:n_gram-1].to(device)
        gold = test_ids[i][:, n_gram-1:].to(device)
        output = neural_model(inputs)
        # Per-sentence perplexity = exp(mean cross-entropy over its n-grams)
        cross_entropy = criterion(output, gold.reshape(-1)).item()
        print('Perplexity for test sample '+str(i)+' :', np.exp(cross_entropy))
        test_ppl += np.exp(cross_entropy)
    print('The average testing perplexity for neural LM: '+str(test_ppl/len(test_ids)))

### Text generation with n-gram language models

In [None]:
np.random.seed(100)
start_tokens = ['we', 'a', 'they']


print('Generating from count-based language model:')
# Index the n-gram table once by context -> [(next word id, count), ...] instead
# of rescanning every n-gram for each generated token. Insertion order is kept,
# so candidates appear in the same order as a full scan would produce.
successors = defaultdict(list)
for n_gram_ids, count in count_model.n_gram_count.items():
    successors[n_gram_ids[:-1]].append((n_gram_ids[-1], count))

for token in start_tokens:

    sentence = [token]
    token_id = corpus.dictionary.word2idx[token]

    # Context = (m_gram - 1) <start> paddings followed by the seed word
    context = tuple([corpus.dictionary.word2idx['<start>'] for _ in range(m_gram - 1)] + [token_id])
    while sentence[-1] != '<end>':
        filtered_ngrams = successors.get(context, [])
        candidates = [x[0] for x in filtered_ngrams]
        total_count = sum([x[1] for x in filtered_ngrams])
        # Sample the next word proportionally to its (unsmoothed) count after this context
        probs = [x[1] / total_count for x in filtered_ngrams]

        sampled_id = np.random.choice(candidates, 1, p=probs)[0]
        sampled_token = corpus.dictionary.idx2word[sampled_id]
        sentence.append(sampled_token)

        # Slide the context window by one word
        context = context[1:] + (sampled_id,)
    print(' '.join(sentence[:-1]))


neural_model.eval()
print('\nGenerating from neural language model:')
# Sampling needs no gradients
with torch.no_grad():
    for token in start_tokens:

        sentence = [token]
        token_id = corpus.dictionary.word2idx[token]

        context = torch.LongTensor([[corpus.dictionary.word2idx['<start>'] for _ in range(m_gram - 1)] + [token_id]]).to(device)

        while sentence[-1] != '<end>':
            output = neural_model(context)
            softmax_output = F.softmax(output, dim=-1)
            prob = softmax_output.squeeze(0).tolist()
            # Renormalize so the probabilities sum to 1 closely enough for
            # np.random.choice; the total is computed once, not once per entry.
            prob_total = sum(prob)
            prob = [p / prob_total for p in prob]
            sampled_id = np.random.choice(output.shape[1], 1, p=prob)[0]
            sampled_token = corpus.dictionary.idx2word[sampled_id]
            sentence.append(sampled_token)
            # Drop the oldest context word and append the sampled one
            context = torch.cat([context[:, 1:], torch.LongTensor([[sampled_id]]).to(device)], dim=1)
        print(' '.join(sentence[:-1]))