# Neural Language Model for CPSC 503 Assignment 2
### This notebook is modified from Yunjey Choi's Github repository - pytorch-tutorial.
### pytorch-tutorial/tutorials/02-intermediate/language_model/
### https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/02-intermediate/language_model

### ========================================================================================

### First of all, let's load the dependencies:

In [1]:
import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_
import torch.nn.functional as F
import numpy as np
import os
import nltk

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ModuleNotFoundError: No module named 'torch'

### Here are two classes needed for data loading and formatting:

In [112]:
class Dictionary(object):
    """Two-way vocabulary mapping words to integer ids and back.

    Ids are handed out consecutively, starting at 0, in order of first
    insertion.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def return_dict(self):
        """Return the id -> word mapping (the live dict, not a copy)."""
        return self.idx2word

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)


class Corpus(object):
    """Reads a text file and turns every line into a tensor of n-gram windows.

    Words are mapped to integer ids through ``self.dictionary``, which grows
    as new words are encountered.
    """

    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, n_gram=2):
        """Convert each line of ``path`` into sliding windows of word ids.

        Args:
            path: Text file with one whitespace-separated sample per line.
            n_gram: Number of context words per window; every line is
                left-padded with this many ``'<start>'`` tokens so that the
                first real word also has a full context.

        Returns:
            A list with one ``torch.LongTensor`` per non-empty line, of shape
            ``(number_of_words_in_line, n_gram + 1)``. Each row holds
            ``n_gram`` context ids followed by the id of the next word.
        """
        # Read the file once: pad every line and register all of its words
        # (including the padding token) in the dictionary.
        sample_list = []
        with open(path, 'r') as f:
            for line in f:
                words = ['<start>'] * n_gram + line.split()
                sample_list.append(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Reuse the lines already in memory instead of parsing the file again.
        word2idx = self.dictionary.word2idx
        ids_list = []
        for words in sample_list:
            # Lines that contain nothing but padding yield no training window.
            if len(words) <= n_gram:
                continue
            flat_ids = [word2idx[word] for word in words]
            # Slide a window of n_gram + 1 ids over the padded line.
            windows = [flat_ids[i:i + n_gram + 1]
                       for i in range(len(flat_ids) - n_gram)]
            ids_list.append(torch.LongTensor(windows))
        return ids_list

### Hyper-parameters: 

In [113]:
# Order of the language model: each prediction uses the n_gram - 1 preceding words.
n_gram = 3

# Width of each word-embedding vector.
embed_size = 128
# Width of the layer between the concatenated embeddings and the vocabulary logits.
intermediate_size = 1024
# Number of full passes over the training samples.
num_epochs = 10
# Learning rate handed to the Adam optimizer.
learning_rate = 0.002

### Load the "Penn Treebank" dataset and split it into train/dev/test

In [115]:
# Build the vocabulary and one tensor of id windows per line; each window holds
# n_gram - 1 context words followed by the word to predict.
corpus = Corpus()
ids = corpus.get_data('data/train_mini.txt', n_gram-1)
# Hold out the last 200 samples: 100 for development, the final 100 for testing.
train_ids = ids[:-200]
dev_ids = ids[-200:-100]
test_ids = ids[-100:]
# NOTE(review): the dictionary is filled from every line of the file, so this
# count also includes words that occur only in the dev/test samples.
vocab_size = len(corpus.dictionary)

### The class of n-gram language model:

In [None]:
class NGramLM(object):
    """Count-based n-gram language model with add-k (Laplace) smoothing.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        laplace: Smoothing constant added to every n-gram count.
        tokens: Token sequence returned by ``preprocess(train_data, n)``.
        vocab: Frequency distribution over single tokens.
        vocab_size: Number of distinct tokens in ``vocab``.
        m_vocab: Frequency distribution over the (n-1)-gram contexts.
        model: Dict mapping each n-gram tuple seen in training to its probability.
    """

    def __init__(self, n, train_data, laplace=1):
        self.n = n
        self.laplace = laplace
        # NOTE(review): `preprocess` is not defined in the visible part of the
        # notebook; it is assumed to return a flat sequence of tokens — confirm.
        self.tokens = preprocess(train_data, n)
        self.vocab = nltk.FreqDist(self.tokens)
        # Kept on the instance so smoothed_count() does not fall back to an
        # unrelated module-level `vocab_size`.
        self.vocab_size = len(self.vocab)

        if self.n == 1:
            num_tokens = len(self.tokens)
            # The only unigram context is the empty tuple, seen once per token.
            self.m_vocab = nltk.FreqDist({(): num_tokens})
            # Unigram probabilities are plain relative frequencies (unsmoothed).
            self.model = {(unigram,): count / num_tokens
                          for unigram, count in self.vocab.items()}
        else:
            # Context counts are the denominators of the conditional probabilities.
            self.m_vocab = nltk.FreqDist(nltk.ngrams(self.tokens, self.n - 1))
            n_vocab = nltk.FreqDist(nltk.ngrams(self.tokens, self.n))

            self.model = {gram: self.smoothed_count(gram, count)
                          for gram, count in n_vocab.items()}

    def smoothed_count(self, n_gram, n_count):
        """Return the add-k smoothed probability of ``n_gram``.

        Args:
            n_gram: Tuple of tokens; all but the last form the context.
            n_count: Number of times ``n_gram`` was seen in training.

        Returns:
            ``(n_count + k) / (context_count + k * vocab_size)`` with
            ``k = self.laplace``.
        """
        m_gram = n_gram[:-1]
        m_count = self.m_vocab[m_gram]
        return (n_count + self.laplace) / (m_count + self.laplace * self.vocab_size)


In [None]:
# Training
# Fit a count-based model of order n on the training split and keep a handle
# to it so that later cells can query its probabilities.
n = 2
ngram_lm = NGramLM(n, train_data=train_ids)

In [None]:
# Testing
# NOTE(review): this cell cannot run as written and looks like an unfinished paste:
#   * `self`, `test_ngrams`, `N`, `test_ppl` and `cross_entropy` are not assigned
#     anywhere before this point in the visible notebook, and `math` is not imported;
#   * `model` is indexed like a mapping — presumably the probability dict stored in
#     `NGramLM.model` was intended; confirm;
#   * the loop variable `i` only appears in the printed label, so every iteration
#     would score the same n-grams;
#   * the average is printed inside the loop, i.e. once per test sample.
for i in range(0, len(test_ids)):
    known_ngrams  = (self._convert_oov(ngram) for ngram in test_ngrams)
    probabilities = [model[ngram] for ngram in known_ngrams]
    print('Perplexity for test sample '+str(i)+' :', math.exp((-1/N) * sum(map(math.log, probabilities))))
    test_ppl += np.exp(cross_entropy)
    print('The average testing perplexity: '+str(test_ppl/len(test_ids)))

### The class of neural language model:

In [116]:
class LM(nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context words are concatenated and passed through
    two stacked linear layers, producing one score per vocabulary word.
    """

    def __init__(self, vocab_size, embed_size, intermediate_size, n_gram):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.intermediate = nn.Linear(n_gram * embed_size, intermediate_size)
        self.linear = nn.Linear(intermediate_size, vocab_size)

    def forward(self, x):
        """Map word ids of shape (batch, n_gram) to logits of shape (batch, vocab_size)."""
        # Look up one embedding vector per context word: (batch, n_gram, embed_size).
        embedded = self.embed(x)
        # Concatenate the context embeddings of each example into a single row.
        dims = embedded.shape
        flat = embedded.view(dims[0], dims[1] * dims[2])
        # Project to the intermediate width, then to unnormalised vocabulary scores.
        hidden = self.intermediate(flat)
        return self.linear(hidden)

In [117]:
# The network sees n_gram - 1 context words per example and lives on the selected device.
model = LM(vocab_size, embed_size, intermediate_size, n_gram-1).to(device)
# Cross-entropy between the predicted logits and the id of the true next word.
criterion = nn.CrossEntropyLoss()
# Adam updates all model parameters using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Train the model

In [118]:
for epoch in range(num_epochs):
    # Running sums over the epoch; they are divided by the sample counts when printed.
    avg_ppl = 0
    avg_loss = 0

    # ---- Training pass: one optimizer step per sample (one line of text) ----
    model.train()
    for i in range(0, len(train_ids)):
        # The first n_gram - 1 columns are the context, the last column is the target word.
        inputs = train_ids[i][:, 0:n_gram-1].to(device)
        targets = train_ids[i][:, n_gram-1:].to(device)

        # Forward pass: score every vocabulary word and compare with the true next word.
        outputs = model(inputs)
        loss = criterion(outputs, targets.reshape(-1))
        avg_loss += loss.item()

        # Backward pass: reset gradients, backpropagate, clip the gradient norm
        # to 0.5 and apply the update.
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

    # ---- Development pass: evaluation only, so autograd bookkeeping is switched off ----
    # NOTE(review): this averages per-sample perplexities, which is dominated by
    # the worst samples; exp(mean cross-entropy) may be the intended metric — confirm.
    model.eval()
    with torch.no_grad():
        for i in range(0, len(dev_ids)):
            dev_inputs = dev_ids[i][:, 0:n_gram-1].to(device)
            dev_targets = dev_ids[i][:, n_gram-1:].to(device)
            dev_outputs = model(dev_inputs)
            ce = criterion(dev_outputs, dev_targets.reshape(-1))
            avg_ppl += np.exp(ce.item())

    print ('Epoch [{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
        .format(epoch + 1, num_epochs, avg_loss/len(train_ids), avg_ppl/len(dev_ids)))

Epoch [1/10], Loss: 7.4570, Perplexity: 5247.26
Epoch [2/10], Loss: 5.2374, Perplexity: 63173.13
Epoch [3/10], Loss: 4.7159, Perplexity: 255881.86
Epoch [4/10], Loss: 4.4015, Perplexity: 935003.10
Epoch [5/10], Loss: 4.2168, Perplexity: 1637671.15
Epoch [6/10], Loss: 4.0399, Perplexity: 4890911.58
Epoch [7/10], Loss: 3.8980, Perplexity: 21259075.32
Epoch [8/10], Loss: 3.7395, Perplexity: 42372702.08
Epoch [9/10], Loss: 3.6187, Perplexity: 58546160.20
Epoch [10/10], Loss: 3.4938, Perplexity: 185843332.48


### Save the trained model

In [119]:
# Serialises the whole module object, not just its parameters.
# NOTE(review): saving `model.state_dict()` is the more portable pattern, but the
# cell that loads 'model.ckpt' would have to change with it — confirm before switching.
torch.save(model, 'model.ckpt')

  "type " + obj.__name__ + ". It won't be checked "


### Model testing

In [120]:
# NOTE(review): torch.load unpickles arbitrary Python objects; only load checkpoints
# you trust. Recent PyTorch releases may also need `weights_only=False` to restore
# a fully pickled module — confirm against the installed version.
# map_location puts the restored model on the device selected above, so a checkpoint
# written on a GPU machine can still be evaluated on a CPU-only one.
model = torch.load('model.ckpt', map_location=device)
model.eval()
test_ppl = 0
# Evaluation only: no gradients are needed.
with torch.no_grad():
    for i in range(0, len(test_ids)):
        # The first n_gram - 1 columns are the context, the last column is the gold next word.
        inputs = test_ids[i][:, 0:n_gram-1].to(device)
        gold = test_ids[i][:, n_gram-1:].to(device)
        output = model(inputs)
        cross_entropy = criterion(output, gold.reshape(-1)).item()
        # Per-sample perplexity is exp of the mean cross-entropy over that sample's words.
        print('Perplexity for test sample '+str(i)+' :', np.exp(cross_entropy))
        test_ppl += np.exp(cross_entropy)
    print('The average testing perplexity: '+str(test_ppl/len(test_ids)))

Perplexity for test sample 0 : 5903111.050083881
Perplexity for test sample 1 : 21729104.296588473
Perplexity for test sample 2 : 794420922.7723972
Perplexity for test sample 3 : 5445225407.479174
Perplexity for test sample 4 : 10259365.496464562
Perplexity for test sample 5 : 587957.2516781432
Perplexity for test sample 6 : 166295.49384946204
Perplexity for test sample 7 : 1639064.3456197989
Perplexity for test sample 8 : 4638.557433995015
Perplexity for test sample 9 : 85431509.58639324
Perplexity for test sample 10 : 26697245.503057465
Perplexity for test sample 11 : 1353401.5358296905
Perplexity for test sample 12 : 238675.04857059504
Perplexity for test sample 13 : 6576910.728150512
Perplexity for test sample 14 : 1250341.257785007
Perplexity for test sample 15 : 1545230.7107080014
Perplexity for test sample 16 : 804875.6169132207
Perplexity for test sample 17 : 895726.1208184625
Perplexity for test sample 18 : 4526318.283080646
Perplexity for test sample 19 : 666687.7637842282
Pe