<a href="https://colab.research.google.com/github/scotthallauer/nlp-feedforward-nn/blob/main/nlp_feedforward_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Assignment 2

**Authors:** Scott Hallauer (HLLSCO001) and Steve Wang (WNGSHU003)

**Date:** 21 June 2021

## Set-Up



Import libraries

In [None]:
import datetime
import re
import urllib
import urllib.request

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import TensorDataset

Set model parameters


In [None]:
# Manual parameters
NGRAM_SIZE = 3  # words per n-gram: (NGRAM_SIZE - 1) context words plus one target word
EMBEDDING_DIM = 10  # length of each word-embedding vector
BATCH_SIZE = 400  # n-grams per DataLoader batch
LEARNING_RATE = 0.1  # learning rate passed to the SGD optimizer

# Calculated parameters
CONTEXT_SIZE = NGRAM_SIZE-1  # number of context words the model sees per sample

In [None]:
# Raw-file URLs of the training, validation and test splits of the text corpus.
train_url = "https://raw.githubusercontent.com/scotthallauer/nlp-feedforward-nn/main/xh-data/nchlt_text.xh.train"
valid_url = "https://raw.githubusercontent.com/scotthallauer/nlp-feedforward-nn/main/xh-data/nchlt_text.xh.valid"
test_url = "https://raw.githubusercontent.com/scotthallauer/nlp-feedforward-nn/main/xh-data/nchlt_text.xh.test"

## Prepare Datasets (OLD)


In [None]:
def load_dataset(url):
  """Load a raw dataset from the given URL and convert it into a list of lower-case, punctuation-cleaned sentences.

  Every line of the resource becomes one sentence. Each whitespace-separated
  word is lower-cased, stripped of leading and trailing punctuation, and
  dropped if nothing is left.

  Args:
    url: Location of the text resource (one sentence per line), opened with
      urllib.request.urlopen.

  Returns:
    A list of sentences, each a list of cleaned word strings. Lines without
    any remaining word are kept as empty lists.
  """
  # Compile once instead of resolving the patterns again for every word.
  trailing_punct = re.compile(r"(\.|\,|\:|\?|\)|\"|\-|\!|\/|\')+$")
  leading_punct = re.compile(r"^(\(|\"|\-|\/|\')+")
  dataset = []
  # Needs the urllib.request submodule; a bare `import urllib` does not load it.
  with urllib.request.urlopen(url) as data:
    for line in data:
      output_sentence = []
      for input_word in line.decode().split():
        output_word = trailing_punct.sub("", input_word.lower())
        output_word = leading_punct.sub("", output_word)
        if output_word:
          output_sentence.append(output_word)
      dataset.append(output_sentence)
  return dataset

def prepare_dataset(dataset):
  """Swap every word that appears exactly once in the dataset for the <UNK> token.

  The sentences are edited in place and the same list object is returned.
  """
  frequencies = {}
  for sentence in dataset:
    for token in sentence:
      frequencies[token] = frequencies.get(token, 0) + 1
  for sentence in dataset:
    for position, token in enumerate(sentence):
      if frequencies[token] == 1:
        sentence[position] = "<UNK>"
  return dataset

def get_word_index(dataset):
  """Map each distinct word to an integer id, numbered from 0 in order of first appearance."""
  word_index = {}
  for sentence in dataset:
    for word in sentence:
      # A new word receives the next free id, i.e. the current dictionary size.
      word_index.setdefault(word, len(word_index))
  return word_index

def get_word_list(word_index):
  """Invert a word-to-index mapping into a list whose position i holds the word with index i."""
  word_list = [""] * len(word_index)
  for word, position in word_index.items():
    word_list[position] = word
  return word_list

def get_n_grams(dataset, n):
  """Collect every n-gram in the dataset as a (context, target) pair.

  The context is the list of the first n-1 words and the target is the n-th
  word. Sentences shorter than n words contribute nothing.
  """
  context_len = n - 1
  n_grams = []
  for sentence in dataset:
    if len(sentence) < n:
      continue
    for start in range(len(sentence) - context_len):
      target_pos = start + context_len
      n_grams.append((sentence[start:target_pos], sentence[target_pos]))
  return n_grams

def get_n_gram_counts(n_grams):
  """Count occurrences of each (context, target) n-gram, keyed by the repr() of the pair."""
  n_gram_counts = {}
  for context, target in n_grams:
    key = repr((context, target))
    n_gram_counts[key] = n_gram_counts.get(key, 0) + 1
  return n_gram_counts

def get_n_gram_context_counts(n_grams):
  """Count occurrences of each n-gram context (the first n-1 words), keyed by the repr() of the context."""
  n_gram_context_counts = {}
  for context, _target in n_grams:
    key = repr(context)
    n_gram_context_counts[key] = n_gram_context_counts.get(key, 0) + 1
  return n_gram_context_counts

def get_n_gram_count(n_gram, n_gram_counts):
  """Look up the count stored under repr(n_gram); unseen entries count as 0."""
  return n_gram_counts.get(repr(n_gram), 0)

def get_n_gram_probability(context, target, n_gram_counts, n_gram_context_counts):
  """P(w_n | w_1 w_2 ... w_n-1)

  Relative-frequency estimate: count(context + target) / count(context).
  Returns 0 when either count is zero.
  """
  joint_count = get_n_gram_count((context, target), n_gram_counts)
  context_count = get_n_gram_count(context, n_gram_context_counts)
  if joint_count and context_count:
    return joint_count / context_count
  return 0

#def get_n_gram_dist(dataset, n):
#  """P(w_n | w_1 w_2 ... w_n-1)"""
#  word_index = get_word_index(dataset)
#  word_list = get_word_list(word_index)
#  n_gram_list = get_n_gram_list(dataset, n)
#  n_gram_counts = get_n_gram_counts(n_gram_list)
#  n_sub1_gram_list = get_n_gram_list(dataset, n-1)
#  n_sub1_gram_counts = get_n_gram_counts(n_gram_list)
#  n_gram_dist = {}
#  for pre_sequence in n_sub1_gram_list:
#    pre_sequence_key = repr(pre_sequence)
#    if pre_sequence_key not in n_gram_dist:
#      n_gram_dist[pre_sequence_key] = {}
#      for word in word_list:
#        n_gram = pre_sequence + [word]
#        pre_sequence_count = get_n_gram_count(pre_sequence, n_sub1_gram_counts)
#        n_gram_count = get_n_gram_count(n_gram, n_gram_counts)
#        if pre_sequence_count == 0:
#          n_gram_dist[pre_sequence_key][word] = 0
#        else:
#          n_gram_dist[pre_sequence_key][word] = n_gram_count / pre_sequence_count



Load and clean the training dataset. Then replace all words which only occur once with the \<UNK\> token.

In [None]:
# Download and clean the training split, then replace single-occurrence words with <UNK>.
train_data = prepare_dataset(load_dataset(train_url))

From the training dataset, we now extract the vocabulary and generate a dictionary with the word-to-index mappings.

In [None]:
# word -> integer id, assigned in order of first appearance
word_index = get_word_index(train_data)
# id -> word (the list position is the id)
vocabulary = get_word_list(word_index)

Finally, we extract every n-gram from the training dataset and count how often each full n-gram and each (n-1)-word context occurs.

In [None]:
# The n-gram order comes from NGRAM_SIZE, the constant set in the parameters cell.
n_grams = get_n_grams(train_data, NGRAM_SIZE)
# Occurrences of each full (context, target) n-gram.
n_gram_counts = get_n_gram_counts(n_grams)
# Occurrences of each context on its own.
context_counts = get_n_gram_context_counts(n_grams)

All code blocks below are just for exploring the newly created representations of the dataset.

In [None]:
# Number of distinct tokens in the prepared training data.
print(len(vocabulary))

62244


In [None]:
# Inspect one prepared sentence (single-occurrence words already replaced by <UNK>).
print(train_data[70835])

['xa', '<UNK>', 'abalobi', 'ababandakanyekayo', 'neegiyeri', 'kufuneka', '<UNK>', 'ezomeleleyo', 'ezibukhali', 'kunye/okanye', 'amazembe', 'ngexesha', 'lokusebenza', 'ukusika', 'iihaki', 'ezithintelayo', 'okanye', 'imitya', 'yokuloba']


In [None]:
# Show one (context, target) n-gram and how many times it occurs in the training data.
print(n_grams[5])
print(n_gram_counts[repr(n_grams[5])])

(['ngocoselelo', 'njengoko'], 'siqulethe')
3


In [None]:
# Estimate P(target | context) for a hand-picked trigram from the n-gram and context counts.
context = ["xa", "<UNK>"]
target = "abalobi"
print(get_n_gram_probability(context, target, n_gram_counts, context_counts))

0.0024875621890547263


## Prepare Datasets (NEW)

In [None]:
def load_dataset(url):
  """Load a dataset from a URL as cleaned sentences with rare words replaced by "<unk>".

  Every line of the resource becomes one sentence. Each whitespace-separated
  word is lower-cased, stripped of leading and trailing punctuation, and
  dropped if nothing is left. Words that then occur exactly once in the whole
  dataset are replaced by the "<unk>" token.

  Args:
    url: Location of the text resource (one sentence per line), opened with
      urllib.request.urlopen.

  Returns:
    A list of sentences, each a list of word strings. Lines without any
    remaining word are kept as empty lists.
  """
  # Compile once instead of resolving the patterns again for every word.
  trailing_punct = re.compile(r"(\.|\,|\:|\?|\)|\"|\-|\!|\/|\')+$")
  leading_punct = re.compile(r"^(\(|\"|\-|\/|\')+")
  dataset = []
  # Needs the urllib.request submodule; a bare `import urllib` does not load it.
  with urllib.request.urlopen(url) as data:
    for line in data:
      sentence = []
      for raw_word in line.decode().split():
        word = trailing_punct.sub("", raw_word.lower())
        word = leading_punct.sub("", word)
        if word:
          sentence.append(word)
      dataset.append(sentence)
  word_counts = {}
  for sentence in dataset:
    for word in sentence:
      word_counts[word] = word_counts.get(word, 0) + 1
  # Every counted word occurs at least once, so "not more than once" means a singleton.
  return [
    [word if word_counts[word] > 1 else "<unk>" for word in sentence]
    for sentence in dataset
  ]

def get_word_index(sentences):
  """Assign each distinct word an integer id, counting up from 0 in order of first appearance."""
  word_index = {}
  for sentence in sentences:
    for word in sentence:
      if word in word_index:
        continue
      # The next free id equals the number of words indexed so far.
      word_index[word] = len(word_index)
  return word_index

def get_vocab(word_index):
  """Turn a word-to-index mapping into a list in which position i holds the word with index i."""
  vocab = [""] * len(word_index)
  for word, position in word_index.items():
    vocab[position] = word
  return vocab

def get_ngrams(sentences, n):
  """Return every n-gram in the sentences as a (context, target) pair.

  The context is the list of the first n-1 words, the target is the n-th word.
  Sentences with fewer than n words are skipped.
  """
  context_len = n - 1
  ngrams = []
  for sentence in sentences:
    if len(sentence) < n:
      continue
    for start in range(len(sentence) - context_len):
      target_pos = start + context_len
      ngrams.append((sentence[start:target_pos], sentence[target_pos]))
  return ngrams

In [None]:
class XhosaTextDataset(Dataset):
  """Dataset of (context, target) n-grams taken from a text file at a URL.

  Indexing returns the word ids of one n-gram as long tensors, created on the
  GPU when CUDA is available and on the CPU otherwise.
  """

  def __init__(self, dataset_url, ngram_size):
    """Download the text, build the vocabulary from it and extract its n-grams."""
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.sentences = load_dataset(dataset_url)
    self.word_index = get_word_index(self.sentences)
    self.vocab = get_vocab(self.word_index)
    self.ngrams = get_ngrams(self.sentences, ngram_size)

  def __len__(self):
    """Number of n-grams in the dataset."""
    return len(self.ngrams)

  def __getitem__(self, idx):
    """Return (context ids, target id) for the n-gram at position idx."""
    sample = self.ngrams[idx]
    target_device = torch.device(self.device)
    context_ids = [self.word_index[word] for word in sample[0]]
    target_id = self.word_index[sample[1]]
    return (
      torch.tensor(context_ids, dtype=torch.long, device=target_device),
      torch.tensor(target_id, dtype=torch.long, device=target_device),
    )

  def vocab_size(self):
    """Number of distinct tokens in this dataset's vocabulary."""
    return len(self.vocab)

In [None]:
def share_vocabulary(dataset, reference, unk_token="<unk>"):
  """Re-encode `dataset` so that it uses the vocabulary of `reference`.

  Each XhosaTextDataset builds its own word-to-id mapping from its own text, so
  the same id means different words in different splits. A model whose
  embedding table is sized and trained on the training vocabulary can only be
  evaluated meaningfully on ids from that same vocabulary. Words that the
  reference vocabulary does not contain are mapped to `unk_token`.

  Args:
    dataset: The dataset to re-encode (modified in place).
    reference: The dataset whose `word_index` and `vocab` should be shared.
    unk_token: Token substituted for out-of-vocabulary words.

  Returns:
    The re-encoded `dataset`.

  Raises:
    ValueError: If `unk_token` is not part of the reference vocabulary.
  """
  known_words = reference.word_index
  if unk_token not in known_words:
    raise ValueError(f"Reference vocabulary has no {unk_token!r} token to map unknown words to.")
  dataset.ngrams = [
    (
      [word if word in known_words else unk_token for word in context],
      target if target in known_words else unk_token,
    )
    for context, target in dataset.ngrams
  ]
  dataset.word_index = known_words
  dataset.vocab = reference.vocab
  return dataset


train_dataset = XhosaTextDataset(
  dataset_url=train_url,
  ngram_size=NGRAM_SIZE,
)
print(f"Training Dataset: {len(train_dataset)} ngrams.")

# Validation and test data must be encoded with the training vocabulary so that
# their token ids refer to the embedding rows the model learns during training.
valid_dataset = share_vocabulary(
  XhosaTextDataset(
    dataset_url=valid_url,
    ngram_size=NGRAM_SIZE,
  ),
  train_dataset,
)
print(f"Validation Dataset: {len(valid_dataset)} ngrams.")

test_dataset = share_vocabulary(
  XhosaTextDataset(
    dataset_url=test_url,
    ngram_size=NGRAM_SIZE,
  ),
  train_dataset,
)
print(f"Testing Dataset: {len(test_dataset)} ngrams.")

Training Dataset: 859472 ngrams.
Validation Dataset: 47430 ngrams.
Testing Dataset: 47911 ngrams.


In [None]:
# Look at one encoded sample: (context word ids, target word id).
train_dataset[400]

(tensor([22, 46], device='cuda:0'), tensor(319, device='cuda:0'))

In [None]:
# Wrap each split in a DataLoader that yields shuffled batches of BATCH_SIZE n-grams.
# NOTE(review): shuffling the validation and test loaders looks unnecessary for evaluation — confirm.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Pull a single batch to check the tensor shapes and look at one (context, target) pair.
train_contexts, train_targets = next(iter(train_dataloader))
print(f"Contexts batch shape: {train_contexts.size()}")
print(f"Targets batch shape: {train_targets.size()}")
# First sample of the batch.
context = train_contexts[0]
target = train_targets[0]
print(f"Context: {context}")
print(f"Target: {target}")

Contexts batch shape: torch.Size([200, 2])
Targets batch shape: torch.Size([200])
Context: tensor([10930, 36206], device='cuda:0')
Target: 4172


## Define Neural Network Model



In [None]:
# source: https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html
class NGramLanguageModeler(nn.Module):
  """Feed-forward n-gram language model.

  The context word ids are embedded, the embeddings are concatenated, passed
  through one hidden ReLU layer and projected onto the vocabulary. The output
  is a log-probability distribution over the next word.

  Args:
    vocab_size: Number of distinct tokens (rows of the embedding table and
      width of the output layer).
    embedding_dim: Length of each word-embedding vector.
    context_size: Number of context words per sample.
    batch_size: Stored on the instance only; forward() takes the batch size
      from its input.
    hidden_dim: Width of the hidden layer (defaults to 128).
  """

  def __init__(self, vocab_size, embedding_dim, context_size, batch_size, hidden_dim=128):
    super().__init__()
    self.embedding_dim = embedding_dim
    self.context_size = context_size
    self.batch_size = batch_size
    self.embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
    self.linear2 = nn.Linear(hidden_dim, vocab_size)

  def forward(self, inputs):
    """Return log-probabilities of shape (batch, vocab_size) for a (batch, context_size) tensor of word ids."""
    # Concatenate the context embeddings of each sample into one flat vector.
    embeds = self.embeddings(inputs).view(inputs.size(0), self.context_size * self.embedding_dim)
    out = F.relu(self.linear1(embeds))
    out = self.linear2(out)
    log_probs = F.log_softmax(out, dim=1)
    return log_probs

## Train Neural Network Model

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_log_interval = 200
num_epochs = 10


def perplexity(mean_loss):
  """Return exp(mean_loss), the perplexity for an average negative log-likelihood."""
  # Perplexity is the exponential of the *average* loss; averaging exp(loss)
  # over batches instead would overstate it. float64 keeps the logged value
  # precise, and an overflow yields inf rather than raising.
  return torch.exp(torch.tensor(mean_loss, dtype=torch.float64)).item()


def evaluate(model, dataloader, loss_function, device):
  """Compute the mean per-sample loss and the perplexity of `model` on `dataloader`.

  Args:
    model: Module mapping a batch of contexts to log-probabilities.
    dataloader: Iterable of (contexts, targets) batches.
    loss_function: Criterion returning the mean loss of a batch.
    device: Device the batches are moved to before the forward pass.

  Returns:
    A (mean_loss, perplexity) tuple; both are NaN if the loader is empty.
  """
  model.eval()
  total_loss = 0.0
  total_samples = 0
  # No gradients are needed for evaluation, so skip building the autograd graph.
  with torch.no_grad():
    for contexts, targets in dataloader:
      contexts = contexts.to(device)
      targets = targets.to(device)
      batch_loss = loss_function(model(contexts), targets)
      # Weight by batch size so a smaller final batch does not skew the mean.
      total_loss += batch_loss.item() * targets.size(0)
      total_samples += targets.size(0)
  if total_samples == 0:
    return float('nan'), float('nan')
  mean_loss = total_loss / total_samples
  return mean_loss, perplexity(mean_loss)


model = NGramLanguageModeler(train_dataset.vocab_size(), EMBEDDING_DIM, CONTEXT_SIZE, BATCH_SIZE).to(device)
# The model already returns log-probabilities (log_softmax), so the matching
# criterion is the negative log-likelihood loss; CrossEntropyLoss would apply
# log_softmax a second time.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

print('Start Training')

for epoch_i in range(num_epochs):

  # Start timing epoch
  epoch_start_time = datetime.datetime.now()

  # TRAINING PHASE
  model.train()
  interval_elapsed_time_ms = 0
  interval_loss = 0

  for batch_i, (train_context, train_target) in enumerate(train_dataloader):

    # Start timing batch
    batch_start_time = datetime.datetime.now()

    # No-op when the dataset already produced the tensors on this device
    train_context = train_context.to(device)
    train_target = train_target.to(device)

    # Zero gradients from old input
    optimizer.zero_grad()

    # Forward pass
    log_probs = model(train_context)

    # Get loss
    loss = loss_function(log_probs, train_target)
    interval_loss += loss.item()

    # Backward propagation
    loss.backward()
    optimizer.step()

    # Stop timing batch
    batch_elapsed_time = datetime.datetime.now() - batch_start_time
    interval_elapsed_time_ms += batch_elapsed_time.total_seconds() * 1000

    # Log training stats in batch intervals
    if (batch_i+1) % batch_log_interval == 0:
      mean_interval_loss = interval_loss / batch_log_interval
      print("| epoch {} | {}/ {} batches | lr {:.2f} | ms/batch {:.2f} | loss {:.2f} | ppl {:.2f}".format(epoch_i+1, batch_i+1, len(train_dataloader), LEARNING_RATE, interval_elapsed_time_ms/batch_log_interval, mean_interval_loss, perplexity(mean_interval_loss)))
      interval_elapsed_time_ms = 0
      interval_loss = 0

  # VALIDATION PHASE
  valid_loss, valid_ppl = evaluate(model, valid_dataloader, loss_function, device)

  # Stop timing epoch
  epoch_elapsed_time_s = (datetime.datetime.now() - epoch_start_time).total_seconds()

  # Log validation stats at end of each epoch
  print("-------------------------------------------------------------------")
  print("| end of epoch {} | time: {:.2f}s | valid loss {:.2f} | valid ppl {:.2f}".format(epoch_i+1, epoch_elapsed_time_s, valid_loss, valid_ppl))
  print("-------------------------------------------------------------------")

# TESTING PHASE
test_loss, test_ppl = evaluate(model, test_dataloader, loss_function, device)

# Log testing stats at end of training
print("=========================================================================================")
print("| End of training | test loss {:.2f} | test ppl {:.2f}".format(test_loss, test_ppl))
print("=========================================================================================")

Start Training
| epoch 1 | 200/ 2149 batches | lr 0.10 | ms/batch 6.43 | loss 10.63 | ppl 42436.93
| epoch 1 | 400/ 2149 batches | lr 0.10 | ms/batch 6.30 | loss 10.27 | ppl 28954.47
| epoch 1 | 600/ 2149 batches | lr 0.10 | ms/batch 6.39 | loss 10.10 | ppl 24610.42
| epoch 1 | 800/ 2149 batches | lr 0.10 | ms/batch 6.37 | loss 9.95 | ppl 21173.18
| epoch 1 | 1000/ 2149 batches | lr 0.10 | ms/batch 6.40 | loss 9.80 | ppl 18134.81
| epoch 1 | 1200/ 2149 batches | lr 0.10 | ms/batch 6.32 | loss 9.67 | ppl 15934.10
| epoch 1 | 1400/ 2149 batches | lr 0.10 | ms/batch 6.36 | loss 9.53 | ppl 13872.95
| epoch 1 | 1600/ 2149 batches | lr 0.10 | ms/batch 6.36 | loss 9.47 | ppl 13102.55
| epoch 1 | 1800/ 2149 batches | lr 0.10 | ms/batch 6.32 | loss 9.38 | ppl 11992.14
| epoch 1 | 2000/ 2149 batches | lr 0.10 | ms/batch 6.35 | loss 9.31 | ppl 11219.87
-------------------------------------------------------------------
| end of epoch 1 | time: 65.35s | valid loss 9.85 | valid ppl 19016.59
-------

## Create MLP Model
A multilayer perceptron (MLP) with a single hidden layer.

In [None]:
# source: https://www.oreilly.com/library/view/natural-language-processing/9781491978221/ch04.html
# source: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html
# source: https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/feedforward_neural_network/main.py#L37-L49
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultilayerPerceptron(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear, optionally followed by softmax.

    Args:
        context_size: Number of input features per sample.
        embedding_size: Width of the hidden layer.
        vocabulary_size: Number of output classes.
    """

    def __init__(self, context_size, embedding_size, vocabulary_size):
        super().__init__()
        self.fc1 = nn.Linear(context_size, embedding_size)
        self.fc2 = nn.Linear(embedding_size, vocabulary_size)

    def forward(self, x_in, apply_softmax=True):
        """Run the forward pass on a (batch, context_size) float tensor.

        Args:
            x_in: Input batch.
            apply_softmax: When True (the default), return class probabilities.
                Pass False to get raw logits, which is what
                nn.CrossEntropyLoss expects because it applies log-softmax
                itself.

        Returns:
            A (batch, vocabulary_size) tensor of probabilities or logits.
        """
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(intermediate)
        if apply_softmax:
            output = F.softmax(output, dim=1)
        return output

In [None]:
batch_size = 3 # number of samples input at once
input_dim = 6 # input features per sample (in_features of the first linear layer)
hidden_dim = 3 # width of the hidden layer
output_dim = 50 # number of output classes (out_features of the second linear layer)

# Initialize model
mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)
print(mlp)

MultilayerPerceptron(
  (fc1): Linear(in_features=6, out_features=3, bias=True)
  (fc2): Linear(in_features=3, out_features=50, bias=True)
)


In [None]:
def describe(x):
    """Print the type, shape and values of a tensor."""
    summary = (
        "Type: {}".format(x.type()),
        "Shape/size: {}".format(x.shape),
        "Values: \n{}".format(x),
    )
    for line in summary:
        print(line)

# Intended batch format (one row per sample):
# Batch 1:
# [ [word1, word2, target],
#   [word1, word2, target],
#   [word1, word2, target],
# ]
# NOTE(review): the demo input below is random floats of shape
# (batch_size, input_dim), not rows of words and targets.

x_input = torch.rand(batch_size, input_dim)
describe(x_input)

Type: torch.FloatTensor
Shape/size: torch.Size([3, 6])
Values: 
tensor([[0.8654, 0.9656, 0.4177, 0.3434, 0.9115, 0.7048],
        [0.2483, 0.4000, 0.4714, 0.8006, 0.0089, 0.3843],
        [0.2623, 0.7188, 0.9339, 0.5699, 0.2633, 0.6035]])


In [None]:
# Run the toy batch through the MLP; forward() ends in a softmax over dim=1,
# so each output row is a distribution over the output classes.
y_output = mlp(x_input)
describe(y_output)

Type: torch.FloatTensor
Shape/size: torch.Size([3, 50])
Values: 
tensor([[0.0239, 0.0413, 0.0146, 0.0167, 0.0166, 0.0132, 0.0379, 0.0147, 0.0109,
         0.0185, 0.0095, 0.0062, 0.0070, 0.0320, 0.0161, 0.0112, 0.0138, 0.0076,
         0.0473, 0.0206, 0.0067, 0.0398, 0.0180, 0.0255, 0.0126, 0.0543, 0.0143,
         0.0131, 0.0216, 0.0137, 0.0202, 0.0120, 0.0078, 0.0069, 0.0112, 0.0141,
         0.0108, 0.0115, 0.0291, 0.0367, 0.0293, 0.0426, 0.0154, 0.0271, 0.0158,
         0.0189, 0.0142, 0.0425, 0.0091, 0.0256],
        [0.0226, 0.0428, 0.0120, 0.0145, 0.0161, 0.0119, 0.0340, 0.0142, 0.0101,
         0.0168, 0.0089, 0.0062, 0.0063, 0.0338, 0.0169, 0.0110, 0.0125, 0.0084,
         0.0496, 0.0237, 0.0066, 0.0409, 0.0202, 0.0206, 0.0135, 0.0618, 0.0149,
         0.0141, 0.0217, 0.0140, 0.0169, 0.0115, 0.0089, 0.0073, 0.0098, 0.0144,
         0.0095, 0.0096, 0.0327, 0.0361, 0.0321, 0.0468, 0.0134, 0.0251, 0.0162,
         0.0191, 0.0167, 0.0414, 0.0084, 0.0229],
        [0.0221, 0.0435, 

Train Model

In [None]:
import datetime

# NOTE(review): 20 is an unusually large SGD learning rate — confirm it is intended.
learning_rate = 20
num_epochs = 3
num_samples = 10 * batch_size


def make_random_loader(shuffle):
  """Build a DataLoader over random (features, label) samples that match the toy MLP."""
  features = torch.rand(num_samples, input_dim)
  labels = torch.randint(0, output_dim, (num_samples,))
  return DataLoader(TensorDataset(features, labels), batch_size=batch_size, shuffle=shuffle)


# NOTE(review): the notebook never defines loaders that fit this MLP (float
# inputs of width input_dim), so random placeholder data is used here to keep
# the cell runnable. Replace it with real loaders once the MLP is wired to the
# text data.
train_loader = make_random_loader(shuffle=True)
valid_loader = make_random_loader(shuffle=False)


def cross_entropy_loss(probabilities, targets):
  """Cross-entropy of softmax probabilities against integer class targets."""
  # The MLP returns probabilities, whereas nn.CrossEntropyLoss expects raw
  # logits and would apply softmax a second time. Take the log here and use the
  # negative log-likelihood instead; the clamp avoids log(0).
  return F.nll_loss(torch.log(probabilities.clamp_min(1e-12)), targets)


# Use stochastic gradient descent as optimization function
gradient_decent_optimiser = torch.optim.SGD(mlp.parameters(), lr=learning_rate)

print('Training log:')
num_batches = len(train_loader)
print('Start Training')
for epoch in range(num_epochs):
  epoch_start_time = datetime.datetime.now()

  mlp.train()
  for i, (example_input, example_output) in enumerate(train_loader):
    start_time = datetime.datetime.now()

    # Forward pass
    output = mlp(example_input)
    loss = cross_entropy_loss(output, example_output)

    # Backward Propagation and optimisation
    # zero_grad must be *called*; otherwise gradients accumulate across batches
    gradient_decent_optimiser.zero_grad()
    loss.backward()
    gradient_decent_optimiser.step()

    # Calculate time
    end_time = datetime.datetime.now()
    time_diff_ms = (end_time - start_time).total_seconds() * 1000

    perplexity = torch.exp(loss).item()
    if i == (num_batches - 1):
      print('|epoch {}| {}/{} batches | lr {} | ms/batch {:.2f}| loss {:.2f}| ppl {:.2f}'.format(epoch+1, i+1, num_batches, learning_rate, time_diff_ms, loss.item(), perplexity))

  # Average the validation loss over all samples instead of keeping only the last batch
  mlp.eval()
  total_valid_loss = 0.0
  total_valid_samples = 0
  with torch.no_grad():
    for valid_input, valid_output in valid_loader:
      output = mlp(valid_input)
      batch_loss = cross_entropy_loss(output, valid_output)
      total_valid_loss += batch_loss.item() * valid_output.size(0)
      total_valid_samples += valid_output.size(0)
  valid_loss = total_valid_loss / max(total_valid_samples, 1)
  valid_ppl = torch.exp(torch.tensor(valid_loss, dtype=torch.float64)).item()

  epoch_end_time = datetime.datetime.now()
  epoch_time = (epoch_end_time - epoch_start_time).total_seconds()
  print('-------------------------------------------------------------------')
  print('|end of epoch {}| time: {:.2f}s| valid loss {:.2f} | valid ppl {:.2f}'.format(epoch+1, epoch_time, valid_loss, valid_ppl))
  print('-------------------------------------------------------------------')
  

Training log:


NameError: ignored