<a href="https://colab.research.google.com/github/scotthallauer/nlp-feedforward-nn/blob/main/nlp_feedforward_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Assignment 2

**Authors:** Scott Hallauer (HLLSCO001) and Steve Wang (WNGSHU003)

**Date:** 21 June 2021

## Set-Up



In [5]:
import re
import time
import urllib

import torch
import torch.nn as nn
import torch.nn.functional as F

## Prepare Datasets

In [6]:
# Download locations of the NCHLT text dataset files hosted in the project's
# GitHub repository. The .train / .valid / .test file suffixes indicate the
# training, validation and test splits respectively.
train_url = "https://raw.githubusercontent.com/scotthallauer/nlp-feedforward-nn/main/xh-data/nchlt_text.xh.train"
valid_url = "https://raw.githubusercontent.com/scotthallauer/nlp-feedforward-nn/main/xh-data/nchlt_text.xh.valid"
test_url = "https://raw.githubusercontent.com/scotthallauer/nlp-feedforward-nn/main/xh-data/nchlt_text.xh.test"

In [51]:
def load_dataset(url):
  """Load a raw dataset from the given URL and convert it into a list of lower-case, punctuation-cleaned sentences.

  Every line of the resource is decoded as UTF-8 and split on whitespace.
  Each token is lower-cased and stripped of leading/trailing punctuation;
  tokens that end up empty are dropped. A line with no remaining tokens still
  contributes an empty list, so the result has one entry per input line.

  Args:
    url: Any URL accepted by urllib.request.urlopen.

  Returns:
    A list of sentences, where each sentence is a list of cleaned words.
  """
  # `import urllib` on its own does not load the `request` submodule, so
  # import it explicitly instead of relying on another library having done so.
  import urllib.request

  # Compile once; the same two patterns are applied to every token.
  trailing_punctuation = re.compile(r"(\.|\,|\:|\?|\)|\"|\-|\!|\/|\')+$")
  leading_punctuation = re.compile(r"^(\(|\"|\-|\/|\')+")

  dataset = []
  with urllib.request.urlopen(url) as data:
    for line in data:
      output_sentence = []
      for input_word in line.decode("utf-8").split():
        output_word = trailing_punctuation.sub("", input_word.lower())
        output_word = leading_punctuation.sub("", output_word)
        if output_word:
          output_sentence.append(output_word)
      dataset.append(output_sentence)
  return dataset

def prepare_dataset(dataset):
  """Swap every word that appears exactly once across the dataset for "<UNK>".

  The sentences are edited in place and the same list object is returned.
  """
  frequency = {}
  for tokens in dataset:
    for token in tokens:
      frequency[token] = frequency.get(token, 0) + 1
  for tokens in dataset:
    for position, token in enumerate(tokens):
      if frequency[token] == 1:
        tokens[position] = "<UNK>"
  return dataset

def get_word_index(dataset):
  """Map each distinct word to an integer ID, numbered from 0 in first-seen order."""
  word_index = {}
  for tokens in dataset:
    for token in tokens:
      # The next free ID always equals the number of words registered so far.
      word_index.setdefault(token, len(word_index))
  return word_index

def get_word_list(word_index):
  """Invert a word -> index mapping into a list whose position i holds the word with index i."""
  inverse = [""] * len(word_index)
  for word, position in word_index.items():
    inverse[position] = word
  return inverse

def get_n_gram_list(dataset, n):
  """Return every contiguous window of n words from each sentence, in order.

  Sentences shorter than n contribute nothing; windows never cross sentences.
  """
  return [
    tokens[start:start + n]
    for tokens in dataset
    for start in range(len(tokens) - n + 1)
  ]

def get_n_gram_counts(n_gram_list):
  """Count occurrences of each n-gram, keyed by the repr() of its word list."""
  tally = {}
  for n_gram in n_gram_list:
    key = repr(n_gram)
    tally[key] = tally.get(key, 0) + 1
  return tally

def get_n_gram_prefix_counts(n_gram_list):
  """Count how often each prefix (an n-gram minus its last word) starts an n-gram.

  Keys are the repr() of the prefix word list.
  """
  tally = {}
  for n_gram in n_gram_list:
    key = repr(n_gram[:-1])
    tally[key] = tally.get(key, 0) + 1
  return tally

def get_n_gram_count(n_gram, n_gram_counts):
  """Look up the stored count for an n-gram (keyed by its repr()); 0 if it was never seen."""
  return n_gram_counts.get(repr(n_gram), 0)

def get_n_gram_probability(sequence_prefix, target_suffix, n_gram_counts, n_gram_prefix_counts):
  """Estimate P(target_suffix | sequence_prefix) as count(prefix + suffix) / count(prefix).

  Returns 0 when either the full n-gram or the prefix has a zero count.
  """
  full_sequence = sequence_prefix + [target_suffix]
  joint_count = get_n_gram_count(full_sequence, n_gram_counts)
  context_count = get_n_gram_count(sequence_prefix, n_gram_prefix_counts)
  if joint_count != 0 and context_count != 0:
    return joint_count / context_count
  return 0

#def get_n_gram_dist(dataset, n):
#  """P(w_n | w_1 w_2 ... w_n-1)"""
#  word_index = get_word_index(dataset)
#  word_list = get_word_list(word_index)
#  n_gram_list = get_n_gram_list(dataset, n)
#  n_gram_counts = get_n_gram_counts(n_gram_list)
#  n_sub1_gram_list = get_n_gram_list(dataset, n-1)
#  n_sub1_gram_counts = get_n_gram_counts(n_gram_list)
#  n_gram_dist = {}
#  for pre_sequence in n_sub1_gram_list:
#    pre_sequence_key = repr(pre_sequence)
#    if pre_sequence_key not in n_gram_dist:
#      n_gram_dist[pre_sequence_key] = {}
#      for word in word_list:
#        n_gram = pre_sequence + [word]
#        pre_sequence_count = get_n_gram_count(pre_sequence, n_sub1_gram_counts)
#        n_gram_count = get_n_gram_count(n_gram, n_gram_counts)
#        if pre_sequence_count == 0:
#          n_gram_dist[pre_sequence_key][word] = 0
#        else:
#          n_gram_dist[pre_sequence_key][word] = n_gram_count / pre_sequence_count



In [52]:
# Download and clean the training split, then replace words that occur only
# once with "<UNK>" (prepare_dataset edits the list in place and returns it).
train_data = load_dataset(train_url)
train_data = prepare_dataset(train_data)

In [53]:
# Build the word -> index mapping and its inverse (index -> word) list.
word_index = get_word_index(train_data)
word_list = get_word_list(word_index)

In [54]:
# Number of distinct words in the prepared training data (the vocabulary size).
print(len(word_list))

62244


In [55]:
# Spot-check a single prepared sentence, including its "<UNK>" substitutions.
print(train_data[70850])

['akufuneki', '<UNK>', '<UNK>', 'axhonywe', 'kwindlela', 'yokuphosa']


In [118]:
# Extract every trigram (n = 3) from the training sentences and count how often each occurs.
n_gram_list = get_n_gram_list(train_data, 3)
n_gram_counts = get_n_gram_counts(n_gram_list)

In [123]:
# Show one trigram and the number of times it occurs in the training data.
print(n_gram_list[5])
print(n_gram_counts[repr(n_gram_list[5])])

['ngocoselelo', 'njengoko', 'siqulethe']
3


In [56]:
# NOTE(review): the first two lines repeat the trigram extraction and counting
# from the earlier cell with identical arguments; only the prefix counts are new.
n_gram_list = get_n_gram_list(train_data, 3)
n_gram_counts = get_n_gram_counts(n_gram_list)
# Count how often each two-word prefix starts a trigram (the denominator of the probability).
n_gram_prefix_counts = get_n_gram_prefix_counts(n_gram_list)

In [57]:
# Relative-frequency estimate of P('axhonywe' | '<UNK>', '<UNK>') from the trigram counts.
sequence_prefix = ['<UNK>', '<UNK>']
target_suffix = 'axhonywe'
print(get_n_gram_probability(sequence_prefix, target_suffix, n_gram_counts, n_gram_prefix_counts))

0.00012359411692003462


## Create MLP Model


In [None]:
# source: https://www.oreilly.com/library/view/natural-language-processing/9781491978221/ch04.html
# source: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html
# source: https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/01-basics/feedforward_neural_network/main.py#L37-L49
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultilayerPerceptron(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear (-> softmax).

    Args:
        context_size: Number of input features per example.
        embedding_size: Width of the hidden layer.
        vocabulary_size: Number of output classes.
    """

    def __init__(self, context_size, embedding_size, vocabulary_size):
        super().__init__()
        self.fc1 = nn.Linear(context_size, embedding_size)
        self.fc2 = nn.Linear(embedding_size, vocabulary_size)

    def forward(self, x_in, apply_softmax=True):
        """Run the forward pass.

        Args:
            x_in: Input tensor whose last dimension has size ``context_size``.
                Both batched (2-D) and unbatched (1-D) inputs are accepted.
            apply_softmax: When True (the default, matching the previous
                behaviour) the output is a probability distribution over the
                classes. Pass False to get raw logits, which is what
                ``nn.CrossEntropyLoss`` expects.

        Returns:
            Tensor with the same leading dimensions as ``x_in`` and a last
            dimension of size ``vocabulary_size``.
        """
        intermediate = F.relu(self.fc1(x_in))
        output = self.fc2(intermediate)
        if apply_softmax:
            # Normalise over the class axis. dim=-1 equals dim=1 for batched
            # 2-D input and also works for unbatched 1-D input, where dim=1
            # would raise an IndexError.
            return F.softmax(output, dim=-1)
        return output

In [None]:
# NOTE(review): these look like small placeholder sizes for a smoke test (the
# vocabulary printed earlier has 62244 entries) — confirm before real training.
batch_size = 3 # number of samples input at once
input_dim = 6   # passed to the model as context_size (input features per sample)
hidden_dim = 3  # passed to the model as embedding_size (hidden layer width)
output_dim = 50 # passed to the model as vocabulary_size (number of output classes)

# Initialize model
mlp = MultilayerPerceptron(input_dim, hidden_dim, output_dim)
print(mlp)

In [None]:
def describe(x):
    """Print the type, shape and contents of ``x`` (any object exposing ``.type()`` and ``.shape``)."""
    report = (
        "Type: {}".format(x.type()),
        "Shape/size: {}".format(x.shape),
        "Values: \n{}".format(x),
    )
    print(*report, sep="\n")

###########################
# Format                  #
###########################
# Batch 1:
# [ [word 1, word 2, target],
#   [word 1, word 2, target],
#   [word 1, word 2, target],
# ]

#x = [[[0, 10, 2 ,1, 2, 4]],
#    [[1, 11, 1 ,3, 2, 5]],
#    [[9, 10, 6, 7, 6, 5]]]
#x_input = torch.FloatTensor(x)
# Random stand-in input of shape (batch_size, input_dim), used only to exercise the model.
x_input = torch.rand(batch_size, input_dim)
describe(x_input)

In [None]:
# Forward pass on the random batch; the model's forward() ends in a softmax
# over dim=1, so each row of y_output sums to 1.
y_output = mlp(x_input)
describe(y_output)

## Train Model

In [None]:
learning_rate = 20

# The model's forward pass already ends in a softmax, so the cross-entropy is
# computed as NLLLoss over the log of those probabilities. nn.CrossEntropyLoss
# expects raw logits and would apply a second (log-)softmax on top.
cross_entropy_loss = nn.NLLLoss()

# Use stochastic gradient descent as optimization function
gradient_decent_optimiser = torch.optim.SGD(mlp.parameters(), lr=learning_rate)

# NOTE: `train_loader` and `num_epochs` are not defined in the cells above and
# must be created before this cell is run.
print('Training log:')
num_batches = len(train_loader)
for epoch in range(num_epochs):
  print('Start Training')
  epoch_start = time.perf_counter()
  epoch_loss_sum = 0.0
  for i, (example_input, example_output) in enumerate(train_loader):
    # Forward pass; clamp before the log so a zero probability cannot produce -inf.
    output = mlp(example_input)
    loss = cross_entropy_loss(torch.log(output.clamp_min(1e-12)), example_output)

    # Backward propagation and optimisation. zero_grad must be *called*;
    # otherwise gradients from earlier batches keep accumulating.
    gradient_decent_optimiser.zero_grad()
    loss.backward()
    gradient_decent_optimiser.step()

    epoch_loss_sum += loss.item()
    if i == (num_batches - 1):
      # Report the final batch of the epoch.
      perplexity = torch.exp(loss.detach()).item()
      batch_time = 1000 * (time.perf_counter() - epoch_start) / num_batches
      print('|epoch {}| {}/{} batches | lr {} | ms/batch {:.2f}| loss {:.2f}| ppl {:.2f}'.format(
          epoch+1, i+1, num_batches, learning_rate, batch_time, loss.item(), perplexity))
  # TODO: validation (no validation loader exists yet, so the epoch summary
  # reports the mean training loss instead of validation metrics).
  epoch_time = time.perf_counter() - epoch_start
  train_loss = epoch_loss_sum / max(num_batches, 1)
  train_ppl = torch.exp(torch.tensor(train_loss)).item()
  print('-------------------------------------------------------------------')
  print('|end of epoch {}| time: {:.2f}s| train loss {:.2f} | train ppl {:.2f}'.format(epoch+1, epoch_time, train_loss, train_ppl))
  print('-------------------------------------------------------------------')
  