# NLP Assignment 2
Samarth Ramesh

BMC201722

In [None]:
import csv
import collections
from nltk.util import ngrams
import numpy as np
import matplotlib.pyplot as plt
import random
import math

In [None]:
#Loads the data from the csv file written at the end of the first assignment.
#Each csv row becomes one list of strings (one sentence, one word per cell).
# newline='' leaves newline handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open("/content/drive/MyDrive/Colab Data/words.csv", 'r', newline='') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = csv.reader(read_obj)
    # Pass reader object to list() to get a list of lists
    words_data = list(csv_reader)

In [None]:
#Report the corpus size (number of sentences and of words) and the vocabulary size.
print(f"Number of sentences in the corpus: {len(words_data)}")
# Flatten the per-sentence word lists into one list of all words.
words = []
for sentence in words_data:
  words.extend(sentence)
print(f"Number of words in the corpus: {len(words)}")
# The vocabulary is the set of distinct words.
print(f"Size of the vocabulary: {len(set(words))}")

Number of sentences in the corpus: 71703
Number of words in the corpus: 1284387
Size of the vocabulary: 58699


In [None]:
def find_ngrams(words_data, n=3):
  '''
  Collects the n-grams of every sentence in words_data into one flat list.

  Each sentence is padded on the left with '<s>' and on the right with
  '</s>' before its n-grams are extracted.

  Args:
    words_data: iterable of sentences, each a sequence of words.
    n: size of the n-grams to build.

  Returns:
    A list with the n-grams of all sentences, in sentence order.
  '''
  padding = dict(pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
  return [gram for sentence in words_data for gram in ngrams(sentence, n, **padding)]

In [None]:
def build_trigram_model(trigrams_data, withcounts = False):
  '''
  Builds a trigram model from trigrams generated from data.

  Counts the occurrences of each trigram (w1, w2, w3) and turns them into the
  likelihood of w3 given the context (w1, w2), i.e. the count divided by the
  total count of all trigrams that share the same first two words.

  Args:
    trigrams_data: iterable of trigrams; elements 0, 1 and 2 of each are used.
    withcounts: when True, also return the raw counts.

  Returns:
    trigram_model, a nested defaultdict indexed as model[w1][w2][w3] that
    yields 0.0 for unseen trigrams; when withcounts is True, the tuple
    (trigram_model, trigram_model_counts), where the counts use the same
    nesting and are a separate object from the model.
  '''
  def new_table(leaf_default):
    # Three-level nested defaultdict whose innermost values default to leaf_default().
    return collections.defaultdict(
      lambda: collections.defaultdict(
        lambda: collections.defaultdict(leaf_default)))

  trigram_model_counts = new_table(int)
  for gram in trigrams_data:
    trigram_model_counts[gram[0]][gram[1]][gram[2]] += 1

  # The probabilities go into their own table so that the raw counts are not
  # overwritten by the normalisation.
  trigram_model = new_table(float)
  for w1, by_second in trigram_model_counts.items():
    for w2, followers in by_second.items():
      total = float(sum(followers.values()))
      for w3, count in followers.items():
        trigram_model[w1][w2][w3] = count / total

  if withcounts:
    return trigram_model, trigram_model_counts
  return trigram_model

def predict_next_word_tri(trigram_model, second_word='<s>', first_word='<s>', shouldplot = False):
  '''
  Predicts the next word from the trigram model, given the two preceding words.

  Note the argument order: the most recent word (second_word) is passed before
  the older one (first_word); the model is looked up as
  trigram_model[first_word][second_word].

  Args:
    trigram_model: nested mapping first word -> second word -> third word -> score.
    second_word: the word immediately before the word to predict.
    first_word: the word before second_word.
    shouldplot: when True, show a bar chart of the (up to) ten best candidates.

  Returns:
    The highest-scoring next word, or the end marker '</s>' when the context
    has no recorded continuation.
  '''
  # .get() rather than indexing: indexing a defaultdict-based model with an
  # unseen context would silently insert empty entries into the model.
  third_word = trigram_model.get(first_word, {}).get(second_word, {})
  top10words = collections.Counter(third_word).most_common(10)
  if not top10words:
    # Nothing is known after this context, so signal the end of the sentence.
    return '</s>'

  predicted_words, prob_scores = zip(*top10words)

  if shouldplot:
    x_pos = np.arange(len(predicted_words))
    plt.bar(x_pos, prob_scores, align = 'center')
    plt.xticks(x_pos, predicted_words)
    plt.ylabel('Probability Score')
    plt.xlabel('Predicted Words')
    plt.title('Predicted words for ' + first_word + ' ' + second_word)
    plt.show()
  return predicted_words[0]

def generate_sentence_tri(predict_fn, trigram_model, max_words = 1000):
  '''
  Generates a sentence word by word, starting from predict_fn's default context.

  predict_fn is first called with only the model, then with the model and the
  single word generated so far, and afterwards with the model and the last two
  generated words, most recent first. Generation stops when predict_fn returns
  the end marker '</s>', which is never included in the output.

  Args:
    predict_fn: callable (model, [last_word, [word_before_last]]) -> next word.
    trigram_model: the model handed to predict_fn unchanged.
    max_words: upper bound on the number of generated words. It guards against
      looping forever when the predictions cycle without reaching '</s>'.

  Returns:
    The generated words joined by single spaces.
  '''
  sentence = []
  while len(sentence) < max_words:
    # Pass up to the last two words, most recent first; with fewer than two
    # words the missing arguments fall back to predict_fn's own defaults.
    next_word = predict_fn(trigram_model, *reversed(sentence[-2:]))
    if next_word == '</s>':
      break
    sentence.append(next_word)
  return " ".join(sentence)

In [None]:
#Build the trigrams of the corpus, fit the trigram model on all of them
#and generate one sentence by always taking the most likely next word.

trigrams_data = find_ngrams(words_data, n=3)
trigram_model = build_trigram_model(trigrams_data, withcounts=False)
print("Sentence generated by the trigram model for the most common first word:")
# Kept as the last bare expression so the notebook displays the sentence.
generate_sentence_tri(predict_fn=predict_next_word_tri, trigram_model=trigram_model)

Sentence generated by the trigram model for the most common first word:


'the copyright holder for this preprint this version posted june 9 2020'

In [None]:
def build_fgram_model(fgrams_data, withcounts = False):
  '''
  Builds a 4-gram model from 4-grams generated from data.

  Counts the occurrences of each 4-gram (w1, w2, w3, w4) and turns them into
  the likelihood of w4 given the context (w1, w2, w3), i.e. the count divided
  by the total count of all 4-grams that share the same first three words.

  Args:
    fgrams_data: iterable of 4-grams; elements 0 to 3 of each are used.
    withcounts: when True, also return the raw counts.

  Returns:
    fgram_model, a nested defaultdict indexed as model[w1][w2][w3][w4] that
    yields 0.0 for unseen 4-grams; when withcounts is True, the tuple
    (fgram_model, fgram_model_counts), where the counts use the same nesting
    and are a separate object from the model.
  '''
  def new_table(leaf_default):
    # Four-level nested defaultdict whose innermost values default to leaf_default().
    return collections.defaultdict(
      lambda: collections.defaultdict(
        lambda: collections.defaultdict(
          lambda: collections.defaultdict(leaf_default))))

  fgram_model_counts = new_table(int)
  for gram in fgrams_data:
    fgram_model_counts[gram[0]][gram[1]][gram[2]][gram[3]] += 1

  # The probabilities go into their own table so that the raw counts are not
  # overwritten by the normalisation.
  fgram_model = new_table(float)
  for w1, by_second in fgram_model_counts.items():
    for w2, by_third in by_second.items():
      for w3, followers in by_third.items():
        total = float(sum(followers.values()))
        for w4, count in followers.items():
          fgram_model[w1][w2][w3][w4] = count / total

  if withcounts:
    return fgram_model, fgram_model_counts
  return fgram_model

def predict_next_word_f(fgram_model, third_word = '<s>', second_word='<s>', first_word='<s>', shouldplot = False):
  '''
  Predicts the next word from the 4-gram model, given the three preceding words.

  Note the argument order: the most recent word (third_word) is passed first
  and the oldest (first_word) last; the model is looked up as
  fgram_model[first_word][second_word][third_word].

  Args:
    fgram_model: nested mapping first -> second -> third -> fourth word -> score.
    third_word: the word immediately before the word to predict.
    second_word: the word before third_word.
    first_word: the word before second_word.
    shouldplot: when True, show a bar chart of the (up to) ten best candidates.

  Returns:
    The highest-scoring next word, or the end marker '</s>' when the context
    has no recorded continuation.
  '''
  # .get() rather than indexing: indexing a defaultdict-based model with an
  # unseen context would silently insert empty entries into the model.
  fourth_word = fgram_model.get(first_word, {}).get(second_word, {}).get(third_word, {})
  top10words = collections.Counter(fourth_word).most_common(10)
  if not top10words:
    # Nothing is known after this context, so signal the end of the sentence.
    return '</s>'

  predicted_words, prob_scores = zip(*top10words)

  if shouldplot:
    x_pos = np.arange(len(predicted_words))
    plt.bar(x_pos, prob_scores, align = 'center')
    plt.xticks(x_pos, predicted_words)
    plt.ylabel('Probability Score')
    plt.xlabel('Predicted Words')
    # The title names the full three-word context used for the prediction.
    plt.title('Predicted words for ' + first_word + ' ' + second_word + ' ' + third_word)
    plt.show()
  return predicted_words[0]

def generate_sentence_f(predict_fn, fgram_model, max_words = 1000):
  '''
  Generates a sentence word by word, starting from predict_fn's default context.

  predict_fn is called with the model followed by up to the last three
  generated words, most recent first; while fewer than three words exist the
  missing arguments fall back to predict_fn's own defaults. Generation stops
  when predict_fn returns the end marker '</s>', which is never included in
  the output.

  Args:
    predict_fn: callable (model, [last_word, [second_last, [third_last]]]) -> next word.
    fgram_model: the model handed to predict_fn unchanged.
    max_words: upper bound on the number of generated words. It guards against
      looping forever when the predictions cycle without reaching '</s>'.

  Returns:
    The generated words joined by single spaces.
  '''
  sentence = []
  while len(sentence) < max_words:
    # Pass up to the last three words, most recent first.
    next_word = predict_fn(fgram_model, *reversed(sentence[-3:]))
    if next_word == '</s>':
      break
    sentence.append(next_word)
  return " ".join(sentence)

In [None]:
#Build the 4-grams of the corpus, fit the 4-gram model on all of them
#and generate one sentence by always taking the most likely next word.

fgrams_data = find_ngrams(words_data, n=4)
fgram_model = build_fgram_model(fgrams_data, withcounts=False)
print("Sentence generated by 4-gram model for the most common first word:")
# Kept as the last bare expression so the notebook displays the sentence.
generate_sentence_f(predict_fn=predict_next_word_f, fgram_model=fgram_model)

Sentence generated by 4-gram model for the most common first word:


'the copyright holder for this preprint this version posted june 9 2020'

In [None]:
#Shuffle the trigrams in place and split them into a train and a test set.
# NOTE(review): the first ninth is used for training and the remaining
# eight ninths for testing - confirm that the split is not inverted.

random.shuffle(trigrams_data)
l = len(trigrams_data)
split_at = l // 9
tri_train, tri_test = trigrams_data[:split_at], trigrams_data[split_at:]
tri_no_test = len(tri_test)

#Sum -log10(probability) over the test trigrams; the probability is floored
#at 0.000001 so that trigrams unseen in training contribute a loss of 6.

tri_model = build_trigram_model(tri_train)
loss = 0
for trigram in tri_test:
  w1, w2, w3 = trigram
  prob_correct = max(tri_model[w1][w2][w3], 0.000001)
  loss -= math.log10(prob_correct)
print(f"Average Cross Entropy Loss of the trigram model is {loss/tri_no_test}")

Average Cross Entropy Loss of the trigram model is 4.709567274267211


In [None]:
#Shuffle the 4-grams in place and split them into a train and a test set.
# NOTE(review): the first ninth is used for training and the remaining
# eight ninths for testing - confirm that the split is not inverted.

random.shuffle(fgrams_data)
l = len(fgrams_data)
split_at = l // 9
f_train, f_test = fgrams_data[:split_at], fgrams_data[split_at:]
f_no_test = len(f_test)

#Sum -log10(probability) over the test 4-grams; the probability is floored
#at 0.000001 so that 4-grams unseen in training contribute a loss of 6.

f_model = build_fgram_model(f_train)
loss = 0
for fgram in f_test:
  w1, w2, w3, w4 = fgram
  prob_correct = max(f_model[w1][w2][w3][w4], 0.000001)
  loss -= math.log10(prob_correct)
print(f"Average Cross Entropy Loss of the 4-gram model is {loss/f_no_test}")

Average Cross Entropy Loss of the 4-gram model is 5.070950454264896


Note: The largest possible cross-entropy loss for an n-gram that exists in the data is -log10(1/number of n-grams). Hence, as long as the loss for the zero-occurrence case is set higher than that, the loss is valid. The probability of an n-gram with zero occurrences is therefore floored at 0.000001, which sets its loss to -log10(0.000001) = 6.

## Trigram Language Model Vs 4-gram Language Model

The observed average cross-entropy loss per n-gram in the test set is lower for the trigram model. This goes against our expectations, because one would think that a more robust model like the 4-gram would do better. This may come down to a lack of data: 350 papers do not do a good job of representing all possible n-grams of words.


## Handling Large Set of Parameters

If stored naively, a trigram model would have on the order of (vocabulary size)^3 parameters. Storing only the trigrams that actually occur is therefore very important, since the counts in the model are very sparse. A default dictionary is very helpful in this situation because the counts do not need to be initialised in advance.

## Executing sections of the code in parallel

The trigram and the 4-gram portions of the code can be executed separately. In fact, if written differently, all steps of each model-building process could be performed simultaneously: each sentence can be broken down into n-grams and used to build the model in parallel.