# The Unigram model. (Please read comments for further understanding)



In [None]:
# author: Rthvik Raviprakash
# email: rthvik.07@gmail.com
# date: 02/25/2023

In [None]:
# import the required libraries
from collections import Counter
import numpy as np
import pandas as pd
import re

In [2]:
# import nltk if required. I am not using it since I build my uni-gram model from scratch. nltk is a good way to cross check
# if your perplexity is in an acceptable range.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [3]:
file_str = "" # The path of the file that contains that training tokens. In my case it is of sentences form line by line.
a = np.loadtxt(file_str,dtype = "str", delimiter = "\n") # The output shown is an example from the file that I had used.
a

array(['Having a little flexibility on that issue would go a long way to putting together a final package .',
       'Long before the advent of e-commerce , Wal-Mart \'s founder Sam Walton set out his vision for a successful retail operation : " We let folks know we \'re interested in them and that they \'re vital to us-- \' cause they are , " he said .',
       'A spokesman said the company has been affected by the credit crunch in the United States .',
       ..., 'Wisconsin ( 11 ) vs Ohio St. , 4 p.m.',
       'Of course , they think that a lot of this stuff is genetic .',
       'By the age of seven , the boy had been seen by doctors more than 325 times and undergone nine operations .'],
      dtype='<U2467')

In [5]:
# Get the unigram counts of each word in the vocabulary
cnt = {}
with open(file_str) as file:
  for line in file:
    a = line.split()
    a.append('<s t o p>')
    ar_sub = ['<s t a r t>'] + a
    for value in ar_sub:
      if value not in cnt:
        cnt[value] = 1
      else:
        cnt[value] += 1

In [6]:
# A dictionary of unknown words
k = 5 # You can change the value of k based on what you want to consider and unknown token to be. (In my case k = 5 which means that any
# word that occurs less than 5 times in the whole document I will give in an "<unk>" token)
word_dict = {word : "<u n k>" for word, occurrences in cnt.items() if occurrences < k}

In [7]:
# The vocab counts is a dictionoary that contains the counts of each unique word in the document (which is basically the vocabulary).
vocab_counts = {}
vocab_counts['<u n k>'] = 0

for k,v in cnt.items():
  if v >= 5:
    vocab_counts[k] = v
  else:
    vocab_counts['<u n k>'] += v

In [8]:
N = 0
for k,v in vocab_counts.items():
  N += v # N is the total number of words in the document (Note: this is not the unique number of words that are in the document, since that would be V)

p_unk = vocab_counts['<u n k>']/N

# Unigram Model
V = len(vocab_counts) # This is the total number of words in the vocabulary
words_li = []
prob_li = []

word_prob_uni = {}

for k,v in vocab_counts.items():
  words_li.append(k)
  prob_li.append(v/N)
  word_prob_uni[k] = v/N

di = {'Word': words_li, 'Probability' : prob_li}
df = pd.DataFrame(data = di) # This dataframe consists of each word(token) and it's respective probabilities under the unigram model.
df

Unnamed: 0,Word,Probability
0,<u n k>,0.055330
1,<s t a r t>,0.036529
2,Having,0.000028
3,a,0.018674
4,little,0.000275
...,...,...
18115,paraded,0.000003
18116,CSR,0.000003
18117,SiRF,0.000003
18118,helpless,0.000003


In [9]:
# Calculate Perplexity
import numpy as np
log_sum = 0
for i in range(len(prob_li)):
  log_sum += np.log(prob_li[i])*vocab_counts[words_li[i]]

ppx_uni_train = np.exp((-1/N)*log_sum)

print("The unigram perplexity on the training set: ", round(ppx_uni_train, 3))

The unigram perplexity on the training set:  736.104


In [10]:
# This function takes in a sentence and a dictionary that contains words labeled as unknown and returns the trigrams for that sentence
# For example: If we have a sentence: "This is a unigram model." it would return a list [("<start>", "<start>", "<start>"), ("<start>", "<start>", "This")
# ("<start>", "This", "is"), ("This", "is", "a"), ("is", "a", "unigram"), ("a", "unigram", "model"), ("unigram", "model", "<stop>")]
# If any of the words in that sentence exists in the unknown dictionary, it will return with that word's label as "<unk>"

def token_tri(sentence, unk_dict):
  arr_sub = sentence.split()
  arr_sub.append("<s t o p>")
  arr_tri = ["<s t a r t>", "<s t a r t>"] + arr_sub
  tri_gram = []

  if len(arr_tri) == 3:
    return []
  j = 0

  while j < len(arr_tri) - 2:
    if arr_tri[j] not in unk_dict:
      if arr_tri[j + 1] not in unk_dict:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append((arr_tri[j], arr_tri[j+1], arr_tri[j+2]))
        else:
          tri_gram.append((arr_tri[j], arr_tri[j+1], '<u n k>'))
      else:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append((arr_tri[j],'<u n k>' , arr_tri[j+2]))
        else:
          tri_gram.append((arr_tri[j], '<u n k>', '<u n k>'))
      j += 1
    else:
      if arr_tri[j + 1] not in unk_dict:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append(('<u n k>', arr_tri[j+1], arr_tri[j+2]))
        else:
          tri_gram.append(('<u n k>', arr_tri[j+1], '<u n k>'))
      else:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append(('<u n k>','<u n k>' , arr_tri[j+2]))
        else:
          tri_gram.append(('<u n k>', '<u n k>', '<u n k>'))
      j += 1

  return tri_gram

In [14]:
# Get the dataframe of uni, bi and tri of each word with it's probabilities

# I am using Laplace smoothing for the words that don't exist in the training set but occurs in the test set to avoid having zeros.

test_file = "" # The path of the file that contains that test tokens. In my case it is of sentences form line by line.

prob_uni_test = []
prob_bi_test = []
prob_tri_test = []
uni_word_test = []
bi_word_test = []
tri_word_test = []
alpha = 1 # This value is a hyperparameter which can be tuned using the development data.

# So using the same code that I am using for the test set you can load in your development set data instead of the test set data
# and tune your hyperparameters.

with open(test_file) as file:
  for line in file:
    arr = token_tri(line, word_dict)
    for tri in arr:
      uni_word_test.append((tri[2]))
      uni = tri[2]

      if uni in word_prob_uni:
        prob_uni_test.append(word_prob_uni[uni])
      else:
        prob_uni_test.append(alpha/(N + V))
      
di_uni_test = {'Word_uni_test': uni_word_test, 'Prob_uni_test': prob_uni_test}
df_uni_test = pd.DataFrame(data = di_uni_test) # This dataframe returns each word in the test set and it's respective probailities
# based on the training probabilities.
df_uni_test

Unnamed: 0,Word_uni_test,Prob_uni_test
0,BAGHDAD,0.000007
1,--,0.002226
2,An,0.000145
3,Iraqi,0.000086
4,military,0.000280
...,...,...
318281,personal,0.000110
318282,adviser,0.000045
318283,],0.000096
318284,.,0.036184


In [16]:
# Calculate the perplexity on the test set
N_uni_test = len(prob_uni_test)
log_sum_uni_test = 0

for i in range(N_uni_test):
  log_sum_uni_test += np.log(prob_uni_test[i])

ppx_test_uni = np.exp(log_sum_uni_test * (-1/N))
print("The unigram perplexity of the test set: ", ppx_test_uni)

The unigram perplexity of the test set:  3.735634148416777
