In [5]:
# author: Rthvik Raviprakash
# email: rthvik.07@gmail.com
# date: 02/25/2023

In [6]:
# import the required libraries
from collections import Counter
import numpy as np
import pandas as pd
import re

In [7]:
file_str = "" # The path of the file that contains that training tokens. In my case it is of sentences form line by line.
a = np.loadtxt(file_str,dtype = "str", delimiter = "\n") # The output shown is an example from the file that I had used.
a

array(['Having a little flexibility on that issue would go a long way to putting together a final package .',
       'Long before the advent of e-commerce , Wal-Mart \'s founder Sam Walton set out his vision for a successful retail operation : " We let folks know we \'re interested in them and that they \'re vital to us-- \' cause they are , " he said .',
       'A spokesman said the company has been affected by the credit crunch in the United States .',
       ..., 'Wisconsin ( 11 ) vs Ohio St. , 4 p.m.',
       'Of course , they think that a lot of this stuff is genetic .',
       'By the age of seven , the boy had been seen by doctors more than 325 times and undergone nine operations .'],
      dtype='<U2467')

In [8]:
# Get the unigram counts of each word in the vocabulary
cnt = {}
with open(file_str) as file:
  for line in file:
    a = line.split()
    a.append('<s t o p>')
    ar_sub = ['<s t a r t>'] + a
    for value in ar_sub:
      if value not in cnt:
        cnt[value] = 1
      else:
        cnt[value] += 1

In [9]:
# A dictionary of unknown words
k = 5 # You can change the value of k based on what you want to consider and unknown token to be. (In my case k = 5 which means that any
# word that occurs less than 5 times in the whole document I will give in an "<unk>" token)
word_dict = {word : "<u n k>" for word, occurrences in cnt.items() if occurrences < k}

In [10]:
# The vocab counts is a dictionoary that contains the counts of each unique word in the document (which is basically the vocabulary).
vocab_counts = {}
vocab_counts['<u n k>'] = 0

for k,v in cnt.items():
  if v >= 5:
    vocab_counts[k] = v
  else:
    vocab_counts['<u n k>'] += v

In [11]:
N = 0
for k,v in vocab_counts.items():
  N += v # N is the total number of words in the document (Note: this is not the unique number of words that are in the document, since that would be V)

p_unk = vocab_counts['<u n k>']/N

# Unigram Model
V = len(vocab_counts) # This is the total number of words in the vocabulary
words_li = []
prob_li = []

word_prob_uni = {}

for k,v in vocab_counts.items():
  words_li.append(k)
  prob_li.append(v/N)
  word_prob_uni[k] = v/N

di = {'Word': words_li, 'Probability' : prob_li}
df = pd.DataFrame(data = di) # This dataframe consists of each word(token) and it's respective probabilities under the unigram model.
df

Unnamed: 0,Word,Probability
0,<u n k>,0.055330
1,<s t a r t>,0.036529
2,Having,0.000028
3,a,0.018674
4,little,0.000275
...,...,...
18115,paraded,0.000003
18116,CSR,0.000003
18117,SiRF,0.000003
18118,helpless,0.000003


In [12]:
# This function takes in a sentence and a dictionary that contains words labeled as unknown and returns the trigrams for that sentence
# For example: If we have a sentence: "This is a unigram model." it would return a list [("<start>", "<start>", "<start>"), ("<start>", "<start>", "This")
# ("<start>", "This", "is"), ("This", "is", "a"), ("is", "a", "unigram"), ("a", "unigram", "model"), ("unigram", "model", "<stop>")]
# If any of the words in that sentence exists in the unknown dictionary, it will return with that word's label as "<unk>"

def token_tri(sentence, unk_dict):
  arr_sub = sentence.split()
  arr_sub.append("<s t o p>")
  arr_tri = ["<s t a r t>", "<s t a r t>"] + arr_sub
  tri_gram = []

  if len(arr_tri) == 3:
    return []
  j = 0

  while j < len(arr_tri) - 2:
    if arr_tri[j] not in unk_dict:
      if arr_tri[j + 1] not in unk_dict:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append((arr_tri[j], arr_tri[j+1], arr_tri[j+2]))
        else:
          tri_gram.append((arr_tri[j], arr_tri[j+1], '<u n k>'))
      else:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append((arr_tri[j],'<u n k>' , arr_tri[j+2]))
        else:
          tri_gram.append((arr_tri[j], '<u n k>', '<u n k>'))
      j += 1
    else:
      if arr_tri[j + 1] not in unk_dict:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append(('<u n k>', arr_tri[j+1], arr_tri[j+2]))
        else:
          tri_gram.append(('<u n k>', arr_tri[j+1], '<u n k>'))
      else:
        if arr_tri[j+2] not in unk_dict:
          tri_gram.append(('<u n k>','<u n k>' , arr_tri[j+2]))
        else:
          tri_gram.append(('<u n k>', '<u n k>', '<u n k>'))
      j += 1

  return tri_gram

In [14]:
import string

# Function for tokenizing a sentence
def tokenize(sentence):
  return sentence.split()

# Function for tokenizing a sentence into a bigram.
def token_bi(sentence, unk_dict):
  arr = sentence.split()
  bi_gram = []

  if len(arr) == 0:
    return bi_gram
  
  if len(arr) == 1:
    if arr[0] not in unk_dict:
      return [('<s t a r t>', arr[0]), (arr[0], '<s t o p>')]
    else:
      return [('<s t a r t>', '<u n k>'), ('<u n k>', '<s t o p>')]

  i = 0

  while i < len(arr):
    if i == 0:
      if arr[i] not in unk_dict:
        bi_gram.append(('<s t a r t>', arr[i]))
      else:
        bi_gram.append(('<s t a r t>', '<u n k>'))
      i += 1

    elif i == len(arr) - 1:
      if arr[i] not in unk_dict:
        if arr[i-1] not in unk_dict:
          bi_gram.append((arr[i-1], arr[i]))
          bi_gram.append((arr[i], '<s t o p>'))
        else:
          bi_gram.append(('<u n k>', arr[i]))
          bi_gram.append((arr[i], '<s t o p>'))
      else:
        if arr[i-1] not in unk_dict:
          bi_gram.append((arr[i-1], '<u n k>'))
          bi_gram.append(('<u n k>', '<s t o p>'))
        else:
          bi_gram.append(('<u n k>', '<u n k>'))
          bi_gram.append(('<u n k>', '<s t o p>'))
      i += 1

    else:
      if arr[i] not in unk_dict:
        if arr[i-1] not in unk_dict:
          bi_gram.append((arr[i-1], arr[i]))
        else:
          bi_gram.append(('<u n k>', arr[i]))
      else:
        if arr[i-1] not in unk_dict:
          bi_gram.append((arr[i-1], '<u n k>'))
        else:
          bi_gram.append(('<u n k>', '<u n k>'))
      i += 1
  
  return bi_gram

In [15]:
# Get the bigram counts for each word in the vocabulary
cnt_bi = {}
cnt_half_bi = 0
f_name = "" # The path of the file that contains that training tokens. In my case it is of sentences form line by line.
with open(f_name) as file:
  for line in file:
    arr = [('<s t a r t>', '<s t a r t>')] + token_bi(line, word_dict)
    for bi in arr:
      if bi not in cnt_bi:
        cnt_bi[bi] = 1
      else:
        cnt_bi[bi] += 1

cnt_bi

{('<s t a r t>', '<s t a r t>'): 61530,
 ('<s t a r t>', 'Having'): 43,
 ('Having', 'a'): 4,
 ('a', 'little'): 165,
 ('little', 'flexibility'): 1,
 ('flexibility', 'on'): 1,
 ('on', 'that'): 46,
 ('that', 'issue'): 5,
 ('issue', 'would'): 4,
 ('would', 'go'): 26,
 ('go', 'a'): 2,
 ('a', 'long'): 117,
 ('long', 'way'): 16,
 ('way', 'to'): 170,
 ('to', 'putting'): 3,
 ('putting', 'together'): 4,
 ('together', 'a'): 11,
 ('a', 'final'): 30,
 ('final', 'package'): 1,
 ('package', '.'): 11,
 ('.', '<s t o p>'): 58015,
 ('<s t a r t>', 'Long'): 9,
 ('Long', 'before'): 3,
 ('before', 'the'): 296,
 ('the', 'advent'): 4,
 ('advent', 'of'): 5,
 ('of', 'e-commerce'): 1,
 ('e-commerce', ','): 2,
 (',', 'Wal-Mart'): 5,
 ('Wal-Mart', "'s"): 2,
 ("'s", 'founder'): 7,
 ('founder', 'Sam'): 1,
 ('Sam', 'Walton'): 1,
 ('Walton', 'set'): 1,
 ('set', 'out'): 21,
 ('out', 'his'): 18,
 ('his', 'vision'): 7,
 ('vision', 'for'): 9,
 ('for', 'a'): 1086,
 ('a', 'successful'): 18,
 ('successful', 'retail'): 1,
 (

In [16]:
# Bi grams
cnt_bi

prob_list_bi = []
tups = []
word_prob_bi = {}

for tup,v in cnt_bi.items():
  tups.append(tup)
  prob_list_bi.append(v/vocab_counts[tup[0]])
  word_prob_bi[tup] = v/vocab_counts[tup[0]]

N_bi = 0
for k,v in cnt_bi.items():
  N_bi += v

di_bi = {'Word_bi': tups, 'Probability' : prob_list_bi}
df_bi = pd.DataFrame(data = di_bi)
df_bi

Unnamed: 0,Word_bi,Probability
0,"(<s t a r t>, <s t a r t>)",1.000000
1,"(<s t a r t>, Having)",0.000699
2,"(Having, a)",0.085106
3,"(a, little)",0.005246
4,"(little, flexibility)",0.002160
...,...,...
464262,"(than, 325)",0.000395
464263,"(325, times)",0.166667
464264,"(and, undergone)",0.000031
464265,"(undergone, nine)",0.090909


In [17]:
log_sum_bi = 0
for i in range(len(prob_list_bi)):
  log_sum_bi += np.log(prob_list_bi[i])*cnt_bi[tups[i]]

ppx_bi_train = np.exp((-1/N_bi)*log_sum_bi)
print("The bigram perplexity on the training set: ", round(ppx_bi_train, 3))

The bigram perplexity on the training set:  64.578


In [18]:
# This is for the Test set
# Trying to use laplace smoothing 

#cnt_tri_sub = {}
prob_uni_test = []
prob_bi_test = []
prob_tri_test = []
uni_word_test = []
bi_word_test = []
tri_word_test = []
alpha = 1
f_test = "" # The path of the file that contains that test tokens. In my case it is of sentences form line by line.

with open(f_test) as file:
  for line in file:
    arr = token_tri(line, word_dict)
    for tri in arr:
      bi_word_test.append((tri[1], tri[2]))
      bi = (tri[1], tri[2])

      if bi in word_prob_bi:
        prob_bi_test.append(word_prob_bi[bi])
      else:
        if tri[1] in word_prob_uni:
          prob_bi_test.append(alpha/(vocab_counts[tri[1]] + V))
        else:
          prob_bi_test.append(alpha/(vocab_counts['<u n k>'] + V))
      
di_bi_test = {'Word_bi_test': bi_word_test, 'Prob_bi_test': prob_bi_test}
df_bi_test = pd.DataFrame(data = di_bi_test)
df_bi_test

Unnamed: 0,Word_bi_test,Prob_bi_test
0,"(<s t a r t>, BAGHDAD)",0.000195
1,"(BAGHDAD, --)",0.166667
2,"(--, An)",0.001867
3,"(An, Iraqi)",0.012245
4,"(Iraqi, military)",0.013793
...,...,...
318281,"('s, personal)",0.000795
318282,"(personal, adviser)",0.000055
318283,"(adviser, ])",0.000055
318284,"(], .)",0.074534


In [19]:
N_bi_test = len(prob_bi_test)
log_sum_bi_test = 0
for i in range(N_bi_test):
  log_sum_bi_test += np.log(prob_bi_test[i])

ppx_test_bi = np.exp(log_sum_bi_test * (-1/N))
print("The bigram perplexity of the test set: ", ppx_test_bi)

The bigram perplexity of the test set:  2.8990647009228896
