In [1]:
#Mount my Google Drive.
#from google.colab import drive
#drive.mount("/content/drive")
#import os
#directory = '/content/drive/My Drive/CSC583'
#os.chdir(directory)

#Ensure the files are there (in the folder).
#!pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/CSC583


### **Some Important Import's**

In [1]:
#Token dictionary.
import pandas as pd
from collections import defaultdict

#N-gram models.
import nltk
from nltk.probability import ConditionalFreqDist, FreqDist
from nltk.util import ngrams

#Perplexity.
import math
import itertools
#Sentence generation.
import random

#**Part I: N-gram Modeling and Perplexity**

## **Load Datasets**

**Get both files:**

urlTrain = "https://condor.depaul.edu/ntomuro/courses/583/2024fall/assign/HW4/ngram/1b_benchmark.train.tokens"

file_nameTrain = "1b_benchmark_train_tokens.txt"  
response = requests.get(urlTrain)

with open(file_nameTrain, 'w', encoding='utf-8') as file:
    file.write(response.text)
print(f"File saved as {file_nameTrain}")

urlTest = "https://condor.depaul.edu/ntomuro/courses/583/2024fall/assign/HW4/ngram/1b_benchmark.test.tokens"

file_nameTest = "1b_benchmark_test_tokens.txt"  
response = requests.get(urlTest)

with open(file_nameTest, 'w', encoding='utf-8') as file:
    file.write(response.text)
print(f"File saved as {file_nameTest}")

## **Pre-step: Build the Vocabulary**

In [6]:
def loadData(fileName):
  '''Reading the data from a text file, each line/sentence is an observation.'''
  with open(fileName, 'r', encoding='utf-8') as inFile:
    content = inFile.readlines()
  data = pd.DataFrame(content, columns=['sentence'])
  print(f'Number of (rows, columns): {data.shape}')
  return data

In [7]:
trainData = loadData("1b_benchmark_train_tokens.txt")
trainData.head(1)

Number of (rows, columns): (61530, 1)


Unnamed: 0,sentence
0,Having a little flexibility on that issue woul...


In [8]:
def vocabDict(data):
  '''Generate vocab dictionary of the data. Take input as a pandas dataframe.
  Note: Tokens that occurred (strictly) < 3 times into a special symbol '<unk>'.
        Add a symbol '<STOP>' to finalize the vocabulary.'''

  vocabDict = defaultdict(int)
  #Tokenize each sentence and count the token occurrences.
  for sentence in data['sentence']:
    tokens = sentence.split()
    #Vocabulary only with <STOP>.
    for token in tokens:
      vocabDict[token] += 1
    vocabDict['<STOP>'] += 1

  #Replace tokens with less than 3 occurrences with <unk>.
  final_vocabDict = {token: count for token, count in vocabDict.items() if count >= 3}
  final_vocabDict['<unk>'] = sum(count for count in vocabDict.values() if count < 3)

  return final_vocabDict

In [9]:
vocabDict_train = vocabDict(trainData)
print(f"Number of unique tokens - without '<START>': {len(vocabDict_train)}")
print('----First five tokens----')
for token, count in list(vocabDict_train.items())[:5]:
    print(f"{token}: {count}")

Number of unique tokens - without '<START>': 26602
----First five tokens----
Having: 47
a: 31455
little: 463
flexibility: 16
on: 11543


## **Create N-gram models -- using NLTK**

*   From a line in the training data, create a list of tokens by first splitting tokens by white spaces, then converting tokens to those in the vocabulary.
*   Pass the list of tokens to the NLTK's library function nltk.ngrams(list_of_tokens, n) to obtain ngrams for a given n (2 for bigram and 3 for trigram). The function returns a list of all ngrams made the list of tokens.

In [10]:
def gen_ngramsList(data, vocabDict):
  '''Obtain ngrams list for a given n. Create columns in pandas data frame.
  Return: List of all ngrams.'''
  unigramList = []
  bigramList = []
  trigramList = []

  for sentence in data['sentence']:
    tokens = sentence.split()
    #Tokenize a sentence. If words don't exist in vocabDict, treat as <unk>.
    tokens = [token if token in vocabDict else '<unk>' for token in tokens]

    #Unigrams | No <START> token.
    unigrams = list(nltk.ngrams(tokens + ['<STOP>'], 1))
    unigramList.append(unigrams)

    #Bigrams | One <START> token.
    bigrams = list(nltk.ngrams(['<START>'] + tokens + ['<STOP>'], 2))
    bigramList.append(bigrams)

    #Trigrams | Two <START> tokens.
    trigrams = list(nltk.ngrams(['<START>', '<START>'] + tokens + ['<STOP>'], 3))
    trigramList.append(trigrams)

  data['unigrams'] = unigramList
  data['bigrams'] = bigramList
  data['trigrams'] = trigramList

  return data

In [13]:
trainData = gen_ngramsList(trainData, vocabDict_train)
trainData.head(1)

Unnamed: 0,sentence,unigrams,bigrams,trigrams
0,Having a little flexibility on that issue woul...,"[(Having,), (a,), (little,), (flexibility,), (...","[(<START>, Having), (Having, a), (a, little), ...","[(<START>, <START>, Having), (<START>, Having,..."


### **Applicable for bigram and trigram only --> Creates a conditional frequency distribution**
*   For each n (bigram/trigram), pass the list of ngrams to the nltk.ConditionalFreqDist(list_of_ngrams) function. This function receives a list of 2-tuples/pairs, and returns a nested dictionary where keys are the first element of the pairs and the values are the frequency distribution dictionary (nltk.FreqDist() dictionary) of the second element of the pairs.



In [14]:
def gen_ConditionalFreqDist(data, columnName):
  '''Generate conditional frequency distribution for the entire corpus
  based on the n-1 leading words within a tuple.'''

  freqDist = ConditionalFreqDist()
  for ngrams in data[columnName]:
    for ngram in ngrams:
      #Key: All tokens except for last one -- within a tuple.
      firstElement = ngram[:-1]
      #Last token -- within a tuple.
      secondElement = ngram[-1]
      freqDist[firstElement][secondElement] += 1

  return freqDist

In [15]:
bigram_freqDist = gen_ConditionalFreqDist(trainData, 'bigrams')
print("Bigram ('Clinton', 'leading') frequency:", bigram_freqDist[('Clinton',)]['leading'])

Bigram ('Clinton', 'leading') frequency: 1


In [25]:
trigram_freqDist = gen_ConditionalFreqDist(trainData, 'trigrams')
print("Trigram ('I', 'believe', 'he') frequency:", trigram_freqDist[('I', 'believe')]['he'])

Trigram ('I', 'believe', 'he') frequency: 1


### **Finally, creating Ngram language models.**
*   Simply Ngram probability models. Simply access the ConditonalFreqDist (for each n (bigram/trigram) by calling freq() in NLTK.

### **Unigram models.**

In [16]:
#Let's look at total number of tokens, which will be the denominator.
#NOT using RAW token count.
#Use count after vocabulary manipulation which including tokens <UNK>, <STOP>.

#Also, spot check word count.
denominator  = []
word = 'Reihana'
count = 0
for sent in trainData['sentence']:
  tokens = sent.split()
  tokens = [token if token in vocabDict_train else '<unk>' for token in tokens]
  for t in tokens:
    if t == word:
      count += 1
  unigrams = list(nltk.ngrams(tokens + ['<STOP>'], 1))
  denominator.extend(unigrams)
print(f'Total number of token post manipulation: {len(denominator)}')
print(f"'{word}' occcurs {count} times.")

Total number of token post manipulation: 1622907
'Reihana' occcurs 3 times.


In [17]:
def unigram(data):
  '''Calculate unigram probabilities for non-smoothing and Laplace smoothing.
  Return: Two dictionaries.'''
  unigramModel = FreqDist()

  #Using unigram column.
  for unigrams in data['unigrams']:
    #Iterate through each token, get count.
    for unigram in unigrams:
      unigramModel[unigram[0]] += 1

  #Total token count, include repetitons.
  tokenCount = unigramModel.N()
  unigramProb = {word: unigramModel[word] / tokenCount for word in unigramModel}

  #Laplace smoothing.
  vocabSize = len(unigramModel)
  laplace_unigramProb = {word: (unigramModel[word] + 1) / (tokenCount + vocabSize) for word in unigramModel}
  return unigramProb, laplace_unigramProb

In [18]:
unigramProb, laplace_unigramProb = unigram(trainData)
#Since 'Reihana' occurs 3 times in the train corpus --> Let's deemed as a rare word.
#Laplace smoothing basically boosts occurrences of rare/unseen words.
#Hence resulting higher probability for 'Reihana'.
print("Unigram probability | non-smoothing for 'Reihana':", unigramProb.get('Reihana', 0))
print("Unigram probability | Laplace smoothing for 'Reihana':", laplace_unigramProb.get('Reihana', 0))

Unigram probability | non-smoothing for 'Reihana': 1.8485347589233394e-06
Unigram probability | Laplace smoothing for 'Reihana': 2.4249640347521596e-06


In [19]:
#-----Let's test my code with sample Ngrams -- Example Scratch code.-----
sentences = [[token if token in vocabDict_train else '<unk>' for token in s.split()] for s in trainData['sentence']]
for sent in sentences:
    sent.append('<STOP>')

def flatten(sents):
  '''Un-nest master list.'''
  return [token for sent in sents for token in sent]
newlist = flatten(sentences)

#Unigram.
fdist1 = nltk.probability.FreqDist(newlist)
print(fdist1.freq('Reihana'))

1.8485347589233394e-06


### **Bigram models.**

In [20]:
def bigram(bigram_freqDist, vocabDict, sentenceCount):
  '''Calculate bigram probabilities for non-smoothing and Laplace smoothing.
  Return: Two dictionaries.
  Note: Only calculate Laplace smoothing probability for existing/seen bigrams.'''

  bigramProb = {}
  laplace_bigramProb = {}
  vocabSize = len(vocabDict)

  #Iterate through each word with respect to first element.
  for firstEle in bigram_freqDist:
    #Since '<START>' does not exist in vocab dictionary.
    #If '<START>' found, its count = Number of sentences in train data.
    #Otherwiuse, get count of the first word (context). If cannot be found, treat as 0.
    first_wordCount = sentenceCount if firstEle[0] == '<START>' else vocabDict.get(firstEle[0], 0)

    for word in bigram_freqDist[firstEle]:
      #Non-smoothed.
      prob = bigram_freqDist[firstEle].freq(word)
      bigramProb[(firstEle[0], word)] = prob

      #Laplace smoothing.
      bigramCount = bigram_freqDist[firstEle][word]
      laplaceProb = (bigramCount + 1) / (first_wordCount + vocabSize)
      laplace_bigramProb[(firstEle[0], word)] = laplaceProb

  return bigramProb, laplace_bigramProb

def unseenBigram(firstW, secondW, laplace_bigramProb, vocabDict, sentenceCount):
  '''Calculate on the fly unseen bigram probabilities for Laplace smoothing.'''

  vocabSize=len(vocabDict)
  first_wordCount = sentenceCount if firstW == '<START>' else vocabDict.get(firstW, 0)
  laplaceProb = 1 / (first_wordCount + vocabSize)

  return laplaceProb

In [21]:
bigramProb, laplace_bigramProb = bigram(bigram_freqDist, vocabDict_train, trainData.shape[0])
print("Bigram probability | non-smoothing for ('a', 'little'):", bigramProb.get(('a', 'little'), 0))
print("Bigram probability | Laplace smoothing for ('a', 'little'):", laplace_bigramProb.get(('a', 'little'), 0))

Bigram probability | non-smoothing for ('a', 'little'): 0.005245588936576061
Bigram probability | Laplace smoothing for ('a', 'little'): 0.002859259004082195


In [22]:
#Let's test my code with sample Ngrams -- Example Scratch code.
sentences = [[token if token in vocabDict_train else '<unk>' for token in s.split()] for s in trainData['sentence']]
for sent in sentences:
    sent.append('<STOP>')
bigram_list = [list(ngrams(sentence, 2, pad_left=True, left_pad_symbol='<START>')) for sentence in sentences]
cfd = nltk.ConditionalFreqDist(flatten(bigram_list))
print (cfd['a'].freq('little'))

0.005245588936576061


### **Trigram models.**

In [26]:
def trigram(trigram_freqDist, bigram_freqDist, vocabDict, sentenceCount):
  '''Calculate trigram probabilities for non-smoothing and Laplace smoothing.
  Return: Two dictionaries - non-smoothed trigram probabilities and Laplace-smoothed trigram probabilities.'''

  trigramProb = {}
  laplace_trigramProb = {}
  vocabSize = len(vocabDict)

  for first_twoWords in trigram_freqDist:
    #Since ('<START>','<START>') does not exist in bigram conditional frequency.
    if first_twoWords == ('<START>','<START>'):
      #Its count = Number of sentences in train data.
      first_two_wordCount = sentenceCount
    else:
      #Count of the first two words (context). If cannot retrieve, treat as 0.
      first_two_wordCount = bigram_freqDist.get((first_twoWords[0],), {}).get(first_twoWords[1], 0)

    for thirdW in trigram_freqDist[first_twoWords]:
      #Non-smoothed.
      prob = trigram_freqDist[first_twoWords].freq(thirdW)
      trigramProb[(first_twoWords[0], first_twoWords[1], thirdW)] = prob

      #Laplace smoothing.
      trigramCount = trigram_freqDist[first_twoWords][thirdW]
      laplaceProb = (trigramCount + 1) / (first_two_wordCount + vocabSize)
      laplace_trigramProb[(first_twoWords[0], first_twoWords[1], thirdW)] = laplaceProb

  return trigramProb, laplace_trigramProb

def unseenTrigram(firstW, secondW, thirdW, laplace_trigramProb, bigram_freqDist, vocabDict, sentenceCount):
  '''Calculate on the fly unseen bigram probabilities for Laplace smoothing.'''

  vocabSize = len(vocabDict)
  if firstW == '<START>' and secondW == '<START>':
    first_two_wordCount = sentenceCount
  else:
    first_two_wordCount = bigram_freqDist.get((firstW,), {}).get(secondW, 0)
  laplaceProb = 1 / (first_two_wordCount + vocabSize)

  return laplaceProb

In [27]:
trigramProb, laplace_trigramProb = trigram(trigram_freqDist, bigram_freqDist, vocabDict_train, trainData.shape[0])
print("Trigram probability | non-smoothing for ('a', 'union', 'leader'):", trigramProb.get(('a', 'union', 'leader'), 0))
print("Trigram probability | Laplace smoothing for ('a', 'union', 'leader'):", laplace_trigramProb.get(('a', 'union', 'leader'), 0))

Trigram probability | non-smoothing for ('a', 'union', 'leader'): 0.16666666666666666
Trigram probability | Laplace smoothing for ('a', 'union', 'leader'): 7.51653638003608e-05


In [28]:
#Let's test my code with sample Ngrams -- Example Scratch code.
sentences = [[token if token in vocabDict_train else '<unk>' for token in s.split()] for s in trainData['sentence']]
for sent in sentences:
    sent.append('<STOP>')
trigram_list = [list(ngrams(sentence, 3, pad_left=True, left_pad_symbol='<START>')) for sentence in sentences]
trigrams = flatten(trigram_list)
tupled3 = [(tri[:-1], tri[-1]) for tri in trigrams]
cfd3 = nltk.ConditionalFreqDist(tupled3)
print (cfd3[('a', 'union')].freq('leader'))

0.16666666666666666


## **Load Test Data**

In [29]:
testData = loadData("1b_benchmark_test_tokens.txt")
testData = gen_ngramsList(testData, vocabDict_train)
testData.head(1)

Number of (rows, columns): (12105, 1)


Unnamed: 0,sentence,unigrams,bigrams,trigrams
0,BAGHDAD -- An Iraqi military commander on Mond...,"[(BAGHDAD,), (--,), (An,), (Iraqi,), (military...","[(<START>, BAGHDAD), (BAGHDAD, --), (--, An), ...","[(<START>, <START>, BAGHDAD), (<START>, BAGHDA..."


## **Implement two versions of the perplexity function.**

### **(1) Without any smoothing technique**

### **Unigram Perplexity - Non-smoothing**

In [30]:
def unigramPerplexity(data, unigramProb):
  '''Calculate perplexity for the unigram model on sentence level.'''

  #Store perplexity score of each sentence.
  sentence_perplexityList = []
  for index, row in data.iterrows():
    #Unigrams from the 'unigrams' column.
    unigrams = row['unigrams']

    #Accumulator for sum(log(prob)).
    sum_logProb = 0
    #Total number of tokens/ngrams per sentence.
    n = 0

    for unigram in unigrams:
      #Extract context word.
      word = unigram[0]
      #Retrieve the probability of the token/ngram from the unigramProb dictionary.
      prob = unigramProb.get(word)
      #Take natural log then add to sum_logProb accumulator.
      sum_logProb += math.log(prob)
      #Count ngrams within the sentence.
      n += 1

    #Average negative log-likelihood for the sentence aka. multiply with (-1/n).
    sentenceLikelihood = -sum_logProb / n
    #Inverse multiplication in logarithmic space.
    sentencePerplexity = math.exp(sentenceLikelihood)
    sentence_perplexityList.append(sentencePerplexity)

  #Average of the entire test corpus.
  overallPerplexity = sum(sentence_perplexityList) / len(sentence_perplexityList)

  return overallPerplexity

In [31]:
unigramPerplexity_nonSmoothing_train = unigramPerplexity(trainData, unigramProb)
print(f"Train corpus Unigram Perplexity - without smoothing: {unigramPerplexity_nonSmoothing_train}")

Train corpus Unigram Perplexity - without smoothing: 1080.3611555581683


In [32]:
unigramPerplexity_nonSmoothing_test = unigramPerplexity(testData, unigramProb)
print(f"Test corpus Unigram Perplexity - without smoothing: {unigramPerplexity_nonSmoothing_test}")

Test corpus Unigram Perplexity - without smoothing: 998.7255840250589


### **Bigram Perplexity - Non-smoothing**

In [34]:
def bigramPerplexity(data, bigramProb, sentenceCount = None, vocabDict = None):
  '''Calculate perplexity for the bigram model on sentence level.
  Note: If vocabDict is provided, apply Laplace smoothing to calculate unseen bigrams probability.'''

  #Store perplexity score of each sentence.
  sentence_perplexityList = []

  for index, row in data.iterrows():
    #Bigrams from the 'bigrams' column.
    bigrams = row['bigrams']

    #Accumulator for sum(log(prob)).
    sum_logProb = 0
    #Total number of tokens/ngrams per sentence.
    n = 0

    for bigram in bigrams:
      firstW, secondW = bigram
      #Retrieve the probability of the token/ngram from the bigramProb dictionary.
      prob = bigramProb.get((firstW, secondW))
      #If probability cannot be retrieved + vocabDict is provided.
      if prob is None and vocabDict:
        #Calculate Laplace smoothing for unseen bigrams.
        prob = unseenBigram(firstW, secondW, bigramProb, vocabDict, sentenceCount)

      #If the bigram is found, calculate natural log as usual.
      #Otherwise, set logProb to a very small number near 0.0
      logProb = math.log(prob) if prob and prob > 0 else 1e-10

      #Add to sum_logProb accumulator.
      sum_logProb += logProb
      #Count ngrams within the sentence.
      n += 1

    #Average negative log-likelihood for the sentence aka. multiply with (-1/n).
    sentenceLikelihood = -sum_logProb / n
    #Inverse multiplication in logarithmic space.
    sentencePerplexity = math.exp(sentenceLikelihood)
    sentence_perplexityList.append(sentencePerplexity)

  #Average of the entire test corpus.
  overallPerplexity = sum(sentence_perplexityList) / len(sentence_perplexityList)

  return overallPerplexity

In [35]:
bigramPerplexity_nonSmoothing_train = bigramPerplexity(trainData, bigramProb)
print(f"Train corpus Bigram Perplexity - without smoothing: {bigramPerplexity_nonSmoothing_train}")

Train corpus Bigram Perplexity - without smoothing: 77.98881234050911


In [36]:
bigramPerplexity_nonSmoothing_test = bigramPerplexity(testData, bigramProb)
print(f"Test corpus Bigram Perplexity - without smoothing: {bigramPerplexity_nonSmoothing_test}")

Test corpus Bigram Perplexity - without smoothing: 31.565190424964392


### **Trigram Perplexity - Non-smoothing**

In [37]:
def trigramPerplexity(data, trigramProb, sentenceCount = None, bigram_freqDist=None, vocabDict = None):
  '''Calculate perplexity for the trigram model on sentence level.
  Note: If vocabDict is provided, apply Laplace smoothing to calculate unseen bigrams probability.'''

  #Store perplexity score of each sentence.
  sentence_perplexityList = []
  for index, row in data.iterrows():
    #Unigrams from the 'unigrams' column.
    trigrams = row['trigrams']
    sum_logProb = 0
    n = 0

    for trigram in trigrams:
      firstW, secondW, thirdW = trigram
      #Retrieve the probability of the token/ngram from the unigramProb dictionary.
      prob = trigramProb.get((firstW, secondW, thirdW))
      #If the tuple cannot be found, set logProb to a very small number near 0.0
      if prob is None and vocabDict and bigram_freqDist:
        prob = unseenTrigram(firstW, secondW, thirdW, laplace_trigramProb, bigram_freqDist, vocabDict, sentenceCount)

      #If the bigram is found, calculate natural log as usual.
      #Otherwise, set logProb to a very small number near 0.0
      logProb = math.log(prob) if prob and prob > 0 else 1e-10

      #Take natural log then add to sum_logProb accumulator.
      sum_logProb += logProb
      #Count ngrams within the sentence.
      n += 1

    #Average negative log-likelihood for the sentence aka. multiply with (-1/n).
    sentenceLikelihood = -sum_logProb / n
    #Inverse multiplication in logarithmic space.
    sentencePerplexity = math.exp(sentenceLikelihood)
    sentence_perplexityList.append(sentencePerplexity)

  #Average of the entire test corpus.
  overallPerplexity = sum(sentence_perplexityList) / len(sentence_perplexityList)

  return overallPerplexity

In [38]:
trigramPerplexity_nonSmoothing_train = trigramPerplexity(trainData, trigramProb)
print(f"Train corpus Trigram Perplexity - without smoothing: {trigramPerplexity_nonSmoothing_train}")

Train corpus Trigram Perplexity - without smoothing: 8.548195941609501


In [39]:
trigramPerplexity_nonSmoothing_test = trigramPerplexity(testData, trigramProb)
print(f"Test corpus Trigram Perplexity - without smoothing: {trigramPerplexity_nonSmoothing_test}")

Test corpus Trigram Perplexity - without smoothing: 3.4849884922982457


### **(2) Perplexity laplace() with the Laplace smoothing**

Note: Apply Laplace smoothing on training data --> Specifically on dictionaries to store frequency aka. probability **--> Update made on functions unigram(), bigram(), and trigram().**

### **Unigram Perplexity - Laplace smoothing**

In [40]:
unigramPerplexity_Smoothing_train = unigramPerplexity(trainData, laplace_unigramProb)
print(f"Train corpus Unigram Perplexity - Laplace smoothing: {unigramPerplexity_Smoothing_train}")

Train corpus Unigram Perplexity - Laplace smoothing: 1077.9944246016585


In [41]:
unigramPerplexity_Smoothing_test = unigramPerplexity(testData, laplace_unigramProb)
print(f"Test corpus Unigram Perplexity - Laplace smoothing: {unigramPerplexity_Smoothing_test}")

Test corpus Unigram Perplexity - Laplace smoothing: 998.2470901836243


### **Bigram Perplexity - Laplace smoothing**

In [42]:
bigramPerplexity_Smoothing_train = bigramPerplexity(trainData, laplace_bigramProb, trainData.shape[0], vocabDict_train)
print(f"Train corpus Bigram Perplexity - Laplace smoothing: {bigramPerplexity_Smoothing_train}")

Train corpus Bigram Perplexity - Laplace smoothing: 1557.067020566824


In [43]:
bigramPerplexity_Smoothing_test = bigramPerplexity(testData, laplace_bigramProb, trainData.shape[0], vocabDict_train)
print(f"Test corpus Bigram Perplexity - Laplace smoothing: {bigramPerplexity_Smoothing_test}")

Test corpus Bigram Perplexity - Laplace smoothing: 1864.0722543564689


### **Trigram Perplexity - Laplace smoothing**

In [44]:
trigramPerplexity_Smoothing_train = trigramPerplexity(trainData, laplace_trigramProb, trainData.shape[0], bigram_freqDist, vocabDict_train)
print(f"Train corpus Trigram Perplexity - Laplace smoothing: {trigramPerplexity_Smoothing_train}")

Train corpus Trigram Perplexity - Laplace smoothing: 6420.969049255399


In [45]:
trigramPerplexity_Smoothing_test = trigramPerplexity(testData, laplace_trigramProb, trainData.shape[0], bigram_freqDist, vocabDict_train)
print(f"Test corpus Trigram Perplexity - Laplace smoothing: {trigramPerplexity_Smoothing_test}")

Test corpus Trigram Perplexity - Laplace smoothing: 10160.796925518838


### **Display outputs**

In [46]:
perplexityRes_train = {'Model': ['Unigram', 'Bigram', 'Trigram', 'Unigram', 'Bigram', 'Trigram'],
                       'Smoothing': ['Non-smoothing', 'Non-smoothing', 'Non-smoothing', 'Laplace', 'Laplace', 'Laplace'],
                       'Dataset': ['Train', 'Train', 'Train', 'Train', 'Train', 'Train'],
                       'Perplexity': [unigramPerplexity_nonSmoothing_train,
                                      bigramPerplexity_nonSmoothing_train,
                                      trigramPerplexity_nonSmoothing_train,
                                      unigramPerplexity_Smoothing_train,
                                      bigramPerplexity_Smoothing_train,
                                      trigramPerplexity_Smoothing_train]}
perplexityData_train = pd.DataFrame(perplexityRes_train)
perplexityData_train

Unnamed: 0,Model,Smoothing,Dataset,Perplexity
0,Unigram,Non-smoothing,Train,1080.361156
1,Bigram,Non-smoothing,Train,77.988812
2,Trigram,Non-smoothing,Train,8.548196
3,Unigram,Laplace,Train,1077.994425
4,Bigram,Laplace,Train,1557.067021
5,Trigram,Laplace,Train,6420.969049


In [47]:
perplexityRes_test = {'Model': ['Unigram', 'Bigram', 'Trigram', 'Unigram', 'Bigram', 'Trigram'],
                      'Smoothing': ['Non-smoothing', 'Non-smoothing', 'Non-smoothing', 'Laplace', 'Laplace', 'Laplace'],
                      'Dataset': ['Test', 'Test', 'Test', 'Test', 'Test', 'Test'],
                      'Perplexity': [unigramPerplexity_nonSmoothing_test,
                                     bigramPerplexity_nonSmoothing_test,
                                     trigramPerplexity_nonSmoothing_test,
                                     unigramPerplexity_Smoothing_test,
                                     bigramPerplexity_Smoothing_test,
                                     trigramPerplexity_Smoothing_test]}
perplexityData_test = pd.DataFrame(perplexityRes_test)
perplexityData_test

Unnamed: 0,Model,Smoothing,Dataset,Perplexity
0,Unigram,Non-smoothing,Test,998.725584
1,Bigram,Non-smoothing,Test,31.56519
2,Trigram,Non-smoothing,Test,3.484988
3,Unigram,Laplace,Test,998.24709
4,Bigram,Laplace,Test,1864.072254
5,Trigram,Laplace,Test,10160.796926


#**Text Generation from Ngram Language Models**

### **Load and Merge of the data**

In [40]:
trainData2 = loadData("1b_benchmark_train_tokens.txt")
testData2 = loadData("1b_benchmark_test_tokens.txt")
mergedData = pd.concat([trainData2, testData2], ignore_index=True)
print(f'Number of (rows, columns): {mergedData.shape}')
mergedData.head(1)

Number of (rows, columns): (61530, 1)
Number of (rows, columns): (12105, 1)
Number of (rows, columns): (73635, 1)


Unnamed: 0,sentence
0,Having a little flexibility on that issue woul...


### **Get merged vocabulary dictionary**

In [41]:
vocabDict_merge = vocabDict(mergedData)
print(f"Number of unique tokens - without '<START>': {len(vocabDict_merge)}")
print('----First five tokens----')
for token, count in list(vocabDict_merge.items())[:5]:
    print(f"{token}: {count}")

Number of unique tokens - without '<START>': 29479
----First five tokens----
Having: 56
a: 37496
little: 549
flexibility: 17
on: 13744


### **Create ngrams**

In [42]:
mergedData = gen_ngramsList(mergedData, vocabDict_merge)
mergedData.head(1)

Unnamed: 0,sentence,unigrams,bigrams,trigrams
0,Having a little flexibility on that issue woul...,"[(Having,), (a,), (little,), (flexibility,), (...","[(<START>, Having), (Having, a), (a, little), ...","[(<START>, <START>, Having), (<START>, Having,..."


### **Creates a conditional frequency distribution**

In [43]:
bigram_freqDist_merge = gen_ConditionalFreqDist(mergedData, 'bigrams')
print("Bigram ('Clinton', 'leading') frequency:", bigram_freqDist_merge[('Clinton',)]['leading'])

Bigram ('Clinton', 'leading') frequency: 1


In [44]:
trigram_freqDist_merge = gen_ConditionalFreqDist(mergedData, 'trigrams')
print("Trigram ('I', 'believe', 'he') frequency:", trigram_freqDist_merge[('I', 'believe')]['he'])

Trigram ('I', 'believe', 'he') frequency: 1


### **Unigram model**

In [45]:
unigramProb_merge, laplace_unigramProb_merge = unigram(mergedData)
print("Unigram probability | non-smoothing for 'Reihana':", unigramProb_merge.get('Reihana', 0))
print("Unigram probability | Laplace smoothing for 'Reihana':", laplace_unigramProb_merge.get('Reihana', 0))

Unigram probability | non-smoothing for 'Reihana': 1.5454413857869878e-06
Unigram probability | Laplace smoothing for 'Reihana': 2.02976446613135e-06


### **Bigram model**

In [46]:
bigramProb_merge, laplace_bigramProb_merge = bigram(bigram_freqDist_merge, vocabDict_merge, mergedData.shape[0])
print("Bigram probability | non-smoothing for ('a', 'little'):", bigramProb_merge.get(('a', 'little'), 0))
print("Bigram probability | Laplace smoothing for ('a', 'little'):", laplace_bigramProb_merge.get(('a', 'little'), 0))

Bigram probability | non-smoothing for ('a', 'little'): 0.005280563260081075
Bigram probability | Laplace smoothing for ('a', 'little'): 0.002971257932064203


### **Trigram model**

In [47]:
trigramProb_merge, laplace_trigramProb_merge = trigram(trigram_freqDist_merge, bigram_freqDist_merge, vocabDict_merge, mergedData.shape[0])
print("Trigram probability | non-smoothing for ('a', 'union', 'leader'):", trigramProb_merge.get(('a', 'union', 'leader'), 0))
print("Trigram probability | Laplace smoothing for ('a', 'union', 'leader'):", laplace_trigramProb_merge.get(('a', 'union', 'leader'), 0))

Trigram probability | non-smoothing for ('a', 'union', 'leader'): 0.14285714285714285
Trigram probability | Laplace smoothing for ('a', 'union', 'leader'): 6.782880010852608e-05


### **Speedy Choice**

In [48]:
def gen_greedyChoice(probDict, ngram='unigram', wordCount=100):
  '''Generate a sentence using the ngram model output.
  Logic: Greedy choice, next word is selected based on maximum probability with context word.
  Params: probDict: Dictionary containing ngram probabilities (unigram, bigram, or trigram).
          ngram: Type of ngram model - unigram, bigram, trigram.
          wordCount: Maximum word to generate.
  Note: probDict are result from Ngrams models.'''

  if ngram == 'unigram':
    text = []
  elif ngram == 'bigram':
    text = ['<START>']
  elif ngram == 'trigram':
    text = ['<START>', '<START>']

  while len(text) < wordCount:
    if ngram == 'unigram':
      #Note: unigramProb = {word1: prob}
      #Choose next word based on highest probability.
      nextWord = max(probDict, key=probDict.get)
      text.append(nextWord)

    elif ngram == 'bigram':
      #Note: bigramProb = {(word1, word2): prob}
      currentWord = text[-1]
      #Find the tuple with the maximum probability.
      nextWord = max(((secondW, prob) for (firstW, secondW), prob in probDict.items() if firstW == currentWord),
                 key=lambda x: x[1], default=(None, 0))[0]
      text.append(nextWord)

    elif ngram == 'trigram':
      #Note: trigramProb = {(word1, word2, word3): prob}
      firstW, secondW = text[-2], text[-1]
      #Find the tuple with the maximum probability.
      nextWord = max(((third, prob) for (first, second, third), prob in probDict.items() if first == firstW and second == secondW),
                 key=lambda x: x[1], default=(None, 0))[0]
      text.append(nextWord)

  return ' '.join(text)

In [49]:
unigramSentence_greedyChoice = gen_greedyChoice(unigramProb_merge, ngram='unigram')
bigramSentence_greedyChoice = gen_greedyChoice(bigramProb_merge, ngram='bigram')
trigramSentence_greedyChoice = gen_greedyChoice(trigramProb_merge, ngram='trigram')

print('Unigram Greedy Choice Sentence:')
print(f'{unigramSentence_greedyChoice}')
print('Bigram Greedy Choice Sentence:')
print(f'{bigramSentence_greedyChoice}')
print('Trigram Greedy choice Sentence:')
print(f'{trigramSentence_greedyChoice}')

Unigram Greedy Choice Sentence:
the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Bigram Greedy Choice Sentence:
<START> The <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> , the <unk> ,
Trigram Greedy choice Sentence:
<START> <START> The <unk> <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <unk> , <un

### **Random Sampling**

In [50]:
def gen_randomSampling(probDict, ngram='unigram', wordCount=100):
  '''Generate a sentence using the ngram model output.
  Logic: Random sampling, next word is randomly selected based on context word.
  Params: probDict: Dictionary containing ngram probabilities (unigram, bigram, or trigram).
          ngram: Type of ngram model - unigram, bigram, trigram.
          wordCount: Maximum word to generate.
  Note: probDict are result from Ngrams models.
        Early stopping apply, end sentence with <STOP>'''

  if ngram == 'unigram':
    text = []
  elif ngram == 'bigram':
    text = ['<START>']
  elif ngram == 'trigram':
    text = ['<START>', '<START>']

  while len(text) < wordCount:
    if ngram == 'unigram':
      #Create two tuples, one for word, one for corresponding probability.
      words, probs = zip(*probDict.items())
      #Select a random word.
      nextWord = random.choices(words, probs)[0]
      #Early stopping.
      if nextWord == '<STOP>':
        text.append(nextWord)
        break
      text.append(nextWord)

    elif ngram == 'bigram':
      currentWord = text[-1]
      #List of next words based on previous context word.
      next_wordList = [(pair[1], prob) for pair, prob in probDict.items() if pair[0] == currentWord]
      #If there is no word associated with context word.
      if not next_wordList:
        text.append('<STOP>')
        break
      #Select a random word.
      words, probs = zip(*next_wordList)
      nextWord = random.choices(words, probs)[0]
      if nextWord == '<STOP>':
        text.append(nextWord)
        break
      text.append(nextWord)

    elif ngram == 'trigram':
      firstW, secondW = text[-2], text[-1]
      #List of next words based on previous context words.
      next_wordList = [(pair[2], prob) for pair, prob in probDict.items() if pair[0] == firstW and pair[1] == secondW]
      #If there is no word associated with context words.
      if not next_wordList:
        text.append('<STOP>')
        break
      #Select a random word.
      words, probs = zip(*next_wordList)
      nextWord = random.choices(words, probs)[0]
      if nextWord == '<STOP>':
        text.append(nextWord)
        break
      text.append(nextWord)

  return ' '.join(text)

In [51]:
unigramSentence_randomSampling = gen_randomSampling(unigramProb_merge, ngram='unigram')
bigramSentence_randomSampling = gen_randomSampling(bigramProb_merge, ngram='bigram')
trigramSentence_randomSampling = gen_randomSampling(trigramProb_merge, ngram='trigram')
print('Unigram Random Sampling Sentence:')
print(f'{unigramSentence_randomSampling}')
print('Bigram Random Sampling Sentence:')
print(f'{bigramSentence_randomSampling}')
print('Trigram Random Sampling Sentence:')
print(f'{trigramSentence_randomSampling}')

Unigram Random Sampling Sentence:
be she Saturday . Web night ) , of -- he fourth at or surveys in him field Berlusconi the groups pillows top dark revised at continue the a starts a base ? arrested fortune the house the have wrong Katherine below Republicans No-one " 's , thrown with 've paint a Yeah to . him be would been he dozen - the 2010 <STOP>
Bigram Random Sampling Sentence:
<START> The Italian Sara <unk> to bubble , the real estate built on suspension of her late Wednesday in income countries . <STOP>
Trigram Random Sampling Sentence:
<START> <START> more alarm capacity than ever before to the black and Asian equity derivatives businesses ; and Western nations to decide , the two men met in <unk> Services as it was like before the start . <STOP>


### **Top-p nucleus sampling -- experiment with  different p**

In [52]:
def gen_topP_sampling(probDict, p=0.4, ngram='unigram', wordCount=100):
  '''Generate a sentence using the ngram model output.
  Logic: Top-p nucleus sampling, next word is randomly selected based on context word.
         Where next word pool is based on sum of their probabilities atlest equals to p.
  Params: probDict: Dictionary containing ngram probabilities (unigram, bigram, or trigram).
          p: Cumulative probability threshold for nucleus sampling.
          ngram: Type of ngram model - unigram, bigram, trigram.
          wordCount: Maximum word to generate.
  Note: probDict are result from Ngrams models.
        Early stopping apply, end sentence with <STOP>'''

  if ngram == 'unigram':
    text = []
  elif ngram == 'bigram':
    text = ['<START>']
  elif ngram == 'trigram':
    text = ['<START>', '<START>']

  while len(text) < wordCount:
    if ngram == 'unigram':
      #Sort by probability in descending order.
      sorted_probDict = sorted(probDict.items(), key=lambda x: x[1], reverse=True)

      #Get top tokens.
      cumulativeProb = 0
      next_wordList = []
      for word, prob in sorted_probDict:
        next_wordList.append((word, prob))
        cumulativeProb += prob
        if cumulativeProb >= p:
          break
      #Select a random word.
      words, probs = zip(*next_wordList)
      nextWord = random.choices(words, probs)[0]
      if nextWord == '<STOP>':
        text.append(nextWord)
        break
      text.append(nextWord)

    elif ngram == 'bigram':
      currentWord = text[-1]
      #List of all (next words, probability) based on context.
      next_wordList = [(pair[1], prob) for pair, prob in probDict.items() if pair[0] == currentWord]
      if not next_wordList:
        text.append('<STOP>')
        break
      #Sort by probability in descending order and get top tokens.
      next_wordList.sort(key=lambda x: x[1], reverse=True)
      cumulativeProb = 0
      final_next_wordList = []
      for word, prob in next_wordList:
        final_next_wordList.append((word, prob))
        cumulativeProb += prob
        if cumulativeProb >= p:
          break
      #Select a random word.
      words, probs = zip(*final_next_wordList)
      nextWord = random.choices(words, probs)[0]
      if nextWord == '<STOP>':
        text.append(nextWord)
        break
      text.append(nextWord)

    elif ngram == 'trigram':
      firstW, secondW = text[-2], text[-1]
      #List of all (next words, probability) based on context.
      next_wordList = [(pair[2], prob) for pair, prob in probDict.items() if pair[0] == firstW and pair[1] == secondW]
      if not next_wordList:
        text.append('<STOP>')
        break
      #Sort by probability in descending order and get top tokens.
      next_wordList.sort(key=lambda x: x[1], reverse=True)
      cumulativeProb = 0
      final_next_wordList = []
      for word, prob in next_wordList:
        final_next_wordList.append((word, prob))
        cumulativeProb += prob
        if cumulativeProb >= p:
          break
      #Select a random word.
      words, probs = zip(*final_next_wordList)
      nextWord = random.choices(words, probs)[0]
      if nextWord == '<STOP>':
        text.append(nextWord)
        break
      text.append(nextWord)

  return ' '.join(text)

In [53]:
unigramSentence_topP_sampling = gen_topP_sampling(unigramProb_merge, p=0.8, ngram='unigram')
bigramSentence_topP_sampling = gen_topP_sampling(bigramProb_merge, p=0.8, ngram='bigram')
trigramSentence_topP_sampling = gen_topP_sampling(trigramProb_merge, p=0.8, ngram='trigram')
print('Unigram Top-p Sampling Sentence:')
print(f'{unigramSentence_topP_sampling}')
print('Bigram Top-p Sampling Sentence:')
print(f'{bigramSentence_topP_sampling}')
print('Trigram Top-p Sampling Sentence:')
print(f'{trigramSentence_topP_sampling}')

Unigram Top-p Sampling Sentence:
technology 17 response <unk> of the for <unk> . tour problem property to Democratic . would lost other . the London Texas , in to the I from to unemployment . including last of air up is , new won about than . <unk> the talking not night who base " a ' given to , statement for girls his , has <STOP>
Bigram Top-p Sampling Sentence:
<START> ( US Open in their long-running pay for five points out of his brother in Afghanistan and learn what has approved by letter of the busy . <STOP>
Trigram Top-p Sampling Sentence:
<START> <START> City may have <unk> the sites for <unk> <unk> has said that for women self-confidence and weight were closely connected to the United States ' 2008 presidential campaign and asked for extra features , but don 't know how long a turkey shoot on Valentine 's Day , 2004 , he saw short track for a skinny girl to share it with a <unk> , a role model , with the gas prices . <STOP>


In [56]:
!apt-get -qq install -y pandoc > /dev/null 2>&1
!apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!jupyter nbconvert --to pdf "/content/drive/MyDrive/CSC583/CSC583 - Assignment 4.ipynb" > /dev/null 2>&1