# Assignment 1

- Sampad Kumar Kar
- MCS202215

# 0. Imports

In [1]:
# to handle files
import json
import os
import glob

# import regular expressions
import re

# pre-processing imports
import numpy as np
import pandas as pd

# NLKT preprocessing imports
from nltk import download
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# for plotting
import matplotlib.pyplot as plt

# to view loop progress
from tqdm import tqdm

# to generate random no.s
import random

# to save dictionary
import pickle

# import priority queue
import heapq

In [2]:
# Download required NLTK data
download('stopwords')
download('wordnet')
download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sampadk04/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sampadk04/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sampadk04/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Corpus Creation

In [2]:
def extract_data(filename):
    '''
    Input: file path
    Return: title, abstract, content

    Description: extracts the title, abstract and body from the paper
    '''
    file = open(filename)
    content = json.load(file)
    file.close()

    # init empty strings to store data
    title, abstract, body_text = "", "", ""

    if 'title' in content['metadata']:
        title = content['metadata']['title']
    
    if 'abstract' in content:
        for abstract_data in content['abstract']:
            abstract += abstract_data['text']
    
    if 'body_text' in content:
        for text_snippet in content['body_text']:
            if 'text' in text_snippet:
                body_text += text_snippet['text']

    return title.lower(), abstract.lower(), body_text.lower()

In [3]:
folder_path = os.path.join(os.pardir, 'data', 'pdf_json')
folder_path

'../data/pdf_json'

In [4]:
json_file_paths = glob.glob(os.path.join(folder_path, '*.json'))
print("Total # of .json files:", len(json_file_paths))

Total # of .json files: 56528


In [5]:
# checking the 'extract_data function'
# ignore this cell
sample_file_path = os.path.join(os.pardir, 'data', 'sample_file.json')

title, abstract, content = extract_data(sample_file_path)
print("Title:", title)
print("")
print("Abstract:", abstract)
print("")
print("Content:", content[:1000])
print("")

Title: an extended outbreak of infectious peritonitis in a closed colony of european wildcats (fells silvestris)

Abstract: feline infectious peritonitis is a multisystemic disease of domestic and exotic cats caused by a coronavirus. an outbreak of feline infectious peritonitis was investigated in a closed colony of european wildcats (fells silvestris) at a zoological garden. over a six-year period, a putative fading kitten syndrome occurred in six of 11 litters born and severe lesions of infectious peritonitis occurred in five of the eight wildcats retained in the colony during this period. lesions were more acute in the early stages of the outbreak and included perivascular pyogranulomatons inflammation with exudative serositis. lesions occurred only in males. vascular lesions were common in the liver of all affected wildcats, serositis occurred in the abdominal and thoracic cavities in most cases and meningeal lesions were present in two cases. immunohistochemistry with specific ant

# 2. Preprocessing

In [5]:
# we define various functions for pre-processing

def remove_non_alpha(input_text):
    '''
    Removes non alphabets (including numbers, punctuations, special characters etc.)
    '''
    tokenizer = RegexpTokenizer("[A-Za-z_]+")
    new_text = tokenizer.tokenize(input_text)
    new_text = " ".join(new_text)

    return new_text


def remove_non_alphanumeric_pt(input_text):
    '''
    Removes non alphabets and non numerics and non (. ! ?) entities (special characters and other punctuations etc.)
    '''
    # Define a regular expression to match words, numbers, and full-stops
    pattern = re.compile(r'[a-zA-Z0-9_.?!]+')
    
    tokenizer = RegexpTokenizer(pattern)
    new_text = tokenizer.tokenize(input_text)
    new_text = " ".join(new_text)

    return new_text

def remove_html(input_text):
    '''
    Removes html links from the document
    '''
    # define regular exp to match html urls and replace them with ""
    new_text = re.sub(r'(http[s]?://|www\.)[^\s]+', "", input_text)
    return new_text

def word_lemmatizer(input_text):
    '''
    This performs word lemmatization, converts words to meaningful base form
    '''
    lemmatizer = WordNetLemmatizer()
    new_text = [lemmatizer.lemmatize(word) for word in input_text.split()]
    new_text = " ".join(new_text)

    return new_text

def remove_stopwords(input_text):
    '''
    This removes stopwords from the text, useful for problems like sentiment analysis
    
    Note: the input_text should contain only lowercase words
    '''
    # form a set stopwords
    stopwords_set = set(stopwords.words("english"))
    new_text = [w for w in input_text.split() if w not in stopwords_set]
    new_text = " ".join(new_text)

    return new_text

In [6]:
# combining all the functions to create a relevant corpus

def text_cleaner_1(input_text):
    '''
    Use this to deep clean the texts for problems like sentiment analysis
    '''
    new_text = remove_html(input_text)
    new_text = remove_non_alpha(new_text)
    new_text = word_lemmatizer(new_text)

    return new_text.strip()

In [7]:
# combining all the functions to create a relevant corpus

def text_cleaner_2(input_text, sentence_breaker=False):
    '''
    Use this to shallow clean the texts for n-gram models and sentence completion

    sentence_breaker: If this is true, this adds sentence breaker ' # ' before every sentence; (useful while building language models)
    '''
    new_text = remove_html(input_text)
    new_text = remove_non_alphanumeric_pt(new_text)
    new_text = new_text.strip()

    if sentence_breaker:
        sentence_list = []
        sentences = re.split("[.!?]", new_text)
        for sentence in sentences:
            sentence_list.append(" # " + sentence)
        # join the list of sentences into a single text
        new_text = ''.join(sentence_list)

    return new_text

## Saving the texts

In [8]:
# save the texts into a .txt file

def save_corpus(json_file_paths, txt_save_path, type):
    '''
    Saves text into .txt file

    type=1: uses text_cleaner_1, deep clean
    type=2: uses text_cleaner_2, shallow clean
    type=3: uses text_cleaner_3, shallow cleaning + sentence separation
    '''
    corpus = open(txt_save_path, 'w')

    for json_file_path in tqdm(json_file_paths):
        # extract title, abstract, body
        title, abstract, body = extract_data(json_file_path)
        
        # update after cleaning
        if type==1:
            # use text_cleaner_1 to deep clean everything
            title, abstract, body = text_cleaner_1(title), text_cleaner_1(abstract), text_cleaner_1(body)

            # combine the texts
            total_content = title + abstract + body + '\n'
            
            # write into the .txt file
            corpus.write(total_content)

        elif type==2:
            # use text_cleaner_1 for titles, to deep clean
            title = text_cleaner_1(title) + '. '
            # use text_cleaner_2 for abstract, body to shallow clean
            abstract = text_cleaner_2(abstract)
            body = text_cleaner_2(body)

            # combine the texts
            total_content = title + abstract + body + '\n'

            # write into the .txt file
            corpus.write(total_content)        
        elif type==3:
            # use text_cleaner_1 for titles, to deep clean
            title = ' # ' + text_cleaner_1(title)
            # use text_cleaner_3 for abstract, body to shallow clean
            abstract = text_cleaner_2(abstract, sentence_breaker=True)
            body = text_cleaner_2(body, sentence_breaker=True)

            # combine the texts
            total_content = title + abstract + body + '\n'

            # write into the .txt file
            corpus.write(total_content)
        else:
            print("Choose either type== 1/2/3")

    corpus.close()

In [9]:
my_corpus_folder_path = os.path.join(os.pardir, 'data', 'my_corpus')
my_corpus_folder_path

'../data/my_corpus'

In [10]:
# path of the corpus text files
corpus_1_path = os.path.join(my_corpus_folder_path, 'corpus_1.txt')
corpus_2_path = os.path.join(my_corpus_folder_path, 'corpus_2.txt')
corpus_3_path = os.path.join(my_corpus_folder_path, 'corpus_3.txt')

In [136]:
save_corpus(json_file_paths, corpus_1_path, type=1)

100%|██████████| 56528/56528 [12:42<00:00, 74.13it/s] 


In [16]:
save_corpus(json_file_paths, corpus_2_path, type=2)

100%|██████████| 56528/56528 [01:52<00:00, 503.40it/s]


In [33]:
save_corpus(json_file_paths, corpus_3_path, type=3)

100%|██████████| 56528/56528 [02:02<00:00, 460.50it/s]


# 3. Finding Vocabulary Count

In [17]:
def find_words_freq(corpus_path):
    corpus_vocab = set()
    corpus_vocab_freq = dict()
    paper_count = 0

    # initialize the tokenizer
    tokenizer = RegexpTokenizer(r'\w+|\S+')

    corpus = open(corpus_path, 'r')
    
    # read the papers one after another and process
    # Note: we consider sentence[:-1] to remove the '\n' character
    paper_data = corpus.readline()[:-1]

    # run the loop till we reach the end of the document
    while paper_data:
        # tokenize the paper
        paper_words = tokenizer.tokenize(paper_data)
        # process word by word
        for word in paper_words:
            if word not in corpus_vocab:
                corpus_vocab.add(word)
                corpus_vocab_freq[word] = 1
            else:
                corpus_vocab_freq[word] += 1
        
        # read the next paper_data
        paper_data = corpus.readline()[:-1]

        # increase paper_count
        paper_count += 1
        # print info to keep track of progress
        if paper_count%10000 == 0:
            print("-"*20)
            print(f"Document #: {paper_count}; Vocabulary Size: {len(corpus_vocab)}")

    corpus.close()

    return corpus_vocab, corpus_vocab_freq

In [18]:
corpus_2_vocab, corpus_2_freq = find_words_freq(corpus_2_path)

print("Corpus-2 Vocabulary Size:", len(corpus_2_vocab))

--------------------
Document #: 10000; Vocabulary Size: 429377
--------------------
Document #: 20000; Vocabulary Size: 660534
--------------------
Document #: 30000; Vocabulary Size: 846587
--------------------
Document #: 40000; Vocabulary Size: 1024412
--------------------
Document #: 50000; Vocabulary Size: 1170000
Corpus-2 Vocabulary Size: 1260153


# 4. Bigram and Trigram Language Models

## 4.1 `n-gram` Language Model

In [11]:
from collections import Counter
from nltk.tokenize import word_tokenize

def generate_ngrams(corpus_path, n=2, smoothing_factor=1):
    ngram_token_freq = Counter()
    corpus_vocab = set()
    corpus_vocab_size = 0
    paper_count = 0
    # to be used as denominator of laplacian smoothing
    token_count = 0
    
    corpus = open(corpus_path, 'r')
    # read the papers one after another and process
    # Note: we consider sentence[:-1] to remove the '\n' character
    paper_data = corpus.readline()[:-1]
    
    # run the loop till we reach the end of the document
    while paper_data:
        # tokenize the paper
        paper_words = word_tokenize(paper_data)
        
        # extract this as n-grams
        for i in range(len(paper_words)-n+1):
            ngram = tuple(paper_words[i:i+n])
            # update the ngram token freq
            ngram_token_freq[ngram] += 1
        
        # update the vocab
        for word in paper_words:
            if word not in corpus_vocab:
                corpus_vocab.add(word)
                corpus_vocab_size += 1
        
        # read the next paper_data
        paper_data = corpus.readline()[:-1]

        # increase paper_count
        paper_count += 1

        # print info to keep track of progress
        if paper_count%10000 == 0:
            print("-"*20)
            print(f"Document #: {paper_count}; Vocabulary Size: {len(corpus_vocab)}")
    
    corpus.close()

    # conduct laplacian smoothing
    for ngram in ngram_token_freq:
        ngram_token_freq[ngram] += smoothing_factor
        # update the token_count for smoothing denominator
        token_count += ngram_token_freq[ngram]

    # prob
    ngram_token_prob = ngram_token_freq.copy()
    
    # log(a/b) = log(a) - log(b)
    for ngram in ngram_token_freq:
        ngram_token_prob[ngram] = ngram_token_freq[ngram]/token_count

    return ngram_token_prob, corpus_vocab, corpus_vocab_size, token_count

## 4.1 Bigram Language Model

In [32]:
# corpus path
corpus_3_path = os.path.join(my_corpus_folder_path, 'corpus_3.txt')

# Generate the bi-gram language model
bigram_model, bigram_vocab, bigram_vocab_size, bigram_token_count = generate_ngrams(corpus_path=corpus_3_path, n=2, smoothing_factor=1)

--------------------
Document #: 10000; Vocabulary Size: 400690
--------------------
Document #: 20000; Vocabulary Size: 615733
--------------------
Document #: 30000; Vocabulary Size: 787940
--------------------
Document #: 40000; Vocabulary Size: 952844
--------------------
Document #: 50000; Vocabulary Size: 1087097


In [14]:
bigram_model_path = os.path.join('ngram_models', 'bigram_model.pickle')
bigram_vocab_path = os.path.join('ngram_models', 'bigram_vocab.pickle')

In [36]:
# to save the models

fh = open(bigram_model_path, "wb")
pickle.dump(bigram_model, fh)
fh.close()


fh = open(bigram_vocab_path, "wb")
pickle.dump(bigram_vocab, fh)
fh.close()

In [21]:
# to load the models

fh = open(bigram_model_path, 'rb')
bigram_model = pickle.load(fh)
fh.close()

fh = open(bigram_vocab_path, 'rb')
bigram_vocab = pickle.load(fh)
fh.close()

In [34]:
print("Bigram Vocabulary Size:", bigram_vocab_size)
print("Bigram Token Count:", bigram_token_count)

Bigram Vocabulary Size: 1170327
Bigram Token Count: 270756039


## 4.2 Trigram Language Model

In [16]:
# corpus path
corpus_3_path = os.path.join(my_corpus_folder_path, 'corpus_3.txt')

# Generate the bi-gram language model
trigram_model, trigram_vocab, trigram_vocab_size, trigram_token_count = generate_ngrams(corpus_path=corpus_3_path, n=3, smoothing_factor=1)

--------------------
Document #: 10000; Vocabulary Size: 400690
--------------------
Document #: 20000; Vocabulary Size: 615733
--------------------
Document #: 30000; Vocabulary Size: 787940
--------------------
Document #: 40000; Vocabulary Size: 952844
--------------------
Document #: 50000; Vocabulary Size: 1087097


In [12]:
trigram_model_path = os.path.join('ngram_models', 'trigram_model.pickle')
trigram_vocab_path = os.path.join('ngram_models', 'trigram_vocab.pickle')

In [17]:
# to save the models

fh = open(trigram_model_path, "wb")
pickle.dump(trigram_model, fh)
fh.close()

fh = open(trigram_vocab_path, "wb")
pickle.dump(trigram_vocab, fh)
fh.close()

In [13]:
# to load the models

fh = open(trigram_model_path, 'rb')
trigram_model = pickle.load(fh)
fh.close()

fh = open(trigram_vocab_path, 'rb')
trigram_vocab = pickle.load(fh)
fh.close()

In [18]:
print("Trigram Vocabulary Size:", trigram_vocab_size)
print("Trigram Token Count:", trigram_token_count)

Trigram Vocabulary Size: 1170327
Trigram Token Count: 335690076


# 5. Predicting Missing Texts

In [15]:
def find_next(ngram_model, ngram_vocab, word_tuple):
    best_prob = -np.inf
    best_word = ""
    for word in ngram_vocab:
        ngram = word_tuple + (word,)
        if ngram in ngram_model:
            word_found = True
            curr_prob = ngram_model[ngram]

            # update best word
            if best_prob < curr_prob:
                best_prob = curr_prob
                best_word = word
    
    # if no ngram could be formed
    while not best_word:
        # output random word
        best_word = random.sample(ngram_vocab, 1)[0]
        if not best_word.isalpha():
            best_word=""
    
    return best_word

In [16]:
from nltk.tokenize import word_tokenize

def fill_blank(sentence_before, sentence_after, n, smoothing_factor, ngram_model, ngram_vocab, ngram_token_count, keep_top_k=10):
    # convert the sentences to tuples
    sentence_tuple_before = tuple(word_tokenize(text_cleaner_2(sentence_before)))
    sentence_tuple_after = tuple(word_tokenize(text_cleaner_2(sentence_after)))
    
    # add the sentence breaker
    sentence_tuple_before = ('#',) + sentence_tuple_before
    sentence_tuple_after = sentence_tuple_after + ('#',)
    
    # resize sentence tuples before/after
    sentence_tuple_before = sentence_tuple_before[-(n-1):]
    sentence_tuple_after = sentence_tuple_after[:(n-1)]
    
    # default smoothing prob for non-existent ngram tuples
    # default_prob = smoothing_factor/ngram_token_count
    default_logprob = np.log(smoothing_factor) - np.log(ngram_token_count)
    
    best_logprob_sum = n*default_logprob
    best_word = ""

    k = keep_top_k
    top_k_queue = []
    
    for word in ngram_vocab:
        sentence_tuple = sentence_tuple_before + (word,) + sentence_tuple_after

        curr_logprob_sum = 0
        for i in range(n):
            curr_logprob = default_logprob
            curr_tuple = sentence_tuple[i:i+n]
            if curr_tuple in ngram_model:
                curr_logprob = np.log(ngram_model[curr_tuple])
            curr_logprob_sum += curr_logprob
        
        # update top-k for alphabets
        if word.isalpha() or word=='#':
            if len(top_k_queue) < k:
                heapq.heappush(top_k_queue, (curr_logprob_sum, word))
            else:
                smallest_logprob_sum = top_k_queue[0][0]
                if smallest_logprob_sum < curr_logprob_sum:
                    heapq.heappop(top_k_queue)
                    heapq.heappush(top_k_queue, (curr_logprob_sum, word))


        # update current
        if best_logprob_sum < curr_logprob_sum:
            best_logprob_sum = curr_logprob_sum
            best_word = word
    
    # if no ngram could be formed
    while not best_word:
        # output random word
        best_word = random.sample(ngram_vocab, 1)[0]
        if not best_word.isalpha() or best_word!='#':
            best_word=""
    

    top_k_queue = sorted(top_k_queue, reverse=True)
    print("Top " + str(k) + " words:")
    for (logprob_sum, word) in top_k_queue:
        print(word + "               || LogProbSum: " + str(logprob_sum))
    
    print("\nBest replacement:", best_word)
    
    return best_word

## 5.1 Prediction using `bigram`

In [123]:
# all houses were ----- ventilated
_ = fill_blank(
    sentence_before="all houses were",
    sentence_after="ventilated",
    n=2,
    smoothing_factor=1,
    ngram_model=bigram_model,
    ngram_vocab=bigram_vocab,
    ngram_token_count=bigram_token_count,
    keep_top_k=10
)

Top 10 words:
in               || LogProbSum: -24.178234921005483
the               || LogProbSum: -25.0418257824373
not               || LogProbSum: -25.17283388945549
mechanically               || LogProbSum: -25.772118821044984
a               || LogProbSum: -26.423424119183927
of               || LogProbSum: -26.50056719523603
well               || LogProbSum: -26.78515038997943
non               || LogProbSum: -26.9383355504725
then               || LogProbSum: -26.99519296801243
all               || LogProbSum: -27.118759216463545

Best replacement: in


In [124]:
# it aims to develop an integrated ------ to reach mmps exposed to malaria with prevention diagnosis and treatment
_ = fill_blank(
    sentence_before="it aims to develop an integrated",
    sentence_after="to reach mmps exposed to malaria with prevention diagnosis and treatment",
    n=2,
    smoothing_factor=1,
    ngram_model=bigram_model,
    ngram_vocab=bigram_vocab,
    ngram_token_count=bigram_token_count,
    keep_top_k=10
)

Top 10 words:
and               || LogProbSum: -21.97138030600042
#               || LogProbSum: -22.22456839272651
approach               || LogProbSum: -23.3935481010631
model               || LogProbSum: -24.782088564380153
response               || LogProbSum: -24.839986544580505
data               || LogProbSum: -25.31097929839438
as               || LogProbSum: -25.32580450790242
system               || LogProbSum: -25.332981023426925
in               || LogProbSum: -25.34902375675157
care               || LogProbSum: -25.527458426268524

Best replacement: and


In [125]:
# malaria with prevention diagnosis and treatment ------ by involving non-health
_ = fill_blank(
    sentence_before="malaria with prevention diagnosis and treatment",
    sentence_after="by involving non-health",
    n=2,
    smoothing_factor=1,
    ngram_model=bigram_model,
    ngram_vocab=bigram_vocab,
    ngram_token_count=bigram_token_count,
    keep_top_k=10
)

Top 10 words:
#               || LogProbSum: -18.55200510901936
and               || LogProbSum: -20.09305360213994
or               || LogProbSum: -22.06827677129898
is               || LogProbSum: -22.52638125949056
for               || LogProbSum: -22.626105864672997
of               || LogProbSum: -22.69998231919469
caused               || LogProbSum: -23.3427265291346
followed               || LogProbSum: -23.493684634711894
induced               || LogProbSum: -23.83545623113725
with               || LogProbSum: -24.04668101888194

Best replacement: #


In [126]:
# by involving non-health ----- stakeholders from provincial to community level
_ = fill_blank(
    sentence_before="by involving non health",
    sentence_after="stakeholders from provincial to community level",
    n=2,
    smoothing_factor=1,
    ngram_model=bigram_model,
    ngram_vocab=bigram_vocab,
    ngram_token_count=bigram_token_count,
    keep_top_k=10
)

Top 10 words:
and               || LogProbSum: -22.93228801500535
of               || LogProbSum: -24.000938853931473
#               || LogProbSum: -24.26434220905029
care               || LogProbSum: -25.6829149089419
the               || LogProbSum: -26.278684116810048
system               || LogProbSum: -26.984309826275307
with               || LogProbSum: -27.406874541463374
to               || LogProbSum: -27.566534877101603
for               || LogProbSum: -27.70441668196615
related               || LogProbSum: -28.02984924536389

Best replacement: and


## 5.2 Prediction using `trigram`

In [17]:
# all houses were ----- ventilated
_ = fill_blank(
    sentence_before="all houses were",
    sentence_after="ventilated",
    n=3,
    smoothing_factor=1,
    ngram_model=trigram_model,
    ngram_vocab=trigram_vocab,
    ngram_token_count=trigram_token_count,
    keep_top_k=10
)

Top 10 words:
mechanically               || LogProbSum: -49.05550787152691
not               || LogProbSum: -55.49389931655065
invasively               || LogProbSum: -55.59925983220849
well               || LogProbSum: -55.89936442465881
and               || LogProbSum: -55.89936442465882
then               || LogProbSum: -56.25603936859755
mechanical               || LogProbSum: -56.59251160521876
artificially               || LogProbSum: -56.697872120876596
still               || LogProbSum: -56.81565515653297
curtain               || LogProbSum: -56.81565515653297

Best replacement: mechanically


In [18]:
# it aims to develop an integrated ------ to reach mmps exposed to malaria with prevention diagnosis and treatment
_ = fill_blank(
    sentence_before="it aims to develop an integrated",
    sentence_after="to reach mmps exposed to malaria with prevention diagnosis and treatment",
    n=3,
    smoothing_factor=1,
    ngram_model=trigram_model,
    ngram_vocab=trigram_vocab,
    ngram_token_count=trigram_token_count,
    keep_top_k=10
)

Top 10 words:
approach               || LogProbSum: -46.994788632685186
and               || LogProbSum: -49.655585948936846
system               || LogProbSum: -50.319445937225
model               || LogProbSum: -50.775400445255556
way               || LogProbSum: -51.63106655531328
time               || LogProbSum: -51.91409095749108
strategy               || LogProbSum: -52.40285686319234
platform               || LogProbSum: -52.42120600186054
response               || LogProbSum: -52.74662840229516
sample               || LogProbSum: -52.8266711099687

Best replacement: approach


In [19]:
# malaria with prevention diagnosis and treatment ------ by involving non-health
_ = fill_blank(
    sentence_before="malaria with prevention diagnosis and treatment",
    sentence_after="by involving non-health",
    n=3,
    smoothing_factor=1,
    ngram_model=trigram_model,
    ngram_vocab=trigram_vocab,
    ngram_token_count=trigram_token_count,
    keep_top_k=10
)

Top 10 words:
#               || LogProbSum: -46.40009276083042
and               || LogProbSum: -49.37550847587029
of               || LogProbSum: -50.49488686228239
is               || LogProbSum: -50.94777167119635
options               || LogProbSum: -51.38960442347539
or               || LogProbSum: -51.90300027079692
guidelines               || LogProbSum: -52.05348122173522
strategies               || LogProbSum: -52.212988100763
groups               || LogProbSum: -52.299316184251495
with               || LogProbSum: -52.692561181024885

Best replacement: #


In [20]:
# by involving non-health ----- stakeholders from provincial to community level
_ = fill_blank(
    sentence_before="by involving non health",
    sentence_after="stakeholders from provincial to community level",
    n=3,
    smoothing_factor=1,
    ngram_model=trigram_model,
    ngram_vocab=trigram_vocab,
    ngram_token_count=trigram_token_count,
    keep_top_k=10
)

Top 10 words:
care               || LogProbSum: -51.75622969826728
related               || LogProbSum: -54.012294775626444
sector               || LogProbSum: -54.176597826917714
and               || LogProbSum: -54.618430579196755
workers               || LogProbSum: -54.963271065488485
professionals               || LogProbSum: -55.5628921880376
sectors               || LogProbSum: -55.63700016019133
key               || LogProbSum: -55.89936442465882
of               || LogProbSum: -56.00472494031665
consequences               || LogProbSum: -56.1870464971106

Best replacement: care


# 6. Perplexity Score

In [21]:
from nltk.tokenize import word_tokenize

def calculate_perplexity_score(sentence, n, ngram_model, ngram_token_count, smoothing_factor=1):
    # convert the sentence to tuple form
    sentence_tuple = tuple(word_tokenize(text_cleaner_2(sentence)))

    # add sentence breaker before and after the sentence
    sentence_tuple = ('#',) + sentence_tuple + ('#',)

    # default smoothing prob for non-existent ngram tuples
    # default_prob = smoothing_factor/ngram_token_count
    default_logprob = np.log(smoothing_factor) - np.log(ngram_token_count)

    # store the logprob some of ngrams of the sentence
    sentence_logprob_sum = 0

    for i in range(len(sentence_tuple)- n+1):
        curr_logprob = default_logprob
        curr_tuple = sentence_tuple[i:i+n]
        if curr_tuple in ngram_model:
            curr_logprob = np.log(ngram_model[curr_tuple])
        sentence_logprob_sum += curr_logprob
    
    log_perplexity = (-1/(len(sentence_tuple)-2))*sentence_logprob_sum
    perplexity = np.exp(log_perplexity)

    return perplexity, log_perplexity

In [23]:
sentence_list = [
    "it appears that the overall code stroke volume has decreased since the covid- pandemic",
    "half a century ago hypertension was not treatable",
    "sarahs tv is broadcasting an advert for private healthcare"
]

In [141]:
print("Bigram Scores:\n\n")

for sentence in sentence_list:
    bigram_perplexity, bigram_log_perplexity = calculate_perplexity_score(
                                                sentence=sentence,
                                                n=2,
                                                ngram_model=bigram_model,
                                                ngram_token_count=bigram_token_count,
                                                smoothing_factor=1
                                                )
    print("Sentence:", sentence)
    print("Perplexity:", bigram_perplexity)
    print("Log Perplexity:", bigram_log_perplexity)
    print("-"*50)

Bigram Scores:


Sentence: it appears that the overall code stroke volume has decreased since the covid- pandemic
Perplexity: 251869.6846089567
Log Perplexity: 12.436667108170905
--------------------------------------------------
Sentence: half a century ago hypertension was not treatable
Perplexity: 5833621.470202273
Log Perplexity: 15.579148543897563
--------------------------------------------------
Sentence: sarahs tv is broadcasting an advert for private healthcare
Perplexity: 103588138.32632214
Log Perplexity: 18.455933386314943
--------------------------------------------------


In [24]:
print("Trigram Scores:\n\n")

for sentence in sentence_list:
    trigram_perplexity, trigram_log_perplexity = calculate_perplexity_score(
                                sentence=sentence,
                                n=3,
                                ngram_model=trigram_model,
                                ngram_token_count=trigram_token_count,
                                smoothing_factor=1
                                )
    print("Sentence:", sentence)
    print("Perplexity:", trigram_perplexity)
    print("Log Perplexity:", trigram_log_perplexity)
    print("-"*50)

Trigram Scores:


Sentence: it appears that the overall code stroke volume has decreased since the covid- pandemic
Perplexity: 4243647.472057198
Log Perplexity: 15.26093371024641
--------------------------------------------------
Sentence: half a century ago hypertension was not treatable
Perplexity: 28429719.457511067
Log Perplexity: 17.16294561574561
--------------------------------------------------
Sentence: sarahs tv is broadcasting an advert for private healthcare
Perplexity: 173296818.0463108
Log Perplexity: 18.970516393561187
--------------------------------------------------
