# Importing supporting libraries

In [1]:
import pandas as pd
import re
import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
#nltk.download('averaged_perceptron_tagger')
#nltk.download('treebank')
from nltk.util import ngrams
import random
import sys

# Opening the text file containing BIGGEST chapter and a RANDOM chapter

The biggest chapter is used to train the bi-gram model and a random chapter is used to play Shannon's game

In [2]:
file1 = open("bigram_modelling.txt", 'r', encoding = "utf-8")
file2 = open("testing.txt", 'r', encoding = "utf-8")

# Performing basic text preprocessing

In [3]:
chapter_text = ""
for line in file1:
    # Remove leading and trailing whitespace (including line breaks)
    cleaned_line = line.strip()
    if cleaned_line == "":
        continue
    chapter_text += cleaned_line + " "

In [4]:
author_pattern = r'PAULO COELHO '
chapter_text = re.sub(author_pattern, '', chapter_text)


book_pattern = r'THE ALCHEMIST '
'''
matches = re.findall(book_pattern, chapter_text)
num_matches = len(matches)
print("Number of matches:", num_matches)
'''
chapter_text = re.sub(book_pattern, '', chapter_text)


part_pattern = r'PART [A-Z]+ '
chapter_text = re.sub(part_pattern, '', chapter_text)

page_pattern = r'page [0-9]+'
chapter_text = re.sub(page_pattern, '', chapter_text)

specialchar_pattern = r'[^a-zA-Z0-9\s]'
chapter_text = re.sub(specialchar_pattern, '', chapter_text)

chapter_text = chapter_text.lower()

# Tokenisation

For the reasons of creating a bi-gram model, we don't remove the stopwords

In [5]:
chapterTokens = nltk.word_tokenize(chapter_text)
chapterTokens

['the',
 'boy',
 'had',
 'been',
 'working',
 'for',
 'the',
 'crystal',
 'merchant',
 'for',
 'almost',
 'a',
 'month',
 'and',
 'he',
 'could',
 'see',
 'that',
 'it',
 'wasnt',
 'exactly',
 'the',
 'kind',
 'of',
 'job',
 'that',
 'would',
 'make',
 'him',
 'happy',
 'the',
 'merchant',
 'spent',
 'the',
 'entire',
 'day',
 'mumbling',
 'behind',
 'the',
 'counter',
 'telling',
 'the',
 'boy',
 'to',
 'be',
 'careful',
 'with',
 'the',
 'pieces',
 'and',
 'not',
 'to',
 'break',
 'anything',
 'but',
 'he',
 'stayed',
 'with',
 'the',
 'job',
 'because',
 'the',
 'merchant',
 'although',
 'he',
 'was',
 'an',
 'old',
 'grouch',
 'treated',
 'him',
 'fairly',
 'the',
 'boy',
 'received',
 'a',
 'good',
 'commission',
 'for',
 'each',
 'piece',
 'he',
 'sold',
 'and',
 'had',
 'already',
 'been',
 'able',
 'to',
 'put',
 'some',
 'money',
 'aside',
 'that',
 'morning',
 'he',
 'had',
 'done',
 'some',
 'calculating',
 'if',
 'he',
 'continued',
 'to',
 'work',
 'every',
 'day',
 'as',


# Create Bi-gram

Creates Bi-gram probabilities of each Bi-gram

In [6]:
# Calculate the frequency distribution of bi-grams
tokenBigrams = nltk.ngrams(chapterTokens, 2)  # 'n' in n-grams
bigrams = list(tokenBigrams)

freq_dist_bigrams = FreqDist(bigrams)


distinctTokens = list(set(sorted(chapterTokens)))
tokenDICT = dict(nltk.FreqDist(chapterTokens))
bigramDICT = dict(freq_dist_bigrams)


n = len(distinctTokens) # Get the number of distinct tokens
bigramProbabilityDistribution = [[]*n for i in range(n)] 


def findBigram(bigram : tuple, bigramDICT: dict):
    try:
        return bigramDICT[bigram]
    except:
        return 0

# Iterate through the distinct tokens and calculate the bigram probabilities
for i in range(n):
    countOfPreviousWord = tokenDICT[distinctTokens[i]]
    for j in range(n):
        bigram = (str(distinctTokens[i]), str(distinctTokens[j]))
        countOfTheBigram = findBigram(bigram, bigramDICT)

        # Calculate the bigram probability
        bigramProbabilityDistribution[i].append(float("{:.3f}".format(countOfTheBigram/countOfPreviousWord)))

bigramProbabilityDistribution


# Shannon's game

In [7]:
test_chapter_text = ""
for line in file2:
    cleaned_line = line.strip()
    if cleaned_line == "":
        continue
    test_chapter_text += cleaned_line + " "

author_pattern = r'PAULO COELHO '
test_chapter_text = re.sub(author_pattern, '', test_chapter_text)

book_pattern = r'THE ALCHEMIST '
test_chapter_text = re.sub(book_pattern, '', test_chapter_text)

part_pattern = r'PART [A-Z]+ '
test_chapter_text = re.sub(part_pattern, '', test_chapter_text)

page_pattern = r'page [0-9]+'
test_chapter_text = re.sub(page_pattern, '', test_chapter_text)

specialchar_pattern = r'[^a-zA-Z0-9\s]'
test_chapter_text = re.sub(specialchar_pattern, '', test_chapter_text)

test_chapter_text = test_chapter_text.lower()

testChapterTokens = nltk.word_tokenize(test_chapter_text)

num_blanks = 300

blankPositions = random.sample(range(1, len(testChapterTokens)), num_blanks)
blankPositions.sort()
originalTokens = []
for i in blankPositions:
    originalTokens.append(testChapterTokens[i])
    testChapterTokens[i] = "_"

ogTokenIndex = -1
correctlyFilledBlanks = num_blanks

for i in range(len(testChapterTokens)):
    if(testChapterTokens[i] == "_"):
        ogTokenIndex += 1
        try:
            previousWordIndex = i - 1
            maxProb = -sys.maxsize + 1
            list_index = distinctTokens.index(testChapterTokens[previousWordIndex])
            for prob in bigramProbabilityDistribution[list_index]:
                maxProb = max(prob, maxProb)
            indexOfTokenToBeReplaced = bigramProbabilityDistribution[list_index].index(maxProb)
            if(distinctTokens[indexOfTokenToBeReplaced] != originalTokens[ogTokenIndex]):
                correctlyFilledBlanks -= 1
        except:
            correctlyFilledBlanks -= 1

accuracy = (correctlyFilledBlanks / num_blanks) * 100
print(accuracy)

11.666666666666666
