### Student details:
Student name: **Siddharth Prince**  
Student id: **23052058**

# Task 1

In [1]:
# Housekeeping - imports and utility functions
import pandas as pd

def processNGramData(ngramContent):
    """Turn raw n-gram lines into a two-column DataFrame.

    Each line is expected to look like ``"<n-gram>\\t<count>"``. The text before
    the first tab becomes the ``Word`` value and the field after it is parsed
    as an integer ``Count``.

    Args:
        ngramContent: Iterable of tab-separated ``n-gram<TAB>count`` strings.

    Returns:
        A pandas DataFrame with the columns ``Word`` and ``Count``, one row
        per input line, in input order.
    """
    # Split every line once, then pull out the two fields column-wise.
    fields = [line.split('\t') for line in ngramContent]
    words = [parts[0] for parts in fields]
    counts = [int(parts[1]) for parts in fields]
    return pd.DataFrame({'Word': words, 'Count': counts})

# Method to search an n-gram data frame for a term and get all corresponding matching rows.
def searchDataFrame(df, searchTerm: str, columnToSearch: str, matchWord: bool=True):
    """Return the rows of ``df`` whose ``columnToSearch`` value matches ``searchTerm``.

    Args:
        df: DataFrame to search.
        searchTerm: The word to look for. With ``matchWord=True`` it is treated
            as literal text; with ``matchWord=False`` it is a regular expression.
        columnToSearch: Name of the column to search in.
        matchWord: When True (default) only rows whose value equals
            ``searchTerm`` exactly are returned. When False, rows whose value
            contains a match for the ``searchTerm`` regex are returned.

    Returns:
        The matching rows of ``df`` (possibly empty).
    """
    column = df[columnToSearch]
    if matchWord:
        # Plain equality rather than an anchored regex: words containing regex
        # metacharacters (e.g. "c++", "a.b") must match themselves only.
        mask = column == searchTerm
    else:
        # na=False keeps the mask strictly boolean even if the column has missing values.
        mask = column.str.contains(searchTerm, regex=True, na=False)
    return df.loc[mask]

In [2]:
# Loading data

# Downloading unigram data
!wget https://norvig.com/ngrams/count_1w.txt
with open("./count_1w.txt", "r") as unigramFile:
    unigramContent = unigramFile.read().splitlines()
unigrams_df = processNGramData(unigramContent)
totalUnigrams = len(unigrams_df)
# print(unigramContent[:100])

# Downloading bigram data
!wget https://norvig.com/ngrams/count_2w.txt
with open("./count_2w.txt", "r") as bigramFile:
    bigramContent = bigramFile.read().splitlines()
bigrams_df = processNGramData(bigramContent)
totalBigrams = len(bigrams_df)
# print(bigramContent[:100])

print(f'Number of unigrams:{totalUnigrams} Number of Bigrams: {totalBigrams}')
display(unigrams_df.head(100),bigrams_df.head(100))

--2023-10-30 07:14:37--  https://norvig.com/ngrams/count_1w.txt
Resolving norvig.com (norvig.com)... 158.106.138.13
Connecting to norvig.com (norvig.com)|158.106.138.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4956241 (4.7M) [text/plain]
Saving to: ‘count_1w.txt.2’


2023-10-30 07:14:39 (4.01 MB/s) - ‘count_1w.txt.2’ saved [4956241/4956241]

--2023-10-30 07:14:39--  https://norvig.com/ngrams/count_2w.txt
Resolving norvig.com (norvig.com)... 158.106.138.13
Connecting to norvig.com (norvig.com)|158.106.138.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5566017 (5.3M) [text/plain]
Saving to: ‘count_2w.txt.2’


2023-10-30 07:14:41 (4.15 MB/s) - ‘count_2w.txt.2’ saved [5566017/5566017]

Number of unigrams:333333 Number of Bigrams: 286358


Unnamed: 0,Word,Count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698
...,...,...
95,like,520585287
96,service,519537222
97,x,508609523
98,than,502609275


Unnamed: 0,Word,Count
0,0Uplink verified,523545
1,0km to,116103
2,1000s of,939476
3,100s of,539389
4,100th anniversary,158621
...,...,...
95,24th of,327460
96,25th anniversary,261023
97,25th of,397735
98,26th of,271707


## Task 1. a
Finding the probability without add-one smoothing

### $P(w_{i}|w_{i-1}) = \frac{c(w_{i-1},w_{i})} {c(w_{i-1})}$  
```
where, c() -> count of,  
       w -> represents the word,  
       i -> current index  
```

In [4]:
def probability(sentence):
    """Estimate the unsmoothed bigram probability of ``sentence``.

    Every consecutive word pair contributes c(w_{i-1}, w_i) / c(w_{i-1}), with
    the counts looked up in ``bigrams_df`` and ``unigrams_df`` through
    ``searchDataFrame``. The counts and per-bigram probabilities are printed
    as they are computed.

    A word or bigram that is absent from the data is treated as having a count
    of 0 instead of raising ``IndexError``. A bigram whose probability is 0 is
    reported and left out of the product.

    Args:
        sentence: Whitespace-separated words.

    Returns:
        The product of the non-zero bigram probabilities. This is 1 when no
        bigram contributed (e.g. a sentence of fewer than two words).
    """
    wordList = sentence.split()
    sentenceProbability = 1
    print(f'sentence = {wordList}\n')

    for prevWord, word in zip(wordList, wordList[1:]):
        # unigram count (0 when the word is not in the unigram data)
        unigramMatches = searchDataFrame(unigrams_df, prevWord, 'Word')['Count'].values
        unigramCount_prev = unigramMatches[0] if len(unigramMatches) else 0
        print(f'unigram count for "{prevWord}" = {unigramCount_prev}')

        # bigram count (0 when the pair is not in the bigram data)
        bigram = f'{prevWord} {word}'
        bigramMatches = searchDataFrame(bigrams_df, bigram, 'Word')['Count'].values
        bigramCount = bigramMatches[0] if len(bigramMatches) else 0
        print(f'bigram count for "{bigram}" = {bigramCount}')

        # bigram probability; guard the division so an unseen previous word gives 0
        bigramProbability = bigramCount/unigramCount_prev if unigramCount_prev != 0 else 0
        print(f'bigramProbability for {bigram} = {bigramProbability}')

        # Multiplying the probability to the total sentence probability.
        # NOTE(review): zero-probability bigrams are skipped rather than zeroing the
        # whole sentence; this keeps the notebook's original choice - confirm it is intended.
        if bigramCount != 0 and unigramCount_prev != 0:
            sentenceProbability *= bigramProbability
        else:
            print(f'The bigram probability for {bigram} is 0. Hence skipping the multiplication to sentence probability.')
        print('-----------------------------------------\n')
    print(f'Sentence probability for "{sentence}" = {sentenceProbability}\n')
    return sentenceProbability

In [5]:
probability("i love you") > probability('i hate you')

sentence = ['i', 'love', 'you']

unigram count for "i" = 3086225277
bigram count for "i love" = 3979312
bigramProbability for i love = 0.001289378332053626
-----------------------------------------

unigram count for "love" = 201063526
bigram count for "love you" = 5428714
bigramProbability for love you = 0.02699999402178991
-----------------------------------------

Sentence probability for "i love you" = 3.481320725727335e-05

sentence = ['i', 'hate', 'you']

unigram count for "i" = 3086225277
bigram count for "i hate" = 876611
bigramProbability for i hate = 0.0002840398614232463
-----------------------------------------

unigram count for "hate" = 21274675
bigram count for "hate you" = 504048
bigramProbability for hate you = 0.023692394830943365
-----------------------------------------

Sentence probability for "i hate you" = 6.7295845445659906e-06



True

## Task 1.b
Applying add-one smoothing. To do this, we add 1 to every bigram count in the numerator and add the total number of word types (the vocabulary size $V$) to the denominator.  

### $P^{*}(w_{n}|w_{n-1}) = \frac{c(w_{n-1}, w_{n})+1} {c(w_{n-1})+V}$

In [6]:
def probabilityAddOne(sentence):
    """Estimate the add-one (Laplace) smoothed bigram probability of ``sentence``.

    Every consecutive word pair contributes
    (c(w_{i-1}, w_i) + 1) / (c(w_{i-1}) + V), where the counts come from
    ``bigrams_df`` and ``unigrams_df`` via ``searchDataFrame`` and V is
    ``totalUnigrams``. The counts and per-bigram probabilities are printed as
    they are computed.

    A word or bigram that is absent from the data counts as 0, so an unseen
    bigram receives the small non-zero smoothed probability 1 / (c + V) instead
    of raising ``IndexError``. Because the smoothed probability is never 0,
    every bigram is multiplied into the result.

    Args:
        sentence: Whitespace-separated words.

    Returns:
        The product of the smoothed bigram probabilities (1 for a sentence of
        fewer than two words).
    """
    wordList = sentence.split()
    sentenceProbability = 1
    print(f'sentence = {wordList}\n')

    for prevWord, word in zip(wordList, wordList[1:]):
        # unigram count (0 when the word is not in the unigram data)
        unigramMatches = searchDataFrame(unigrams_df, prevWord, 'Word')['Count'].values
        unigramCount_prev = unigramMatches[0] if len(unigramMatches) else 0
        print(f'unigram count for "{prevWord}" = {unigramCount_prev}')

        # bigram count (0 when the pair is not in the bigram data)
        bigram = f'{prevWord} {word}'
        bigramMatches = searchDataFrame(bigrams_df, bigram, 'Word')['Count'].values
        bigramCount = bigramMatches[0] if len(bigramMatches) else 0
        print(f'bigram count for "{bigram}" = {bigramCount}')

        # smoothed bigram probability: +1 in the numerator, +V in the denominator
        bigramProbability = (bigramCount + 1)/(unigramCount_prev + totalUnigrams)
        print(f'bigramProbability for {bigram} = {bigramProbability}')

        # Smoothing exists precisely so unseen bigrams still contribute, so always multiply.
        sentenceProbability *= bigramProbability
        print('-----------------------------------------\n')
    print(f'Sentence probability for "{sentence}" = {sentenceProbability}\n')
    return sentenceProbability

In [7]:
probabilityAddOne("i love you") > probabilityAddOne('i hate you')

sentence = ['i', 'love', 'you']

unigram count for "i" = 3086225277
bigram count for "i love" = 3979312
bigramProbability for i love = 0.001289239409583089
-----------------------------------------

unigram count for "love" = 201063526
bigram count for "love you" = 5428714
bigramProbability for love you = 0.026955311155076156
-----------------------------------------

Sentence probability for "i love you" = 3.475184943869884e-05

sentence = ['i', 'hate', 'you']

unigram count for "i" = 3086225277
bigram count for "i hate" = 876611
bigramProbability for i hate = 0.00028400951051436537
-----------------------------------------

unigram count for "hate" = 21274675
bigram count for "hate you" = 504048
bigramProbability for hate you = 0.023326953599795038
-----------------------------------------

Sentence probability for "i hate you" = 6.625076673669102e-06



True

## Task 2
Shannon visualisation

In [49]:
import random
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Splitting the bigrams into two separate columns to make processing for Shannon visualisation easier.
# n=1 splits on the first space only, so the result never has more than the two columns being
# assigned, even if an entry happens to contain an extra space.
bigrams_df[['word1', 'word2']] = bigrams_df['Word'].str.split(' ', n=1, expand=True)
bigrams_df.head(10)

Unnamed: 0,Word,Count,word1,word2
0,0Uplink verified,523545,0Uplink,verified
1,0km to,116103,0km,to
2,1000s of,939476,1000s,of
3,100s of,539389,100s,of
4,100th anniversary,158621,100th,anniversary
5,10am to,376141,10am,to
6,10th and,183715,10th,and
7,10th anniversary,242830,10th,anniversary
8,10th century,117755,10th,century
9,10th grade,174046,10th,grade


In [128]:
def shannonVisualization(seed="<S>", commonWordCount=10, minWordCount=10):
    """Generate a sentence with the Shannon visualisation method over the bigram data.

    Starting from ``seed``, the next word is repeatedly sampled from the
    bigrams in ``bigrams_df`` whose ``word1`` equals the current word, with
    probability proportional to the bigram count. Each chosen bigram is printed
    in a staircase layout.

    Generation stops when:
      * ``</S>`` is sampled, or
      * no bigram starts with the current word, or
      * at least ``minWordCount`` words have been generated and the latest
        word is not one of the ``commonWordCount`` most frequent unigrams
        (so the sentence does not end on a very common word).

    Sampling uses the ``random`` module; seed it beforehand for reproducible output.

    Args:
        seed: Word to start from.
        commonWordCount: How many of the most frequent unigrams count as
            "common" words a sentence may not end on.
        minWordCount: Minimum number of words to generate before stopping.

    Returns:
        The generated sentence, beginning with ``seed`` and ending with a
        single ``</S>``.
    """
    seedStop = "</S>"
    generatedSentence = seed

    # Most frequent unigrams; kept as an array so the printed form stays unchanged.
    commonWords = (
        unigrams_df.sort_values('Count', ascending=False)['Word']
        .head(commonWordCount)
        .to_numpy()
    )
    print(f'Top {commonWordCount} common words from the unigrams dataset: {commonWords}')

    iterCount = 0
    chosenOne = ""
    print("Shannon Visualisation:")
    # Keep generating until </S> is sampled; otherwise continue while the sentence is
    # shorter than minWordCount OR the latest word is a common word.
    while chosenOne != seedStop and (iterCount < minWordCount or (chosenOne in commonWords)):
        matches = searchDataFrame(bigrams_df, seed, 'word1')
        if matches.empty:
            break

        # assign() builds new frames, so nothing is written onto a slice of bigrams_df.
        # Probability of each candidate is its share of the total count for this seed word.
        candidates = matches.assign(
            probability=matches['Count'] / matches['Count'].sum()
        ).sort_values('probability', ascending=False)
        # Cumulative probabilities give each candidate an interval within [0, 1].
        candidates = candidates.assign(interval=candidates['probability'].cumsum())

        # Pick the first candidate whose interval upper bound covers the random draw.
        randomProbability = random.random()
        reachable = candidates.loc[candidates['interval'] >= randomProbability, 'word2']
        # Float rounding can leave the last cumulative value marginally below 1.0;
        # in that case the draw belongs to the final interval.
        chosenOne = reachable.iloc[0] if not reachable.empty else candidates['word2'].iloc[-1]

        # Indent so each bigram starts under the word it continues from.
        spaces = (len(generatedSentence) - len(seed)) * ' '
        print(f'{spaces}{seed} {chosenOne}')
        generatedSentence += f' {chosenOne}'
        seed = chosenOne
        iterCount += 1

    # Avoid a duplicated end marker when </S> was itself the last sampled word.
    if chosenOne == seedStop:
        return generatedSentence
    return f'{generatedSentence} {seedStop}'

In [130]:
shannonSentence = shannonVisualization()
print(f'\nGenerated sentence from Shannon visualisation method is:\n{shannonSentence}')

Top 10 common words from the unigrams dataset: ['the' 'of' 'and' 'to' 'a' 'in' 'for' 'is' 'on' 'that']
Shannon Visualisation:
<S> phentermine
    phentermine on
                on the
                   the tragic
                       tragic death
                              death is
                                    is not
                                       not listed
                                           listed as
                                                  as the
                                                     the mixture

Generated sentence from Shannon visualisation method is:
<S> phentermine on the tragic death is not listed as the mixture </S>
