In [109]:
import nltk
from nltk.corpus import brown
from nltk import trigrams
from nltk import ConditionalFreqDist as CFD

#### Part 1:  Building an N-Gram Language Model Using the NLTK Brown Corpus

In this section, a statistical LM is created using the **Brown Corpus** from NLTK library. 
The data is preprocessed into all lower case words and adds the BOS and EOS for every sentence.

The LM is built using a **Trigram Model:**

$$
P(w_i \mid w_{i-1}, w_{i-2})
$$



In [118]:

tokens = []

for sentence in brown.sents():
    tokens.append("<s>") # Add BOS and EOS for each sentence
    tokens.append("<s>")

    for words in sentence:
        tokens.append(words.lower()) # Reduce vocabulary size The != the
    tokens.append("</s>")

In [None]:
trigram = []

for word1,word2,word3 in trigrams(tokens):
    trigram.append(((word1,word2),word3))

trigram_table = CFD(trigram)

#### Part 2 - 3: Generate Predictions "I am ... and Your are ..."

By using the Trigram model, simple predictions are made following `I am` and `You are`

In [138]:
import random
def predict_next_word(start):
    start_words = start.lower().split()
    word1 = start_words[-2]
    word2 = start_words[-1]
    
    prediction = trigram_table[(word1,word2)]
    
    if len(prediction) == 0:
        return None
    
    choices = list(prediction.keys())
    weights = list(prediction.values())
    
    
    return random.choices(choices, weights=weights)[0]

In [165]:
def generate_sentence(start, max_len=15):
    words = start.lower().split()
    
    if len(words) == 1:
        words.insert(0, "<s>")
    elif len(words) == 0:
        words = ["<s>" , "<s>"]
    
    for _ in range(max_len):
        start_now = " ".join(words)
        next_word = predict_next_word(start_now)
        
        if next_word is None:
            break
        
        words.append(next_word)
        
        if next_word == "</s>":
            break
    
    return " ".join(words)

In [166]:
sentence = "You are"
you_are_sentences = []

for _ in range(10):
    you_are_sentences.append(generate_sentence(sentence))
    
for i in you_are_sentences:
    print(i)


you are taking it : </s>
you are not guided by logic or common sense . </s>
you are really an excess of two . </s>
you are budget-wise , when he was on a trip which could bring enormous pressure to correct
you are very tactful , do you try scaring that kid out there '' ? ? </s>
you are going to live in boxcars in the universe . </s>
you are . </s>
you are staring , open-mouthed and blushing . </s>
you are being pulled down . </s>
you are that production topped the sextet brilliantly . </s>


In [167]:
sentence = "I am"
i_am_sentences = []

for _ in range(10):
    i_am_sentences.append(generate_sentence(sentence))
    
for i in i_am_sentences:
    print(i)

i am a small piece in the late 1920's . </s>
i am '' , `` why'n hell didn't you '' ! ! </s>
i am innocent . </s>
i am selling the metal strip they had meant to insult him , regardless of how much
i am sure , i should like to do otherwise . </s>
i am deliberately raising the policy of support for public education in the rear steps to trim
i am a jew '' . </s>
i am interested only in their households , infant mortality , and bustlin' in any internal opposition
i am not making them partners in the fragrance and taste of it rarely snows here gets
i am a great variety of experience . </s>


#### Part 4: Calculate the probabilities of the predicted words

To compute the probabilities of a sentence, use the **Trigram Formula** 

$$\prod_{i=1}^{n} P(word_i|word_{i-2},word_{i-1}) = \prod_{i=1}^{n} \frac{count(word_{i-2}, word_{i-1}, word_i)}{count(word_{i-2}, word_{i-1})}$$

Note: Laplace smoothing (Add 1) was added, incase any raw counts have `zero`

*modified bigram formula with laplace smoothing:*
$$P_{LAP}(w_i | h) = \frac {Count(h,w_i) +1}{Count(h) + |V|}$$

In [150]:
V = len(set(tokens))
def trigram_probability(w1, w2, w3):
    numerator = trigram_table[(w1, w2)][w3] + 1    
    denominator = trigram_table[(w1, w2)].N() + V

    return numerator / denominator


In [151]:
def compute_sentence_probability(sentence):
    words = sentence.lower().split()

    words.insert(0, "<s>")
    words.insert(0, "<s>")
    words.append("</s>")

    probability = 1.0

    for i in range(2, len(words)):
        w1 = words[i-2]
        w2 = words[i-1]
        w3 = words[i]

        probability *= trigram_probability(w1, w2, w3)

    return probability


In [168]:
you_are_sentence_prob = []
for sentence in you_are_sentences: 
    P = compute_sentence_probability(sentence)
    print(f'{P:.2e}')
    print(sentence)
    print("--------------------------------")
    you_are_sentence_prob.append((sentence, P))
    


3.86e-29
you are taking it : </s>
--------------------------------
1.44e-49
you are not guided by logic or common sense . </s>
--------------------------------
4.54e-41
you are really an excess of two . </s>
--------------------------------
1.78e-73
you are budget-wise , when he was on a trip which could bring enormous pressure to correct
--------------------------------
2.19e-69
you are very tactful , do you try scaring that kid out there '' ? ? </s>
--------------------------------
3.26e-51
you are going to live in boxcars in the universe . </s>
--------------------------------
5.09e-19
you are . </s>
--------------------------------
1.26e-42
you are staring , open-mouthed and blushing . </s>
--------------------------------
3.40e-32
you are being pulled down . </s>
--------------------------------
5.07e-47
you are that production topped the sextet brilliantly . </s>
--------------------------------


#### Part 5: Computer Perplexity of the model

`Perplexity` will measure how suprised the model is by a given sentence. A `high` perplexity the model is very suprised, a `low` perplexity the model is confident in it's predictions

$$PP(sentence) = P(sentence)^{-1/n}$$


In [169]:
def compute_perplexity(sentence):
    p = compute_sentence_probability(sentence)

    words = sentence.lower().split()
    words.insert(0,"<s>")
    words.insert(0,"<s>")
    words.append("</s>")

    return p ** (-1.0/len(words))

In [171]:
# you are
for sentence in you_are_sentence_prob:
    print(sentence)
    print(compute_perplexity(sentence[0]))

('you are taking it : </s>', 3.8582585078121273e-29)
1435.712993420087
('you are not guided by logic or common sense . </s>', 1.4369841413167151e-49)
3081.437974432746
('you are really an excess of two . </s>', 4.53986732778117e-41)
2300.9811977737995
('you are budget-wise , when he was on a trip which could bring enormous pressure to correct', 1.77885714858967e-73)
4340.032146483798
("you are very tactful , do you try scaring that kid out there '' ? ? </s>", 2.1949054114459663e-69)
2709.7499037779553
('you are going to live in boxcars in the universe . </s>', 3.25884648235167e-51)
2321.644608851134
('you are . </s>', 5.088405903119621e-19)
410.5305344321058
('you are staring , open-mouthed and blushing . </s>', 1.263936642654907e-42)
3101.150843171676
('you are being pulled down . </s>', 3.4049293431741554e-32)
1402.1329315172488
('you are that production topped the sextet brilliantly . </s>', 5.073300170879983e-47)
3640.252388260482


In [105]:
# you are 
for word in predictions_your_are:
    sentence = prefix_you_are + " " + word
    perplexity = compute_perplexity(sentence)
    print(sentence, ": perplexity =", f"{perplexity:.2f}")

You are not : perplexity = 284.23
You are the : perplexity = 215.61
You are in : perplexity = 318.27
You are you : perplexity = 305.23
You are a : perplexity = 253.94
You are to : perplexity = 344.37
You are used : perplexity = 370.10
You are also : perplexity = 370.77
You are now : perplexity = 325.73
You are of : perplexity = 420.79
