In [None]:
from nltk.corpus import brown
from nltk import trigrams
from nltk import ConditionalFreqDist as CFD

#### Part 1:  Building an N-Gram Language Model Using the NLTK Brown Corpus

In this section, a statistical LM is created using the **Brown Corpus** from NLTK library. 
The data is preprocessed into all lower case words and adds the BOS and EOS for every sentence.

The LM is built using a **Trigram Model:**

$$
P(w_i \mid w_{i-2}, w_{i-1})
$$



In [None]:

tokens = []

for sentence in brown.sents():
    tokens.append("<s>") # Add BOS and EOS tags for each sentence
    tokens.append("<s>") # Trigrams have two BOS tags

    for words in sentence:
        tokens.append(words.lower()) # Reduce vocabulary size The != the
    tokens.append("</s>")

In [None]:
trigram = []

for word1, word2, word3 in trigrams(tokens):
    trigram.append(((word1,word2),word3))

trigram_table = CFD(trigram)

#### Part 2 - 3: Generate Predictions "I am ... and Your are ..."

By using the Trigram model, simple predictions are made following `I am` and `You are`

In [None]:
import random
def predict_next_word(start):
    start_words = start.lower().split()
    word1 = start_words[-2]
    word2 = start_words[-1]
    
    prediction = trigram_table[(word1,word2)]
    
    if len(prediction) == 0:
        return "</s>"
    
    choices = list(prediction.keys())
    weights = list(prediction.values())
    
    return random.choices(choices, weights=weights)[0]

In [175]:
def generate_sentence(start, max_len=15):
    words = start.lower().split()
    
    if len(words) == 1:
        words = ["<s>" , words[0]]
    elif len(words) == 0:
        words = ["<s>" , "<s>"]
    
    for _ in range(max_len):
        start_now = " ".join(words)
        next_word = predict_next_word(start_now)
        
        if next_word == "</s>":
            break
        
        words.append(next_word)
        
        if next_word == "</s>":
            break
    
    return " ".join(words)

In [None]:
import re

PUNCT_TO_REMOVE = {"''", "``", "--"}

def clean_sentence(sentence):
    words = sentence.split()


    words = [w for w in words if w not in PUNCT_TO_REMOVE]

    # 2. remove punctuation after prefix (e.g., "i am '' happy")
    if len(words) > 2 and words[2] in PUNCT_TO_REMOVE:
        words.pop(2)

    text = " ".join(words)

    # 3. remove space before punctuation
    text = re.sub(r"\s+([.,?!;:])", r"\1", text)

    # 4. remove duplicate punctuation
    text = re.sub(r"([.,?!])\1+", r"\1", text)

    return text

    

In [None]:
sentence = "You are"
you_are_sentences = []

for _ in range(10):
    
    you_are_sentences.append(generate_sentence(sentence))
    
for i in you_are_sentences:
    print(i)


you are a nonresident alien and outside ; ;
you are of equal parts of the city council and mr. sharpe first saw her except to
you are normally obligated to make a wish and turn a pool .
you are '' ? ?
you are the same american catholic higher education more available through membership in the misty marshlands and
you are turning over the babies were getting on the floor .
you are conscientious , selfless efforts deserve the nation's backlog of an army blanket , working ,
you are thinking about the only one who had the situation .
you are sitting indolently on the side .
you are thwarted if you insist that there would still be the statue because of the national


In [177]:
sentence = "I am"
i_am_sentences = []

for _ in range(10):
    i_am_sentences.append(generate_sentence(sentence))
    
for i in i_am_sentences:
    print(i)

i am simply too old '' .
i am highly privileged today to ask voters whether they would sleep , the imperative operations properly
i am for it is the last of the human replaces amplifier af in a notably condescending
i am concerned , the dramatist once needed an idea a chain of being known and worthy
i am happy '' bodybuilder -- looks as if this woman repeatedly complained she was not hardy
i am '' , `` for christ's sake , stay thou with me .
i am innocent '' .
i am for it to the u.s. army .
i am reliably given to slaves in canada , brazil , the first decision was a baby
i am .


#### Part 4: Calculate the probabilities of the predicted words

To compute the probabilities of a sentence, use the **Trigram Formula** 

$$\prod_{i=1}^{n} P(word_i|word_{i-2},word_{i-1}) = \prod_{i=1}^{n} \frac{count(word_{i-2}, word_{i-1}, word_i)}{count(word_{i-2}, word_{i-1})}$$

Note: Laplace smoothing (Add 1) was added, incase any raw counts have `zero`

*modified bigram formula with laplace smoothing:*
$$P_{LAP}(w_i | h) = \frac {Count(h,w_i) +1}{Count(h) + |V|}$$

In [178]:
V = len(set(tokens))

def trigram_probability(w1, w2, w3):
    numerator = trigram_table[(w1, w2)][w3] + 1    
    denominator = trigram_table[(w1, w2)].N() + V

    return numerator / denominator


In [151]:
def compute_sentence_probability(sentence):
    words = sentence.lower().split()

    words.insert(0, "<s>")
    words.insert(0, "<s>")
    words.append("</s>")

    probability = 1.0

    for i in range(2, len(words)):
        w1 = words[i-2]
        w2 = words[i-1]
        w3 = words[i]

        probability *= trigram_probability(w1, w2, w3)

    return probability


In [None]:
you_are_sentence_prob = []

for sentence in you_are_sentences: 
    P = compute_sentence_probability(sentence)
    print(f'{P:.2e}')
    print(sentence)
    print("--------------------------------")
    you_are_sentence_prob.append((sentence, P))
    


3.86e-29
you are taking it : </s>
--------------------------------
1.44e-49
you are not guided by logic or common sense . </s>
--------------------------------
4.54e-41
you are really an excess of two . </s>
--------------------------------
1.78e-73
you are budget-wise , when he was on a trip which could bring enormous pressure to correct
--------------------------------
2.19e-69
you are very tactful , do you try scaring that kid out there '' ? ? </s>
--------------------------------
3.26e-51
you are going to live in boxcars in the universe . </s>
--------------------------------
5.09e-19
you are . </s>
--------------------------------
1.26e-42
you are staring , open-mouthed and blushing . </s>
--------------------------------
3.40e-32
you are being pulled down . </s>
--------------------------------
5.07e-47
you are that production topped the sextet brilliantly . </s>
--------------------------------


#### Part 5: Computer Perplexity of the model

`Perplexity` will measure how suprised the model is by a given sentence. A `high` perplexity the model is very suprised, a `low` perplexity the model is confident in it's predictions

$$PP(sentence) = P(sentence)^{-1/n}$$


In [169]:
def compute_perplexity(sentence):
    p = compute_sentence_probability(sentence)

    words = sentence.lower().split()
    words.insert(0,"<s>")
    words.insert(0,"<s>")
    words.append("</s>")

    return p ** (-1.0/len(words))

In [171]:
# you are
for sentence in you_are_sentence_prob:
    print(sentence)
    print(compute_perplexity(sentence[0]))

('you are taking it : </s>', 3.8582585078121273e-29)
1435.712993420087
('you are not guided by logic or common sense . </s>', 1.4369841413167151e-49)
3081.437974432746
('you are really an excess of two . </s>', 4.53986732778117e-41)
2300.9811977737995
('you are budget-wise , when he was on a trip which could bring enormous pressure to correct', 1.77885714858967e-73)
4340.032146483798
("you are very tactful , do you try scaring that kid out there '' ? ? </s>", 2.1949054114459663e-69)
2709.7499037779553
('you are going to live in boxcars in the universe . </s>', 3.25884648235167e-51)
2321.644608851134
('you are . </s>', 5.088405903119621e-19)
410.5305344321058
('you are staring , open-mouthed and blushing . </s>', 1.263936642654907e-42)
3101.150843171676
('you are being pulled down . </s>', 3.4049293431741554e-32)
1402.1329315172488
('you are that production topped the sextet brilliantly . </s>', 5.073300170879983e-47)
3640.252388260482
