In [None]:
import nltk
from nltk.corpus import brown
from nltk import bigrams
from nltk import ConditionalFreqDist as CFD

#### Part 1:  Building an N-Gram Language Model Using the NLTK Brown Corpus

In this section, a statistical LM is created using the **Brown Corpus** from NLTK library. 
The data is preprocessed into all lower case words and adds the BOS and EOS for every sentence.

The LM is built using a **Bigram Model:**

$$
P(w_i \mid w_{i-1})
$$



In [88]:
tokens = []

for sentences in brown.sents():
    tokens.append("<s>") # Add BOS and EOS for each sentence

    for words in sentences:
        tokens.append(words.lower()) # Reduce vocabulary size The != the
    tokens.append("</s>")

In [87]:
bigram = list(bigrams(tokens))
bigram_table = CFD(bigram)

# Test : top ten words that follow 'me'
# print(bigram_table['me'].most_common(10))

#### Part 2 - 3: Generate Predictions "I am ... and Your are ..."

By using the Bigram model, simple predictions are made following `I am` and `You are`

In [None]:
def predict_next_words(sentence, n_predictions):
    predicted_words = bigram_table[sentence.split()[-1]].most_common()

    # Form sentences (I removed some puncuation for cleaner words)
    filered_words = []
    for predictions in predicted_words:
        if predictions[0].isalpha():
            filered_words.append(predictions[0])
        
        if len(filered_words) == n_predictions:
            break

    return filered_words

In [89]:
sentence = "I am"
i_am_x = predict_next_words(sentence, 10)

for count, result in enumerate(i_am_x):
    print(str(count + 1) + ": " + sentence + " " + result)


1: I am not
2: I am i
3: I am a
4: I am sure
5: I am an
6: I am innocent
7: I am told
8: I am very
9: I am getting
10: I am proud


In [81]:
sentence = "You are"
i_am_x = predict_next_words(sentence, 10)

for count, result in enumerate(i_am_x):
    print(str(count + 1) + ": " + sentence + " " + result)


1: You are not
2: You are the
3: You are in
4: You are you
5: You are a
6: You are to
7: You are used
8: You are also
9: You are now
10: You are of


#### Part 4: Calculate the probabilities of the predicted words

To compute the probabilities of a sentence, use the **Bigram Formula** 

$$\prod_{i=1}^{n} P(word_i|word_{i-1}) = \prod_{i=1}^{n} \frac{count(word_{i-1}, word_i)}{count(word_{i-1})}$$

Note: Laplace smoothing (Add 1) was added, incase any raw counts have `zero`

*modified bigram formula with laplace smoothing:*
$$P_{LAP}(w_i | w_{i-1}) = \frac {Count(w_{i-1},w_i) +1}{Count(w_{i-1, *}) + |V|}$$

In [90]:
V = len(set(tokens))
def bigram_probabilities(previous_word, current_word):
    count_numerator = bigram_table[previous_word][current_word]
    count_denomenator = bigram_table[previous_word].N()

    return (count_numerator + 1) / (count_denomenator + V)

def compute_sentence_probability(sentence):
    probability = 1.0

    words = sentence.split()
    words.insert(0, "<s>")
    words.append("</s>")

    for i in range(1, len(words)):
        probability *= bigram_probabilities(words[i-1].lower(),words[i].lower())

    return probability

In [91]:
prefix = "You are"
words = predict_next_words(prefix, 10)

for word in words:
    full_sentence = prefix + " " + word
    p = compute_sentence_probability(full_sentence)
    print(f"{full_sentence}: {p:.2e}")


You are not: 5.39e-13
You are the: 2.15e-12
You are in: 3.06e-13
You are you: 3.77e-13
You are a: 9.47e-13
You are to: 2.06e-13
You are used: 1.44e-13
You are also: 1.43e-13
You are now: 2.73e-13
You are of: 7.58e-14


#### Part 5: Computer Perplexity of the model

`Perplexity` will measure how suprised the model is by a given sentence. A `high` perplexity the model is very suprised, a `low` perplexity the model is confident in it's predictions

$$PP(sentence) = PP(sentence)^{-1/n}$$


In [92]:
def compute_perplexity(sentence):
    p = compute_sentence_probability(sentence)

    words = sentence.split()
    words.insert(0,"<s>")
    words.append("</s>")

    return p ** (-1.0/len(words))

In [93]:
prefix_you_are = "You are"
prefix_i_am = "I am"

predictions_your_are = predict_next_words(prefix_you_are, 10)
predictions_i_am = predict_next_words(prefix_i_am, 10)

In [94]:
# you are 
for word in predictions_your_are:
    sentence = prefix_you_are + " " + word
    perplexity = compute_perplexity(sentence)
    print(sentence, ": perplexity =", f"{perplexity:.2f}")

You are not : perplexity = 284.23
You are the : perplexity = 215.61
You are in : perplexity = 318.27
You are you : perplexity = 305.23
You are a : perplexity = 253.94
You are to : perplexity = 344.37
You are used : perplexity = 370.10
You are also : perplexity = 370.77
You are now : perplexity = 325.73
You are of : perplexity = 420.79
