In [1]:
# N-Gram is simply a sequence of N words, where N can be any positive integer. 
# NGram has various use cases like machine translation, spelling correction, word prediction, 

In [2]:
# This notebook describes N gram as a language model
# Language Model = models that predict upcoming words. Assigns a prob to each possible next word


In [9]:
import nltk
from nltk import trigrams
from nltk.corpus import reuters
from collections import defaultdict
# default dict is like a normal dictionary, but smarter — it automatically initializes missing keys with a default value.

In [14]:
nltk.download('reuters')

[nltk_data] Downloading package reuters to /Users/raeez/nltk_data...


True

In [19]:
words = nltk.word_tokenize(' '.join(reuters.words()))

In [21]:
tri_grams = list(trigrams(words)) #Create tri grams from tokenized words

In [37]:
tri_grams

[('ASIAN', 'EXPORTERS', 'FEAR'),
 ('EXPORTERS', 'FEAR', 'DAMAGE'),
 ('FEAR', 'DAMAGE', 'FROM'),
 ('DAMAGE', 'FROM', 'U'),
 ('FROM', 'U', '.'),
 ('U', '.', 'S'),
 ('.', 'S', '.-'),
 ('S', '.-', 'JAPAN'),
 ('.-', 'JAPAN', 'RIFT'),
 ('JAPAN', 'RIFT', 'Mounting'),
 ('RIFT', 'Mounting', 'trade'),
 ('Mounting', 'trade', 'friction'),
 ('trade', 'friction', 'between'),
 ('friction', 'between', 'the'),
 ('between', 'the', 'U'),
 ('the', 'U', '.'),
 ('U', '.', 'S'),
 ('.', 'S', '.'),
 ('S', '.', 'And'),
 ('.', 'And', 'Japan'),
 ('And', 'Japan', 'has'),
 ('Japan', 'has', 'raised'),
 ('has', 'raised', 'fears'),
 ('raised', 'fears', 'among'),
 ('fears', 'among', 'many'),
 ('among', 'many', 'of'),
 ('many', 'of', 'Asia'),
 ('of', 'Asia', "'"),
 ('Asia', "'", 's'),
 ("'", 's', 'exporting'),
 ('s', 'exporting', 'nations'),
 ('exporting', 'nations', 'that'),
 ('nations', 'that', 'the'),
 ('that', 'the', 'row'),
 ('the', 'row', 'could'),
 ('row', 'could', 'inflict'),
 ('could', 'inflict', 'far'),
 ('inf

In [38]:
model = defaultdict(lambda: defaultdict(lambda: 0))
# the outer defaultdict creates a new inner defaultdict every time you access a missing outer key.
# So, model[key] will itself be a defaultdict(lambda: 0).
# sp this gives a nested default dict
for w1,w2,w3 in tri_grams:
    model[(w1,w2)][w3] += 1 #How many times did the word w3 occur after the pair (w1, w2)
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values())) # for normalising, count of occurence of w1_w2
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count #gives conditional prob of occurence of w1_w2_w3


In [41]:
def predict_next_word(w1,w2):
    next_word_probs = model[w1,w2]
    if next_word_probs:
        return max(next_word_probs, key=next_word_probs.get)
    else:
        return "No prediction available"

print("Next Word:", predict_next_word('the','stock'))

Next Word: of


In [55]:
print("Next Word:", predict_next_word('between','the'))

Next Word: two


In [51]:
model

defaultdict(<function __main__.<lambda>()>,
            {('ASIAN',
              'EXPORTERS'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'FEAR': 1.0}),
             ('EXPORTERS',
              'FEAR'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'DAMAGE': 1.0}),
             ('FEAR',
              'DAMAGE'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'FROM': 1.0}),
             ('DAMAGE',
              'FROM'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'U': 0.5, 'SPILL': 0.5}),
             ('FROM',
              'U'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'.': 1.0}),
             ('U',
              '.'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'S': 0.8965354330708661,
                          'K': 0.098267716

In [44]:
model

defaultdict(<function __main__.<lambda>()>,
            {('ASIAN',
              'EXPORTERS'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'FEAR': 1.0}),
             ('EXPORTERS',
              'FEAR'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'DAMAGE': 1.0}),
             ('FEAR',
              'DAMAGE'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'FROM': 1.0}),
             ('DAMAGE',
              'FROM'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'U': 0.5, 'SPILL': 0.5}),
             ('FROM',
              'U'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'.': 1.0}),
             ('U',
              '.'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'S': 0.8965354330708661,
                          'K': 0.098267716

In [47]:
[t for t in tri_grams if t[0]=='U' and t[1]=='.']

[('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'K'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'K'),
 ('U', '.', 'S'),
 ('U', '.', 'S'),
 ('U', '.', 'K'),
 ('U', '.'