# Language Models N-Gram

In [18]:
import random
import nltk
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [32]:
#nltk.download()
nltk.download('reuters') 

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\Bhaarat\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [45]:
counts = Counter(reuters.words())
total_count = len(reuters.words())
 
# The most common 20 words are ...
print (counts.most_common(n=20))
# [(u'.', 94687), (u',', 72360), (u'the', 58251), (u'of', 35979), (u'to', 34035), (u'in', 26478), (u'said', 25224), (u'and', 25043), (u'a', 23492), (u'mln', 18037), (u'vs', 14120), (u'-', 13705), (u'for', 12785), (u'dlrs', 11730), (u"'", 11272), (u'The', 10968), (u'000', 10277), (u'1', 9977), (u's', 9298), (u'pct', 9093)]
 
# Compute the frequencies
for word in counts:
    counts[word] /= float(total_count)
 
# The frequencies should add up to 1
print (sum(counts.values()))  # 1.0
 
# Generate 100 words of language
text = []
 
for _ in range(100):
    r = random.random()
    accumulator = .0
 
    for word, freq in counts.items():
        accumulator += freq
 
        if accumulator >= r:
            text.append(word)
            break
 
print (' '.join(text))
#

[('.', 94687), (',', 72360), ('the', 58251), ('of', 35979), ('to', 34035), ('in', 26478), ('said', 25224), ('and', 25043), ('a', 23492), ('mln', 18037), ('vs', 14120), ('-', 13705), ('for', 12785), ('dlrs', 11730), ("'", 11272), ('The', 10968), ('000', 10277), ('1', 9977), ('s', 9298), ('pct', 9093)]
1.0000000000006808
295 SETS Barnett . earnings remain investor quarter according held mln exchange LABORATORIES maintain government based to to breached dumping its said Avg United " offer loss pilots . government Securities Ltd an broadly bank . 150 banks mainly will STAKE expected days billion paid its Oil the S Fund no strong program its in , " said on TAKEOVER organization ' , Republic , filing recent are said , EC 2 three Ark as , ROSE government . then the fully s RADIATION 316p owns INC East about ) substantial 1 O on an data a U costs 7


In [None]:
first_sentence = reuters.sents()[0]
print (first_sentence) # [u'ASIAN', u'EXPORTERS', u'FEAR', u'DAMAGE', u'FROM' ...
 
# Get the bigrams
print (list(bigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'), ...
 
# Get the padded bigrams
print (list(bigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, u'ASIAN'), (u'ASIAN', u'EXPORTERS'), (u'EXPORTERS', u'FEAR'), (u'FEAR', u'DAMAGE'), (u'DAMAGE', u'FROM'),
 
# Get the trigrams
print (list(trigrams(first_sentence))) # [(u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM'), ...
 
# Get the padded trigrams
print (list(trigrams(first_sentence, pad_left=True, pad_right=True))) # [(None, None, u'ASIAN'), (None, u'ASIAN', u'EXPORTERS'), (u'ASIAN', u'EXPORTERS', u'FEAR'), (u'EXPORTERS', u'FEAR', u'DAMAGE'), (u'FEAR', u'DAMAGE', u'FROM') ...
 

In [None]:
model = defaultdict(lambda: defaultdict(lambda: 0))
 
for sentence in brown.sents():
    for w1, w2, w3 in trigrams(sentence, pad_right=True, pad_left=True):
        model[(w1, w2)][w3] += 1
 
 
print (model["what", "the"]["economists"]) # "economists" follows "what the" 2 times
print (model["what", "the"]["nonexistingword"]) # 0 times
print (model[None, None]["The"]) # 8839 sentences start with "The"
 
# Let's transform the counts to probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
 
print (model["what", "the"]["economists"]) # 0.0434782608696
print (model["what", "the"]["nonexistingword"]) # 0.0
print (model[None, None]["The"]) # 0.161543241465

In [36]:
import random
 
text = [None, None]
 
sentence_finished = False
 
while not sentence_finished:
    r = random.random()
    accumulator = .0
 
    for word in model[tuple(text[-2:])].keys():
        accumulator += model[tuple(text[-2:])][word]
 
        if accumulator >= r:
            text.append(word)
            break
 
    if text[-2:] == [None, None]:
        sentence_finished = True
 
print (' '.join([t for t in text if t]))
 

Who loved her and wrote down everything she overheard between her memory of Albert of Habsburg also worked on the history of Sweden in the enabling legislation .


N-gram based language models do have a few drawbacks:
1. The higher the N, the better is the model usually. But this leads to lots of computation overhead that requires
large computation power in terms of RAM
2. N-grams are a sparse representation of language. This is because we build the model based on the probability
of words co-occurring. It will give zero probability to all the words that are not present in the training corpus