## Ngram language model

## Bigram text generation

In [1]:
import nltk

In [2]:
# A short tokenized sentence used to show what nltk.bigrams produces.
sent = ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven',
        'and', 'the', 'earth', '.']
list(nltk.bigrams(sent))

[('In', 'the'),
 ('the', 'beginning'),
 ('beginning', 'God'),
 ('God', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the'),
 ('the', 'earth'),
 ('earth', '.')]

In [3]:
'''Generating Random Text: this program obtains all bigrams
             from the text of the book of Genesis, then constructs a
             conditional frequency distribution to record which
             words are most likely to follow a given word; e.g., after
             the word 'living', the most likely word is
             'creature'; the generate_model() function uses this
             data, and a seed word, to generate random text.'''

def generate_model(cfdist, word, num=15):
    """Print a chain of up to ``num`` words by greedily following bigram counts.

    Starting from the seed ``word``, each step prints the current word and then
    moves on to the most frequent successor recorded for it in ``cfdist``.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            that follow it. Each value must provide ``max()`` and be falsy when
            empty (as the entries of ``nltk.ConditionalFreqDist`` are).
        word: Seed word; it is always the first word printed.
        num: Maximum number of words to print.

    The chain stops early when the current word has no recorded successor
    (for example an unseen seed word), instead of letting ``max()`` raise
    ``ValueError`` on an empty distribution.
    """
    for _ in range(num):
        print(word)
        followers = cfdist[word]
        if not followers:
            # Dead end: nothing was ever observed after this word.
            break
        word = followers.max()

In [4]:
# Build, for every word of the King James Genesis text, the frequency
# distribution of the words that follow it.
# Requires the 'genesis' corpus to be installed (nltk.download('genesis')).
# The intermediate names are specific on purpose: a later cell imports the
# `bigrams` function from nltk.util and other cells reuse the name `text`,
# so generic names here would clash when cells are re-run out of order.
genesis_words = nltk.corpus.genesis.words('english-kjv.txt')
genesis_bigrams = nltk.bigrams(genesis_words)
cfd = nltk.ConditionalFreqDist(genesis_bigrams)

In [5]:
cfd['living']

FreqDist({'creature': 7, 'thing': 4, 'substance': 2, 'soul': 1, '.': 1, ',': 1})

In [6]:
generate_model(cfd, 'living')

living
creature
that
he
said
,
and
the
land
of
the
land
of
the
land


## N-gram Language Modeling with NLTK

This section is based on the following tutorial:
https://www.kaggle.com/alvations/n-gram-language-model-with-nltk

In [7]:
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [8]:
# Assume we have the following data structure
text = [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]

In [9]:
# For bigram language model, you need to create bigrams first
# In this example, we apply the bigram method to the first list.
list(bigrams(text[0]))

[('a', 'b'), ('b', 'c')]

In [10]:
# In this example, we apply the ngram method, where n=3 
# to the second list
list(ngrams(text[1], n=3))

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

In [11]:
# Insert padding symbols: the number of padding symbols on each side is
# n - 1: one start and one end padding symbol for bigrams;
# two start and two end symbols for trigrams, etc.
from nltk.util import pad_sequence
list(pad_sequence(text[0],
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=2)) 

['<s>', 'a', 'b', 'c', '</s>']

In [12]:
# Apply the bigram function over the padded sequence (first list)
padded_sent = list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<s>", 
                                pad_right=True, right_pad_symbol="</s>", n=2))
list(ngrams(padded_sent, n=2))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [13]:
list(pad_sequence(text[0],
                  pad_left=True, left_pad_symbol="<s>",
                  pad_right=True, right_pad_symbol="</s>",
                  n=3)) 

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [14]:
# Apply the ngram (n=3) function over the padded sequence (first list)
padded_sent = list(pad_sequence(text[0], pad_left=True, left_pad_symbol="<s>", 
                                pad_right=True, right_pad_symbol="</s>", n=3))
list(ngrams(padded_sent, n=3))

[('<s>', '<s>', 'a'),
 ('<s>', 'a', 'b'),
 ('a', 'b', 'c'),
 ('b', 'c', '</s>'),
 ('c', '</s>', '</s>')]

In [15]:
from nltk.util import everygrams
padded_bigrams = list(pad_both_ends(text[0], n=2))
list(everygrams(padded_bigrams, max_len=2))

[('<s>',),
 ('<s>', 'a'),
 ('a',),
 ('a', 'b'),
 ('b',),
 ('b', 'c'),
 ('c',),
 ('c', '</s>'),
 ('</s>',)]

In [16]:
# Our model will be trained and evaluated based on a vocabulary.
# The following flattens the padded sentences of our original data structure
# into one token list, so that the padding symbols are part of the vocabulary
from nltk.lm.preprocessing import flatten
list(flatten(pad_both_ends(sent, n=2) for sent in text))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [17]:
# The built in padded_everygram_pipeline 
# performs the above steps
from nltk.lm.preprocessing import padded_everygram_pipeline
train, vocab = padded_everygram_pipeline(2, text)

In [18]:
# Show both results of padded_everygram_pipeline: the everygrams of each
# padded sentence (one list per sentence) and the flattened padded tokens.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
for sentence_ngrams in training_ngrams:
    # A blank line after each sentence keeps the long lists apart.
    print(list(sentence_ngrams), end='\n\n')
print('#############')
list(padded_sentences)

[('<s>',), ('<s>', 'a'), ('a',), ('a', 'b'), ('b',), ('b', 'c'), ('c',), ('c', '</s>'), ('</s>',)]

[('<s>',), ('<s>', 'a'), ('a',), ('a', 'c'), ('c',), ('c', 'd'), ('d',), ('d', 'c'), ('c',), ('c', 'e'), ('e',), ('e', 'f'), ('f',), ('f', '</s>'), ('</s>',)]

#############


['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

## Working with real data

In [19]:
# NLTK has built in tokenizers
from nltk import word_tokenize, sent_tokenize 

In [21]:
nltk.download('brown')

[nltk_data] Downloading package brown to /Users/Catherine/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [22]:
# Get the sentences from the Brown corpus
text = nltk.corpus.brown.sents(categories=['news', 'fiction'])
text[:2]

[['The',
  'Fulton',
  'County',
  'Grand',
  'Jury',
  'said',
  'Friday',
  'an',
  'investigation',
  'of',
  "Atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.'],
 ['The',
  'jury',
  'further',
  'said',
  'in',
  'term-end',
  'presentments',
  'that',
  'the',
  'City',
  'Executive',
  'Committee',
  ',',
  'which',
  'had',
  'over-all',
  'charge',
  'of',
  'the',
  'election',
  ',',
  '``',
  'deserves',
  'the',
  'praise',
  'and',
  'thanks',
  'of',
  'the',
  'City',
  'of',
  'Atlanta',
  "''",
  'for',
  'the',
  'manner',
  'in',
  'which',
  'the',
  'election',
  'was',
  'conducted',
  '.']]

In [23]:
# Lower-case every token, sentence by sentence, then preview two sentences
tokenized_text = [[token.lower() for token in sentence] for sentence in text]
tokenized_text[:2]

[['the',
  'fulton',
  'county',
  'grand',
  'jury',
  'said',
  'friday',
  'an',
  'investigation',
  'of',
  "atlanta's",
  'recent',
  'primary',
  'election',
  'produced',
  '``',
  'no',
  'evidence',
  "''",
  'that',
  'any',
  'irregularities',
  'took',
  'place',
  '.'],
 ['the',
  'jury',
  'further',
  'said',
  'in',
  'term-end',
  'presentments',
  'that',
  'the',
  'city',
  'executive',
  'committee',
  ',',
  'which',
  'had',
  'over-all',
  'charge',
  'of',
  'the',
  'election',
  ',',
  '``',
  'deserves',
  'the',
  'praise',
  'and',
  'thanks',
  'of',
  'the',
  'city',
  'of',
  'atlanta',
  "''",
  'for',
  'the',
  'manner',
  'in',
  'which',
  'the',
  'election',
  'was',
  'conducted',
  '.']]

In [24]:
# Prepare tokenized text for language modeling with trigrams (n=3)
n=3
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

## The training phase

In [25]:
# Import the built-in maximum likelihood estimator (MLE) and create an
# untrained model. Its order is taken from `n`, the same value used to build
# the training n-grams, so the model and the training data cannot drift apart.
from nltk.lm import MLE
model = MLE(n)

In [26]:
# Initialization
len(model.vocab)

0

In [27]:
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 17724 items>


In [28]:
len(model.vocab)

17724

In [29]:
print(model.vocab.lookup(tokenized_text[0]))

('the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.')


In [30]:
# Application of the lookup function to unseen data
# yields UNK for vocabulary that has not been encountered
print(model.vocab.lookup('the fulton country blah.'.split()))

('the', 'fulton', 'country', '<UNK>')


## Using the model

In [31]:
print(model.counts)

<NgramCounter with 3 ngram orders and 586974 ngrams>


In [32]:
# Provides counts of the unigram jury
model.counts['jury']

46

In [33]:
# Counts for the bigram 'said that', i.e. Count('that'|'said')
model.counts[['said']]['that'] 

36

In [34]:
# Counts for the trigram 'said that the'
# i.e. Count('the'|'said that')
model.counts[['said', 'that']]['the'] 

10

In [35]:
# The unigram probability of the word 'jury'
model.score('jury') # P('jury')

0.00022490588177773431

In [36]:
model.score('that', 'said'.split())  # P('that'|'said')

0.06

In [37]:
model.score('the', 'said that'.split())  # P('the'|'said that')

0.2777777777777778

In [38]:
model.score("<UNK>") == model.score("blah")

True

In [39]:
model.logscore('the', 'said that'.split())

-1.84799690655495