# Language models with NLTK

NLTK affords ways to create NGrams models.

In [1]:
# suppose we have already segmented sentences
sentences = [
    "un ordianteur peut vous aider",
    "il veut vous aider",
    "il veut un ordinateur",
    "il peut nager"
]

# we will tokenize the sentences : see the previous chapter for more complex tokenizations
sentWords = []
for sentence in sentences:
    sentWords.append(sentence.split())

sentWords

[['un', 'ordianteur', 'peut', 'vous', 'aider'],
 ['il', 'veut', 'vous', 'aider'],
 ['il', 'veut', 'un', 'ordinateur'],
 ['il', 'peut', 'nager']]

In [2]:
## I. Preprocessing

In [3]:
from nltk.lm.preprocessing import pad_both_ends

sentWordsPad = []

for sent in sentWords:
    sentWordsPad.append(list(pad_both_ends(sent, 2)))

sentWordsPad

[['<s>', 'un', 'ordianteur', 'peut', 'vous', 'aider', '</s>'],
 ['<s>', 'il', 'veut', 'vous', 'aider', '</s>'],
 ['<s>', 'il', 'veut', 'un', 'ordinateur', '</s>'],
 ['<s>', 'il', 'peut', 'nager', '</s>']]

## II. NGrams

### II.1. Creating NGrams list

Here we will create two lists of NGrams: Unigrams and Bigrams.

In [4]:
from nltk.util import ngrams

text_bigrams = [ngrams(sent, 2) for sent in sentWordsPad]
text_unigrams = [ngrams(sent, 1) for sent in sentWordsPad]

list(text_bigrams[0])

[('<s>', 'un'),
 ('un', 'ordianteur'),
 ('ordianteur', 'peut'),
 ('peut', 'vous'),
 ('vous', 'aider'),
 ('aider', '</s>')]

### II.2. NGrams frequencies

In [5]:
from nltk.lm import NgramCounter

#Apparently, NgramCounter modifies the structure of input list
#So, you have to execute the previous cell every time you want to execute this one
#I tried copy.deepcopy, but didn't work
ngram_counts = NgramCounter(text_bigrams + text_unigrams)

print('words following "vous" with their frequencies')
print(sorted(ngram_counts[['vous']].items()))

# (unigram count, bigram count) = (3, 2)
ngram_counts['il'], ngram_counts[['vous']]['aider']

words following "vous" with their frequencies
[('aider', 1)]


(3, 1)

## III. Vocabulary

In [6]:
from nltk.lm.preprocessing import flatten

# One list of all words
wordsList = list(flatten(sentWordsPad))

wordsList

['<s>',
 'un',
 'ordianteur',
 'peut',
 'vous',
 'aider',
 '</s>',
 '<s>',
 'il',
 'veut',
 'vous',
 'aider',
 '</s>',
 '<s>',
 'il',
 'veut',
 'un',
 'ordinateur',
 '</s>',
 '<s>',
 'il',
 'peut',
 'nager',
 '</s>']

In [7]:
from nltk.lm import Vocabulary

#in here, we used "unk_cutoff=1" in order to keep all words
vocab = Vocabulary(wordsList, unk_cutoff=1)

# show all words
sorted(vocab.counts)

['</s>',
 '<s>',
 'aider',
 'il',
 'nager',
 'ordianteur',
 'ordinateur',
 'peut',
 'un',
 'veut',
 'vous']

In [8]:
# show a word's count
# The marker "<UNK>" is added for unknown words
vocab['alien'], vocab['vous'], vocab['<UNK>']

(0, 2, 1)

In [9]:
# lookup a word in the vocabulary
vocab.lookup('alien'), vocab.lookup('vous')

('<UNK>', 'vous')

In [10]:
# a vocabulary with a higher cutoff
vocab2 = Vocabulary(wordsList, unk_cutoff=2)

# "nager" exists in our text, but its frequency is less than 2
# so, it is considered as anknown
vocab2.lookup(['alien', 'il', 'vous', 'nager'])

('<UNK>', 'il', 'vous', '<UNK>')

In [11]:
vocab2['<UNK>']

2

## IV. Language models

### IV.1. Maximum Likelihood Estimator (MLE)

In here, we will use the previous components like a pipe; padding, ngrams and vocabulary. 
We will use bigrams as example.

In [12]:
from nltk.lm.preprocessing import padded_everygram_pipeline

# create a list for vocabulary and some NGrams of order (n) and (n-1)
train, vocab = padded_everygram_pipeline(2, sentWords)

#apparently, the model has an issue with iterators
#so, I will transform them to lists explicity
vocab_l = list(vocab)
train_l = []
for t in train:
    train_l.append(list(t))

train_l[0], vocab_l

([('<s>',),
  ('<s>', 'un'),
  ('un',),
  ('un', 'ordianteur'),
  ('ordianteur',),
  ('ordianteur', 'peut'),
  ('peut',),
  ('peut', 'vous'),
  ('vous',),
  ('vous', 'aider'),
  ('aider',),
  ('aider', '</s>'),
  ('</s>',)],
 ['<s>',
  'un',
  'ordianteur',
  'peut',
  'vous',
  'aider',
  '</s>',
  '<s>',
  'il',
  'veut',
  'vous',
  'aider',
  '</s>',
  '<s>',
  'il',
  'veut',
  'un',
  'ordinateur',
  '</s>',
  '<s>',
  'il',
  'peut',
  'nager',
  '</s>'])

In [13]:
from nltk.lm import MLE

lm = MLE(2)
lm.fit(train_l, vocab_l)

print(lm.vocab)
print(lm.counts)

lm.counts[['vous']]['aider']

<Vocabulary with cutoff=1 unk_label='<UNK>' and 12 items>
<NgramCounter with 2 ngram orders and 44 ngrams>


2

In [14]:
lm.score('vous')

0.08333333333333333

In [15]:
#P(aider|vous) = 2/2 = 1
#P(peut|il) = 1/3 = 0.3333333
lm.score('aider', ['vous']), lm.score('peut', ['il'])

(1.0, 0.3333333333333333)

In [16]:
# model testing
test = [('peut', 'vous'), ('vous', 'aider')]

# entropy and perplexity
lm.entropy(test), lm.perplexity(test)

(0.5, 1.4142135623730951)

In [17]:
# text generation
lm.generate(10, random_seed=3)

['<s>',
 'il',
 'veut',
 'vous',
 'aider',
 '</s>',
 '</s>',
 'veut',
 'un',
 'ordianteur']

### IV.2. Smoothed models

In [18]:
from nltk.lm import Laplace

lm_laplace = Laplace(2)
lm_laplace.fit(train_l, vocab_l)
#P(aider|vous) = (2+1)/(2+12) 
#P(peut|il) = (1+1)/(3+12)
lm_laplace.score('aider', ['vous']), lm_laplace.score('peut', ['il'])

(0.21428571428571427, 0.13333333333333333)

In [19]:
from nltk.lm import Lidstone

# gamma = 0.5
lm_lidstone = Lidstone(0.5, 2)
lm_lidstone.fit(train_l, vocab_l)
#P(aider|vous) = (2+0.5)/(2+12*0.5) = 2.5/8
#P(peut|il) = (1+0.5)/(3+12*0.5) = 1.5/9
lm_lidstone.score('aider', ['vous']), lm_lidstone.score('peut', ['il'])

(0.3125, 0.16666666666666666)

In [20]:
from nltk.lm import KneserNeyInterpolated

lm_keserney = KneserNeyInterpolated(2, discount=0.1)
lm_keserney.fit(train_l, vocab_l)

lm_keserney.score('aider', ['vous']), lm_keserney.score('peut', ['il'])

(0.9541666666666666, 0.3055555555555555)