# **N-Grams Language Model**

In [None]:
!pip install -U pip
!pip install -U dill
!pip install -U nltk==3.4

Collecting pip
  Using cached pip-22.2.2-py3-none-any.whl (2.0 MB)


ERROR: To modify pip, please run the following command:
C:\Users\admin\AppData\Local\Programs\Python\Python39\python.exe -m pip install -U pip

[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.2.1 -> 22.2.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# Importing the necessary libraries
from nltk.util import pad_sequence
from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams
from nltk.lm.preprocessing import pad_both_ends
from nltk.lm.preprocessing import flatten

In [None]:
# Toy corpus: two already-tokenised sentences, stored as a list of token lists.
text = [
    ['a', 'b', 'c'],
    ['a', 'c', 'd', 'c', 'e', 'f'],
]

In [None]:
# Bigrams (adjacent token pairs) of the first sentence.
[*bigrams(text[0])]

[('a', 'b'), ('b', 'c')]

In [None]:
# Trigrams (n=3) of the second sentence.
[*ngrams(text[1], n=3)]

[('a', 'c', 'd'), ('c', 'd', 'c'), ('d', 'c', 'e'), ('c', 'e', 'f')]

## **Padding**
Padding adds explicit start (`<s>`) and end (`</s>`) symbols to each sentence so that the model can learn which words tend to begin and end a sentence.

In [None]:
# Pad the first sentence for a bigram model (n=2): the start symbol "<s>" goes
# on the left and the end symbol "</s>" on the right.
padded_sent = list(
    pad_sequence(
        text[0],
        n=2,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
)

In [None]:
# Display the padded first sentence.
padded_sent

['<s>', 'a', 'b', 'c', '</s>']

In [None]:
# 2-grams of the padded sentence; the padding symbols take part in the pairs.
[*ngrams(padded_sent, n=2)]

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

In [None]:
# `n` is the n-gram order and controls how much padding is added on each side:
# a 2-gram model pads once, a 3-gram model pads twice, and so on.
list(
    pad_sequence(
        text[0],
        n=3,
        pad_left=True,
        left_pad_symbol="<s>",
        pad_right=True,
        right_pad_symbol="</s>",
    )
)

['<s>', '<s>', 'a', 'b', 'c', '</s>', '</s>']

In [None]:
# pad_both_ends is a shortcut that pads both sides with "<s>" / "</s>".
[*pad_both_ends(text[0], n=2)]

['<s>', 'a', 'b', 'c', '</s>']

Combining the two parts discussed so far (padding and n-gram extraction), we get the following preparation steps for one sentence.

In [None]:
# Pad the first sentence, then take the bigrams of the padded token stream.
padded_first_sentence = pad_both_ends(text[0], n=2)
list(bigrams(padded_first_sentence))

[('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]

## **All-grams**

### **For a bigram model we need both the bigram and the unigram counts; likewise, a trigram model needs both the trigram and the bigram counts**
### $P(w_i|w_{i - 1}) = \frac{count(w_{i-1},w_i)}{count(w_{i - 1})}$
### $P(w_i|w_{i - 2},w_{i - 1}) = \frac{count(w_{i-2},w_{i - 1}, w_i)}{count(w_{i - 2}, w_{i-1})}$

In [None]:
# Introducing everygrams: with max_len=2 it yields every unigram and every
# bigram of the padded sentence.
from nltk.util import everygrams

padded_bigrams = [*pad_both_ends(text[0], n=2)]  # padded tokens of sentence 1
all_grams = [*everygrams(padded_bigrams, max_len=2)]
all_grams

[('<s>',),
 ('a',),
 ('b',),
 ('c',),
 ('</s>',),
 ('<s>', 'a'),
 ('a', 'b'),
 ('b', 'c'),
 ('c', '</s>')]

In [None]:
# Count the unigrams and bigrams produced from the first sentence [a, b, c].
# Any tuple that is not of length 1 is tallied as a bigram.
count = {}
for gram in all_grams:
    key = "unigrams" if len(gram) == 1 else "bigrams"
    count[key] = count.get(key, 0) + 1

In [None]:
# Display the unigram / bigram tallies.
count

{'unigrams': 5, 'bigrams': 4}

## **Generating Vocabulary**

In [None]:
# Pad every sentence and chain the results into a single flat list of tokens.
# This is a token stream (repeated words included), i.e. the text a vocabulary
# is built from rather than a de-duplicated vocabulary itself.
from nltk.lm.preprocessing import flatten

padded_per_sentence = (pad_both_ends(sent, n=2) for sent in text)
list(flatten(padded_per_sentence))

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline

# One call prepares both inputs for an order-2 (bigram) model from the toy corpus.
train, vocab = padded_everygram_pipeline(order=2, text=text)

So as to avoid re-creating the text in memory, both `train` and `vocab` are lazy iterators. They are evaluated on demand at training time.

For the sake of understanding the output of `padded_everygram_pipeline`, we'll "materialize" the lazy iterators by casting them into a list.

In [None]:
# Input corpus: [['a', 'b', 'c'], ['a', 'c', 'd', 'c', 'e', 'f']]
# training_ngrams yields, for each sentence, all of its padded unigrams and bigrams.
training_ngrams, padded_sentences = padded_everygram_pipeline(2, text)
for sentence_no, sentence_grams in enumerate(training_ngrams, start=1):
    print(f"n-grams for sentence {sentence_no}")
    print(list(sentence_grams))

n-grams for sentence 1
[('<s>',), ('a',), ('b',), ('c',), ('</s>',), ('<s>', 'a'), ('a', 'b'), ('b', 'c'), ('c', '</s>')]
n-grams for sentence 2
[('<s>',), ('a',), ('c',), ('d',), ('c',), ('e',), ('f',), ('</s>',), ('<s>', 'a'), ('a', 'c'), ('c', 'd'), ('d', 'c'), ('c', 'e'), ('e', 'f'), ('f', '</s>')]


In [None]:
# padded_sentences is the flat stream of padded tokens (repeats included);
# it is the text from which the model's vocabulary is built.
[*padded_sentences]

['<s>', 'a', 'b', 'c', '</s>', '<s>', 'a', 'c', 'd', 'c', 'e', 'f', '</s>']

## **Let's go for some real data now**

In [None]:
# Fetch the Brown corpus into the local nltk_data directory.
import nltk

nltk.download("brown")

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
# List the ids of all texts available in the Brown corpus.
all_texts = nltk.corpus.brown.fileids()

text_total = len(all_texts)
print("Total Number of texts :", text_total)

first_ten_ids = all_texts[:10]
print(first_ten_ids)

Total Number of texts : 500
['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', 'ca08', 'ca09', 'ca10']


In [None]:
# Build one flat list with every sentence of the Brown corpus; each sentence
# is stored as a plain list of tokens.
full_brown = []

# The loop variable is deliberately not named `text`: that name holds the toy
# corpus used by the earlier cells, and overwriting it here would break those
# cells whenever they are re-run after this one.
for fileid in all_texts:
    full_brown.extend(list(sentence) for sentence in nltk.corpus.brown.sents(fileid))

print("Total number of sentences in Brown corpus :", len(full_brown))
print()
# Preview the first ten sentences.
for sentence in full_brown[:10]:
    print(sentence)

Total number of sentences in Brown corpus : 57340

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']
['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.']
['The', 'September-October', 'term', 'jury', 'had', 'been', 'charged', 'by', 'Fulton', 'Superior', 'Court', 'Judge', 'Durwood', 'Pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'Mayor-nominate', 'Ivan', 'Allen', 'Jr.', '.']
['``', 'Only',

## **Training our N-gram Model**

In [None]:
from nltk.lm import MLE

# Order of the language model; 3 means a 3-gram (trigram) model.
n = 3
# Create the MLE model of that order; it is trained later with .fit().
model = MLE(order=n)

In [None]:
# Prepare the tokenised Brown sentences for 3-gram language modelling.
# Both results are lazy iterators, so re-run this cell to rebuild them before
# fitting the model again.
train_data, padded_sents = padded_everygram_pipeline(order=n, text=full_brown)

In [None]:
%%time
model.fit(train_data, padded_sents)
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 56060 items>
Wall time: 32.5 s


In [None]:
# Number of items in the fitted vocabulary.
len(model.vocab)

56060

### **Vocab lookup feature**

In [None]:
# Look up a sentence taken from the training data: all of its tokens are in
# the vocabulary, so they come back unchanged.
known_sentence = full_brown[1]
print(model.vocab.lookup(known_sentence))

('The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.')


In [None]:
# Looking up a sentence that is not from the training data: words missing from
# the vocabulary are automatically replaced with `<UNK>`.
unseen_tokens = 'language is never random lah .'.split()
print(model.vocab.lookup(unseen_tokens))

('language', 'is', 'never', 'random', '<UNK>', '.')


**Note:** For more sophisticated ngram models, take a look at [these objects from `nltk.lm.models`](https://github.com/nltk/nltk/blob/develop/nltk/lm/models.py):

 - `Lidstone`: Provides Lidstone-smoothed scores.
 - `Laplace`: Implements Laplace (add one) smoothing.
 - `InterpolatedLanguageModel`: Logic common to all interpolated language models (Chen & Goodman 1995).
 - `WittenBellInterpolated`: Interpolated version of Witten-Bell smoothing.

## **Let's use the model**

For n-gram models, training boils down to counting the n-grams that occur in the training corpus.

In [None]:
# Print a summary of the n-gram counter held by the trained model.
print(model.counts)

<NgramCounter with 3 ngram orders and 3999636 ngrams>


In [None]:
# Raw count of the unigram 'language' in the counter.
model.counts['language'] # i.e. Count('language')

106

In [None]:
# Score of 'language' with no context, i.e. its unigram probability.
model.score('language') # P('language')

7.622872068070809e-05

In [None]:
# P('is' | 'language'): the context is passed as a list of preceding tokens.
model.score('is', ['language'])

0.05660377358490566

In [None]:
# Example sentence prefix; the cells below score candidate next words after "terms of".
sent1 = "My unscientific friend does not believe that human stature is measurable in terms of"

In [None]:
# P('speed' | 'terms of')
model.score('speed', ['terms', 'of'])

0.012345679012345678

In [None]:
# P('distance' | 'terms of')
model.score('distance', ['terms', 'of'])

0.0

In [None]:
# P('time' | 'terms of')
model.score('time', ['terms', 'of'])

0.012345679012345678

In [None]:
# P('mass' | 'terms of')
model.score('mass', ['terms', 'of'])

0.0

In [None]:
# Check that the out-of-vocabulary word "lah" gets the same score as "<UNK>".
model.score("lah") == model.score("<UNK>")

True

In [None]:
# Check that the out-of-vocabulary word "leh" gets the same score as "<UNK>".
model.score("leh") == model.score("<UNK>")

True

In [None]:
# Check that the out-of-vocabulary word "lor" gets the same score as "<UNK>".
model.score("lor") == model.score("<UNK>")

True

In [None]:
# Log-probability of 'time' given 'terms of'; working in log space avoids
# underflow when many small probabilities are combined.
model.logscore('time', ['terms', 'of'])

-6.339850002884625

## **Saving the model** 

Python's built-in `pickle` may not be able to save the lambda functions in the model, so we use the `dill` library in place of `pickle` to save and load the language model.


In [None]:
# dill is imported under the name `pickle`, so the familiar dump/load calls
# below are dill's, not the standard library's.
import dill as pickle

# Serialise the trained model to disk.
with open('brown_3gram_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Restore the model saved in the previous step.
# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
with open('brown_3gram_model.pkl', 'rb') as model_file:
    model_loaded = pickle.load(model_file)

In [None]:
# Sanity check: the reloaded model should give the same log-score as the original.
model_loaded.logscore('time', ['terms', 'of'])

-6.339850002884625

# **POS Tagging using spacy**

In [1]:
import spacy

In [45]:
# Load the spaCy pipeline and run it over a two-sentence example.
nlp = spacy.load("en_core_web_sm")
doc = nlp("This is the second tutorial of NLP. I hope you are enjoying it.")

# One row per token: token | POS tag | plain-English explanation of the tag.
for token in doc:
    pos_tag = token.pos_
    print(token, " | ", pos_tag, " | ", spacy.explain(pos_tag))

This  |  PRON  |  pronoun
is  |  AUX  |  auxiliary
the  |  DET  |  determiner
second  |  ADJ  |  adjective
tutorial  |  NOUN  |  noun
of  |  ADP  |  adposition
NLP  |  PROPN  |  proper noun
.  |  PUNCT  |  punctuation
I  |  PRON  |  pronoun
hope  |  VERB  |  verb
you  |  PRON  |  pronoun
are  |  AUX  |  auxiliary
enjoying  |  VERB  |  verb
it  |  PRON  |  pronoun
.  |  PUNCT  |  punctuation


In [46]:
import spacy
from spacy import displacy

# Draw the dependency parse of `doc` inside the notebook.
displacy.render(doc, style="dep", jupyter=True)

In [47]:
# Ask spaCy for a plain-English description of the "amod" label.
spacy.explain("amod")

'adjectival modifier'

## **Detailed tagging**

In [35]:
# Two sentences using "cut" / "cuts", to compare the detailed tags of the verb.
doc1, doc2 = [
    nlp(sentence)
    for sentence in ('He cut the cost yesterday.', 'He cuts his finger today')
]

In [36]:
# One row per token of doc1: token | POS tag | explanation | detailed tag | explanation.
for token in doc1:
    pos_tag, detailed_tag = token.pos_, token.tag_
    print(
        token, " | ", pos_tag, " | ", spacy.explain(pos_tag),
        " | ", detailed_tag, " | ", spacy.explain(detailed_tag),
    )

He  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
cut  |  VERB  |  verb  |  VBD  |  verb, past tense
the  |  DET  |  determiner  |  DT  |  determiner
cost  |  NOUN  |  noun  |  NN  |  noun, singular or mass
yesterday  |  NOUN  |  noun  |  NN  |  noun, singular or mass
.  |  PUNCT  |  punctuation  |  .  |  punctuation mark, sentence closer


In [37]:
# One row per token of doc2: token | POS tag | explanation | detailed tag | explanation.
for token in doc2:
    pos_tag, detailed_tag = token.pos_, token.tag_
    print(
        token, " | ", pos_tag, " | ", spacy.explain(pos_tag),
        " | ", detailed_tag, " | ", spacy.explain(detailed_tag),
    )

He  |  PRON  |  pronoun  |  PRP  |  pronoun, personal
cuts  |  VERB  |  verb  |  VBZ  |  verb, 3rd person singular present
his  |  PRON  |  pronoun  |  PRP$  |  pronoun, possessive
finger  |  NOUN  |  noun  |  NN  |  noun, singular or mass
today  |  NOUN  |  noun  |  NN  |  noun, singular or mass
