# <center> PROBABILISTIC LANGUAGE MODELS </center>

#### The main idea is to compute the probability of a sentence or a sequence of words

# Sample Text

In [1]:
# Three-sentence sample used throughout the notebook. Adjacent string
# literals are joined by the parser, so each piece carries its own
# leading space.
text = (
    'I have been driving my car all night long.'
    ' I do not know when exactly did I miss my exit.'
    ' But, I am QUITE sure I was paying attention to the road.'
)
text

'I have been driving my car all night long. I do not know when exactly did I miss my exit. But, I am QUITE sure I was paying attention to the road.'

# Bigrams

In [2]:
from nltk import bigrams

# Whitespace splitting only, so punctuation stays attached to its word.
for first, second in bigrams(text.split()):
    print((first, second))


('I', 'have')
('have', 'been')
('been', 'driving')
('driving', 'my')
('my', 'car')
('car', 'all')
('all', 'night')
('night', 'long.')
('long.', 'I')
('I', 'do')
('do', 'not')
('not', 'know')
('know', 'when')
('when', 'exactly')
('exactly', 'did')
('did', 'I')
('I', 'miss')
('miss', 'my')
('my', 'exit.')
('exit.', 'But,')
('But,', 'I')
('I', 'am')
('am', 'QUITE')
('QUITE', 'sure')
('sure', 'I')
('I', 'was')
('was', 'paying')
('paying', 'attention')
('attention', 'to')
('to', 'the')
('the', 'road.')


# Trigrams

In [3]:
from nltk import trigrams

# Same whitespace tokens as before, now in overlapping windows of three.
for first, second, third in trigrams(text.split()):
    print((first, second, third))

('I', 'have', 'been')
('have', 'been', 'driving')
('been', 'driving', 'my')
('driving', 'my', 'car')
('my', 'car', 'all')
('car', 'all', 'night')
('all', 'night', 'long.')
('night', 'long.', 'I')
('long.', 'I', 'do')
('I', 'do', 'not')
('do', 'not', 'know')
('not', 'know', 'when')
('know', 'when', 'exactly')
('when', 'exactly', 'did')
('exactly', 'did', 'I')
('did', 'I', 'miss')
('I', 'miss', 'my')
('miss', 'my', 'exit.')
('my', 'exit.', 'But,')
('exit.', 'But,', 'I')
('But,', 'I', 'am')
('I', 'am', 'QUITE')
('am', 'QUITE', 'sure')
('QUITE', 'sure', 'I')
('sure', 'I', 'was')
('I', 'was', 'paying')
('was', 'paying', 'attention')
('paying', 'attention', 'to')
('attention', 'to', 'the')
('to', 'the', 'road.')


# Building a trigram Probability model

In [4]:
from nltk import word_tokenize, sent_tokenize

# Split the text into sentences, then each sentence into lower-cased
# word tokens, ready for 3-gram language modelling.
tokenized_text = [
    [token.lower() for token in word_tokenize(sentence)]
    for sentence in sent_tokenize(text)
]
tokenized_text

[['i', 'have', 'been', 'driving', 'my', 'car', 'all', 'night', 'long', '.'],
 ['i',
  'do',
  'not',
  'know',
  'when',
  'exactly',
  'did',
  'i',
  'miss',
  'my',
  'exit',
  '.'],
 ['but',
  ',',
  'i',
  'am',
  'quite',
  'sure',
  'i',
  'was',
  'paying',
  'attention',
  'to',
  'the',
  'road',
  '.']]

# Using the built-in Maximum Likelihood Estimation

In [5]:
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

# Highest n-gram order of the model (3 => trigrams).
n = 3
# NOTE(review): presumably yields the padded n-grams to train on plus the
# padded word stream used to build the vocabulary -- confirm against the
# nltk.lm documentation.
train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

model = MLE(n) # Let's train a 3-gram maximum likelihood estimation model.
model.fit(train_data, padded_sents)

# Getting counts

In [6]:
print(model.counts['i']) # i.e. Count('i'): occurrences of the unigram 'i'
print(model.counts[['have']]['been']) # i.e. Count('have been'): 'been' seen right after 'have'
print(model.counts[['paying', 'attention']]['to']) # i.e. Count('paying attention to'): 'to' seen right after 'paying attention'

5
1
1


# Getting Probabilities

In [7]:
print(model.score('was','i'.split())) # i.e. P('was' | 'i')
print(model.score('to', 'paying attention'.split())) # i.e. P('to' | 'paying attention')

0.2
1.0


P(was | i) = count(i was) / count(i) = 1 / 5 = 0.2

# Part of Speech Tagging (POS tagging)

Part-of-speech tags are used for grammar analysis and word sense disambiguation.
For example, the word "duck" could refer to a bird, or it could be a verb indicating a downward
motion. Computers cannot tell the difference without additional information, such
as part-of-speech tags.

### Using the default tagger to tag and untag

In [8]:
from nltk.tag import DefaultTagger

# Baseline tagger: every token receives the single tag given here.
tagger = DefaultTagger('NN')
tagger.tag([
    'Hello', 'World', 'is', 'quite', 'literally',
    'in', 'every', 'CS', 'Tutorial',
])

[('Hello', 'NN'),
 ('World', 'NN'),
 ('is', 'NN'),
 ('quite', 'NN'),
 ('literally', 'NN'),
 ('in', 'NN'),
 ('every', 'NN'),
 ('CS', 'NN'),
 ('Tutorial', 'NN')]

In [9]:
from nltk.tag import untag
# untag drops the tag from each (word, tag) pair, leaving just the words.
untag([('Hello', 'NN'), ('World', 'NN')])

['Hello', 'World']

### Using the built-in pos_tag function

In [10]:
from nltk import pos_tag
# Tokenize the sentence first; pos_tag is given the token list and the
# resulting (word, tag) pairs are printed.
print(pos_tag(word_tokenize('The boy ate the amazingly delicious cake')))

[('The', 'DT'), ('boy', 'NN'), ('ate', 'VB'), ('the', 'DT'), ('amazingly', 'RB'), ('delicious', 'JJ'), ('cake', 'NN')]


### Checking the English meaning of the tags

In [11]:
# Alias the import so it does not shadow Python's built-in help() for the
# rest of the session.
from nltk import help as nltk_help

# Print the Brown-tagset description and examples for the 'JJ' tag.
# NOTE(review): the pos_tag output earlier uses tags such as 'DT' and 'VB',
# which look like Penn Treebank tags; nltk's upenn_tagset lookup may be the
# better match for those -- confirm.
nltk_help.brown_tagset('JJ')

JJ: adjective
    ecent over-all possible hard-fought favorable hard meager fit such
    widespread outmoded inadequate ambiguous grand clerical effective
    orderly federal foster general proportionate ...


### Loading the brown corpus

In [12]:
from nltk.corpus import brown

all_tagged = brown.tagged_sents()
# Peek at the first two tagged sentences: a list of lists of (word, tag) pairs.
print(all_tagged[:2])
print('Number of sents in brown corpus', len(all_tagged))
# Token count is the sum of the sentence lengths.
print('Number of tokens in brown corpus', sum(map(len, all_tagged)))


[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

### Splitting tagged sentences into training and testing

In [17]:
# Train on the first 500 tagged sentences; hold out everything after them
# for testing.
train, test = all_tagged[:500], all_tagged[500:]

### Using the classifier-based POS tagger

The ClassifierBasedPOSTagger class uses classification to do part-of-speech tagging.
Features are extracted from words, and then passed to an internal classifier. The classifier
classifies the features and returns a label, in this case, a part-of-speech tag.

***PS: RUNNING THIS NEXT CELL IS GOING TO TAKE A WHILE***

In [14]:
from nltk.tag.sequential import ClassifierBasedPOSTagger
# Rebinds `tagger` (previously the DefaultTagger) to a tagger trained on `train`.
tagger = ClassifierBasedPOSTagger(train=train, verbose=True)
# Score the tagger on the held-out `test` sentences.
# NOTE(review): newer NLTK releases reportedly deprecate evaluate() in favour
# of accuracy() -- confirm for the installed version.
tagger.evaluate(test)

Constructing training corpus for classifier.
Training classifier (11711 instances)


0.7795781383996306

### Test the classifier on a random sentence

In [15]:
# Tag a fresh sentence with the classifier-based tagger trained earlier in the notebook.
print(tagger.tag(word_tokenize('He watched the play')))

[('He', 'PPS'), ('watched', 'VBD'), ('the', 'AT'), ('play', 'NN')]


### TASK: Build a classifier-based POS tagger for the Gutenberg corpus