# n-gram Language Modelling

In [1]:
import nltk

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tag import pos_tag

def preprocessSent(sent):
    """Tokenize one sentence string and part-of-speech tag its tokens.

    The sentence is split with ``nltk.word_tokenize`` and the resulting token
    list is passed to ``nltk.pos_tag``; the tagger's result is returned as is.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

def preprocessText(text, tag = True):
    """Split raw text into sentences, tokenize them and optionally POS-tag them.

    Parameters
    ----------
    text : str
        Raw, untokenized text.
    tag : bool, default True
        When truthy, every tokenized sentence is passed through
        ``nltk.pos_tag``; otherwise the plain token lists are returned.

    Returns
    -------
    list
        One entry per sentence found by ``sent_tokenize``: a list of tokens,
        or the output of ``nltk.pos_tag`` for that sentence when ``tag`` is set.
    """
    # Segment text into sentences, then tokenize each sentence
    sentences = [nltk.word_tokenize(s) for s in sent_tokenize(text)]

    # Part-of-speech tag each sentence only on request
    if tag:
        sentences = [nltk.pos_tag(tokens) for tokens in sentences]
    return sentences


## Load NLTK books

In [2]:
# import nltk resources
import nltk
from nltk.book import *


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [3]:
## convert text1 into a sentence-segmented, tokenized list of lists
# text1 comes from `from nltk.book import *`; its tokens are joined with spaces
# because preprocessText expects a single raw string to segment.
text1_sent = preprocessText(" ".join(text1), tag=False) # no tagging
# preview the first two sentences
text1_sent[:2]

[['[',
  'Moby',
  'Dick',
  'by',
  'Herman',
  'Melville',
  '1851',
  ']',
  'ETYMOLOGY',
  '.'],
 ['(',
  'Supplied',
  'by',
  'a',
  'Late',
  'Consumptive',
  'Usher',
  'to',
  'a',
  'Grammar',
  'School',
  ')',
  'The',
  'pale',
  'Usher',
  '--',
  'threadbare',
  'in',
  'coat',
  ',',
  'heart',
  ',',
  'body',
  ',',
  'and',
  'brain',
  ';',
  'I',
  'see',
  'him',
  'now',
  '.']]

n-gram LM for a sentence $S = a\,b\,c\,d \ldots$ (the formulas below are written for 4-grams):

Probability: 
- $p(d | abc ) = \frac{\mbox{count}(abcd)} { \mbox{count}(abc)}$

Self-Information:
- $I(d | abc ) = \log_2\left(\frac{\mbox{count}(abc)}{\mbox{count}(abcd)}\right)$

Entropy of $S$:
- $H(S) = \frac{1}{n} \sum_{abcd \in S} I(d | abc)$, where $n$ is the number of n-grams in $S$

Perplexity of $S$:
- $PP(S) = 2^{H(S)}$



In [4]:
import numpy as np

#############################################
# compute n-grams dictionary for source text

N = 2        # length of n-gram

# data: list of sentences 
def nGramCount(data, N=2):
    """Count n-grams in a corpus, grouped by their (N-1)-gram context.

    Every sentence is padded with ``N-1`` copies of the boundary symbol
    ``"///"`` at the start and one at the end. N-grams are built by joining
    tokens with single spaces and lower-casing the result.

    Parameters
    ----------
    data : iterable of token sequences
        One sequence of string tokens per sentence. Sentences are copied
        before padding, so the caller's data is never modified.
    N : int, default 2
        Length of the n-grams; must be at least 1.

    Returns
    -------
    dict
        ``{context: {ngram: count}}`` where ``context`` is the first ``N-1``
        tokens of ``ngram`` (the empty string when ``N == 1``).

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer, got {!r}".format(N))

    boundary = "///"
    gramsC = {}  # context -> {n-gram -> count}
    for seg in data:
        # Build the padded sentence in one step; list(seg) makes the copy and
        # lets tuples or other iterables of tokens be used as sentences.
        itm = [boundary] * (N - 1) + list(seg) + [boundary]

        for i in range(len(itm) - N + 1):
            # (N-1)-gram context (a) and full n-gram (ab)
            a = ' '.join(itm[i:i+N-1]).lower()
            ab = ' '.join(itm[i:i+N]).lower()
            followers = gramsC.setdefault(a, {})
            followers[ab] = followers.get(ab, 0) + 1

    return gramsC


# compute information of dictionary n-grams 
# I(b | a ) == log2(count(a) / count(ab))
def nGramInfo(gramsC) :
    """Turn n-gram counts into self-information values (in bits).

    For every context ``a`` and every n-gram ``ab`` counted under it, the
    result stores ``log2(count(a) / count(ab))``, where ``count(a)`` is the
    sum of all counts recorded under ``a``.

    An extra entry ``"|||OOV|||"`` is added for unseen n-grams. Its value is
    one bit more than the largest information value found (or 1 when no
    value exceeds zero).

    Parameters
    ----------
    gramsC : dict
        ``{context: {ngram: count}}`` as produced by ``nGramCount``.

    Returns
    -------
    dict
        ``{ngram: information}`` plus the ``"|||OOV|||"`` entry.
    """
    info = {}
    largest = 0  # highest information value seen so far
    for context, followers in gramsC.items():
        # total number of times this context was observed
        context_total = float(sum(followers.values()))
        for gram, freq in followers.items():
            # log2(total/freq) is the same as -log2(freq/total)
            bits = np.log2(context_total / freq)
            info[gram] = bits
            if bits > largest:
                largest = bits

    # unseen n-grams are charged one bit more than the rarest seen n-gram
    info["|||OOV|||"] = largest + 1
    return info


# return the list of self-information values, one per n-gram of the sentence
def wordInfo(seg, nGrams, N=2, verbose = False):
    """Look up the self-information of every n-gram in a sentence.

    The sentence is padded with ``N-1`` leading and one trailing ``"///"``
    boundary symbols. Each n-gram is joined with spaces and lower-cased before
    it is looked up. N-grams missing from the model receive the model's
    ``"|||OOV|||"`` value.

    Parameters
    ----------
    seg : sequence of str
        Tokens of one sentence; it is copied, not modified.
    nGrams : dict
        ``{ngram: information}`` including an ``"|||OOV|||"`` entry.
    N : int, default 2
        N-gram length; it should match the length used to build ``nGrams``.
    verbose : bool, default False
        When true, print each n-gram with its information value, prefixed by
        ``nGram`` (known) or ``nGOOV`` (out of vocabulary).

    Returns
    -------
    list
        One information value per n-gram, in sentence order.

    Raises
    ------
    KeyError
        If an n-gram is unknown and ``nGrams`` has no ``"|||OOV|||"`` entry.
    """
    boundary = "///"
    # adjust the number of start symbols to the n-gram length N
    itm = [boundary] * (N - 1) + list(seg) + [boundary]

    inf = []  # information value per n-gram
    for i in range(len(itm) - N + 1):
        ab = ' '.join(itm[i:i+N]).lower()

        # Explicit membership test instead of a bare except, so that only a
        # genuinely unknown n-gram is treated as out-of-vocabulary and
        # unrelated errors are no longer silently swallowed.
        if ab in nGrams:
            label, value = "nGram", nGrams[ab]
        else:
            label, value = "nGOOV", nGrams["|||OOV|||"]

        # float() keeps the precision format spec valid for integer values
        if(verbose) : print("{}: {:10}\t{:4.4}".format(label, ab, float(value)))
        inf.append(value)
    return inf

# compute perplexity from list of word information
def perplexity(info):
    """Return 2 raised to the mean of the given information values.

    ``info`` is a sized collection of per-n-gram information values in bits,
    such as the list returned by ``wordInfo``.
    """
    n_terms = float(len(info))
    mean_bits = sum(info) / n_terms
    return pow(2, mean_bits)

# count the number of occurrences of context g in the count dictionary LM
def countOcc(g, LM):
    """Sum all n-gram counts stored under context ``g`` in ``LM``.

    ``LM`` is a ``{context: {ngram: count}}`` dictionary; a ``KeyError`` is
    raised when ``g`` is not one of its contexts.
    """
    followers = LM[g]
    total = 0
    for freq in followers.values():
        total += freq
    return total


In [6]:
## make a 2-gram language model
# store the n-gram counts in a dictionary: context -> {n-gram -> count}

text1_count2 = nGramCount(text1_sent)

# compute information values from counts
text1_info2 = nGramInfo(text1_count2)


# countOcc(w, ...) sums the counts of all bigrams that start with w,
# i.e. how often w was seen as a context.
print("count: f('pains' | *):   \t", countOcc('pains',text1_count2))
print("count: f('more'  | *):   \t", countOcc('more', text1_count2))
print("count: f('pains' | 'more'):\t", text1_count2['more']['more pains'])
print("info:  I('pains' | 'more'):\t", text1_info2['more pains'])

print("\ncount: f('much' | *):    \t", countOcc('much', text1_count2))
print("count: f('more' | 'much'):\t", text1_count2['much']['much more'])
print("info:  I('more' | 'much'):\t", text1_info2['much more'])


count: f('pains' | *):   	 15
count: f('more'  | *):   	 508
count: f('pains' | 'more'):	 1
info:  I('pains' | 'more'):	 8.988684686772165

count: f('much' | *):    	 223
count: f('more' | 'much'):	 14
info:  I('more' | 'much'):	 3.9935449778627006


In [7]:
## make a 3-gram information dictionary p(c | ab)

# contexts are now two-word strings such as 'much the'
text1_count3 = nGramCount(text1_sent, N=3)
text1_info3 = nGramInfo(text1_count3)

print("count: f('much the' | *):     \t", countOcc('much the', text1_count3))
print("count: f('more' | 'much the'):\t", text1_count3['much the']['much the more'])
print("count: f('speed'| 'much the'):\t", text1_count3['much the']['much the speed'])

print("\ninfo:  I('more' | 'much the'):\t", text1_info3['much the more'])
print("info:  I('speed'| 'much the'):\t", text1_info3['much the speed'])

count: f('much the' | *):     	 20
count: f('more' | 'much the'):	 9
count: f('speed'| 'much the'):	 1

info:  I('more' | 'much the'):	 1.15200309344505
info:  I('speed'| 'much the'):	 4.321928094887363


In [8]:
## make a 4-gram information dictionary p(d | abc)

# contexts are now three-word strings such as 'much the more'
text1_count4 = nGramCount(text1_sent, N=4)
text1_info4 = nGramInfo(text1_count4)

print("count: f('much the more' | *):  \t", countOcc('much the more', text1_count4))
print("count: f('a' | 'much the more'):\t", text1_count4['much the more']['much the more a'])
print("count: f('pains' | 'much the more'):\t", text1_count4['much the more']['much the more pains'])

print("\n")
print("info:  I('a' | 'much the more'):\t",  text1_info4['much the more a'])
print("info:  I('pains' | 'much the more'):\t", text1_info4['much the more pains'])


count: f('much the more' | *):  	 9
count: f('a' | 'much the more'):	 2
count: f('pains' | 'much the more'):	 1


info:  I('a' | 'much the more'):	 2.169925001442312
info:  I('pains' | 'much the more'):	 3.169925001442312


## Perplexity

In [9]:
# perplexity per sentence (2-gram LM)
# NOTE: text1_sent[1] is part of the text the LM was built from, so every
# n-gram of this sentence is known to the model.
sent1_info2 = wordInfo(text1_sent[1], text1_info2, N=2)
perplexity(sent1_info2)

51.693482447884485

In [15]:
# same sentence scored with the 3-gram LM; N must match the model's n-gram length
sent1_info3 = wordInfo(text1_sent[1], text1_info3, N=3)
perplexity(sent1_info3)

4.165996948635516

In [16]:
# same sentence scored with the 4-gram LM; N must match the model's n-gram length
sent1_info4 = wordInfo(text1_sent[1], text1_info4, N=4)
perplexity(sent1_info4)

1.541746051768873

## Michelle

In [17]:
michelle = """Hi! I'm Michelle and I'm 22.
I really,really like this guy. He's 27 and everything  I like in a guy. We have so much in common.
We met around three and a half months ago. A week after we met, he texted me and we didn't stop talking for a whole month and a half. We talked day and night, sometimes 'til four in the morning.
Then, he started ignoring me. When that started to  happen, a red flag went up in my head, so I started ignoring him, too. Except I started missing him.
Before I started a new semester, I asked him what was the point of saving my number if he wasn't going to ask me out. (Yes, we haven't gone out on a date yet. We've talked about it, but he doesn't make it happen.)
I told him I wasn't going to have enough time for him, and if he really wanted to go out with me, he should make it happen soon rather than later.
I just don't understand why he hasn't asked me out yet. He gives me the money excuse, or the "every time I want to, something else comes up" excuse.
If he wants to see me he should've done so already... right?"""


In [10]:
# compute perplexity of a michelle sentence with the non-stemmed 2-gram LM
# NOTE(review): the recorded output of this cell is a NameError for
# preprocessText, so it was presumably executed before the cell that defines
# that function; re-run the notebook top to bottom to refresh the output.

# pre-process (sentence/tokenize) michelle text
michelle_sent = preprocessText(michelle, tag=False)
print("Michelle[1]:\t", michelle_sent[1], "\n")

# look up the information value of every 2-gram (wordInfo defaults to N=2)
michelle_info = wordInfo(michelle_sent[1], text1_info2, verbose=True)

print("\nInformation:\t", michelle_info, "\n\nperplexity:", perplexity(michelle_info))

NameError: name 'preprocessText' is not defined

In [20]:
# NOTE(review): the star import hides which names are needed; this cell only
# uses PorterStemmer, and `re` is not used in this cell.
from nltk.stem.porter import *
import re

# instantiate the stemmer
porter = PorterStemmer()

# Stem every token of every sentence of text1 with the Porter stemmer
text1_porter = [[porter.stem(word) for word in sent] for sent in text1_sent]
# preview the first stemmed sentence
text1_porter[0]



['[', 'mobi', 'dick', 'by', 'herman', 'melvil', '1851', ']', 'etymolog', '.']

In [22]:
# generate a 2-gram LM from the Porter-stemmed text1
text1_porter_count2 = nGramCount(text1_porter)
text1_porter_info2 = nGramInfo(text1_porter_count2)

print("unstemmed")
print("count: f('pains' | *):    \t", countOcc('pains', text1_count2))
print("count: f('pain' | *):    \t", countOcc('pain', text1_count2))
print("count: f('more' | *):    \t", countOcc('more', text1_count2))
print("count: f('pains' | 'more'):\t", text1_count2['more']['more pains'])
# NOTE(review): labelled "probs: p(...)" but the value printed is the
# self-information I('pains' | 'more') in bits, not a probability.
print("probs: p('pains' | 'more'):\t", text1_info2['more pains'])

print("\nporter")
print("count: f('pain' | *):    \t", countOcc('pain', text1_porter_count2))
print("count: f('more' | *):    \t", countOcc('more', text1_porter_count2))
print("count: f('pain' | 'more'):\t", text1_porter_count2['more']['more pain'])
# NOTE(review): labelled "count: f(...)" but the value printed is the
# self-information I('pain' | 'more') from text1_porter_info2.
print("count: f('pain' | 'more'):\t", text1_porter_info2['more pain'])


unstemmed
count: f('pains' | *):    	 15
count: f('pain' | *):    	 2
count: f('more' | *):    	 508
count: f('pains' | 'more'):	 1
probs: p('pains' | 'more'):	 8.988684686772165

porter
count: f('pain' | *):    	 25
count: f('more' | *):    	 508
count: f('pain' | 'more'):	 1
count: f('pain' | 'more'):	 8.988684686772165


In [24]:
# Compare the 2-gram perplexity of the same sentence with and without stemming.
# NOTE: despite the "_probs2" names, both lists hold information values in
# bits as returned by wordInfo.
sent1_probs2        = wordInfo(text1_sent[1], text1_info2)
sent1_porter_probs2 = wordInfo(text1_porter[1], text1_porter_info2)

print("perplexity:", perplexity(sent1_probs2), 
      "\tporter PP:", perplexity(sent1_porter_probs2))



perplexity: 51.693482447884485 	porter PP: 56.325385841015915


In [25]:
# Michelle with the Porter stemmer
# stem every token of every michelle sentence, mirroring text1_porter
michelle_porter = [[porter.stem(word) for word in sent] for sent in michelle_sent]
print("Michelle[1]:\t", michelle_sent[1], "\nPorter[1]:\t", michelle_porter[1])

# score the stemmed sentence against the stemmed 2-gram LM (wordInfo defaults to N=2)
michelle_porter_info = wordInfo(michelle_porter[1], text1_porter_info2, verbose=True)
print("\nInformation:\t", michelle_porter_info, "\n\nPerplexity:", perplexity(michelle_porter_info))


Michelle[1]:	 ['I', "'m", 'Michelle', 'and', 'I', "'m", '22', '.'] 
Porter[1]:	 ['I', "'m", 'michel', 'and', 'I', "'m", '22', '.']
nGram: /// i     	4.674
nGOOV: i 'm      	15.23
nGOOV: 'm michel 	15.23
nGOOV: michel and	15.23
nGram: and i     	6.176
nGOOV: i 'm      	15.23
nGOOV: 'm 22     	15.23
nGOOV: 22 .      	15.23
nGram: . ///     	0.2617

Information:	 [4.67401206866744, 15.23099611713346, 15.23099611713346, 15.23099611713346, 6.176439322543313, 15.23099611713346, 15.23099611713346, 15.23099611713346, 0.2617297369843345] 

Perplexity: 2681.2391764281415


# Tasks 
Prepare the LM in different ways and test each variant on the Michelle text:
- use your own stemmer
- use NER
- add more data (text2 ... text9) to the LM
- apply each LM to the Michelle text

Compare the perplexity scores.


In [1]:
import import_ipynb


In [7]:
from stemming import * #myStemmer2(wrd, pos)

In [8]:
#from NE import *

In [9]:
# number of sentences in the michelle text
# NOTE(review): the recorded output is a NameError, so michelle_sent was not
# defined in the kernel when this cell ran; run the Michelle cells first.
print(len(michelle_sent))

NameError: name 'michelle_sent' is not defined