In [1]:
import pandas as pd
import numpy as np
import nltk
%matplotlib inline

In [2]:
text = nltk.tokenize.word_tokenize("Once upon time there was a hippo named Alfred.")
text

['Once', 'upon', 'time', 'there', 'was', 'a', 'hippo', 'named', 'Alfred', '.']

In [3]:
nltk.pos_tag(text)

[('Once', 'RB'),
 ('upon', 'IN'),
 ('time', 'NN'),
 ('there', 'EX'),
 ('was', 'VBD'),
 ('a', 'DT'),
 ('hippo', 'NN'),
 ('named', 'VBN'),
 ('Alfred', 'NNP'),
 ('.', '.')]

In [4]:
# tagger gets confused
text = nltk.tokenize.word_tokenize("Hey, can I have that can of olives?")
#text = nltk.tokenize.word_tokenize("Can I have that can of olives?")
nltk.pos_tag(text)

[('Hey', 'PRP'),
 (',', ','),
 ('can', 'MD'),
 ('I', 'PRP'),
 ('have', 'VB'),
 ('that', 'IN'),
 ('can', 'MD'),
 ('of', 'IN'),
 ('olives', 'NNS'),
 ('?', '.')]

In [5]:
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdit = nltk.FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will', 'butter']
for m in modals:
    print m + ':', fdit[m]

can: 94
could: 87
may: 93
might: 38
must: 53
will: 389
butter: 2


In [6]:
s = "Once upon a time there was a hippo named alfed".split()
list(nltk.bigrams(s))

[('Once', 'upon'),
 ('upon', 'a'),
 ('a', 'time'),
 ('time', 'there'),
 ('there', 'was'),
 ('was', 'a'),
 ('a', 'hippo'),
 ('hippo', 'named'),
 ('named', 'alfed')]

## some stuff from MAS and the nltk intro

In [7]:
brown.readme()

u'BROWN CORPUS\n\nA Standard Corpus of Present-Day Edited American\nEnglish, for use with Digital Computers.\n\nby W. N. Francis and H. Kucera (1964)\nDepartment of Linguistics, Brown University\nProvidence, Rhode Island, USA\n\nRevised 1971, Revised and Amplified 1979\n\nhttp://www.hit.uib.no/icame/brown/bcm.html\n\nDistributed with the permission of the copyright holder,\nredistribution permitted.\n'

In [8]:
from nltk.corpus import gutenberg
gutenberg.fileids()
words = gutenberg.words('melville-moby_dick.txt')
words

[u'[', u'Moby', u'Dick', u'by', u'Herman', u'Melville', ...]

In [9]:
mobydick = nltk.Text(words)
mobydick.collocations()

Sperm Whale; Moby Dick; White Whale; old man; Captain Ahab; sperm
whale; Right Whale; Captain Peleg; New Bedford; Cape Horn; cried Ahab;
years ago; lower jaw; never mind; Father Mapple; cried Stubb; chief
mate; white whale; ivory leg; one hand


In [10]:
freqs = mobydick.vocab()

In [11]:
probs = nltk.MLEProbDist(freqs)

In [12]:
probs.prob('whale')

0.003473673313677301

In [73]:
probs.prob('vanillafrog')

0.0

#### with smoothing:

In [14]:
probs_d = nltk.WittenBellProbDist(freqs, freqs.B()+1)

In [15]:
probs_d

<WittenBellProbDist based on 260819 samples>

In [16]:
# this made up word should have the same prob as words with word count 1 in the corpus
probs_d.prob('vanillafrog')

0.06895579290059115

#### try to make a bigram distr and use it to make a prediction

In [17]:
mobydick_bg = nltk.bigrams(words)

In [18]:
mobydick_bg_fr = nltk.FreqDist(mobydick_bg)

In [19]:
# for k, v in mobydick_bg_fr.items():
#     print k, v

In [20]:
mobydick_bg_fr[(u'white', u'whale')]

31

In [21]:
probs_bg = nltk.MLEProbDist(mobydick_bg_fr)

In [22]:
probs_bg.prob((u'white', u'whale'))

0.00011885682736620939

In [23]:
query = "I would like to buy one can of white".split()

In [24]:
matches = []
for bigram in nltk.bigrams(words):
    if bigram[0] == unicode(query[-1]):
        #print bigram, probs_bg.prob(bigram)
        matches.append((bigram, probs_bg.prob(bigram)))  
        #WHY does the same bigram (white, whale) appear svrl times, something is strange
        #ed: no makes sense

In [25]:
matches.sort(key=lambda x: x[1])

In [26]:
matches[-1]

((u'white', u'whale'), 0.00011885682736620939)

In [27]:
print "Most likely query: "
print " ".join(query) + " " + matches[-1][0][1]

Most likely query: 
I would like to buy one can of white whale


#### repeat the same step with a WittenBell distribution instead and try some other 'queries'

In [28]:
#...

####  try and do the same as above with custom rap corpus

a quick and crude parsing, full of garbage tokens like urls and stuff..

In [29]:
import csv
f = open('ohhla.csv')
freader = csv.reader(f)
s = str()

for ix, song in enumerate(freader):
    if ix == 0: continue
    s += ' '.join(song)
    
raptext = nltk.word_tokenize(s)
f.close()

In [30]:
rapcorpus = nltk.Text(raptext)

In [31]:
rap_bgf = nltk.FreqDist(nltk.bigrams(rapcorpus))

In [32]:
rap_bpr = nltk.WittenBellProbDist(rap_bgf, rap_bgf.B() + 1) #why B+1

In [33]:
rap_bgf[('a', 'war')]

6

In [50]:
rap_bpr.prob((('a', 'war')))

0.00016570465906266398

In [35]:
query = "I would like a".split()
matches = []
for bigram in nltk.bigrams(rapcorpus):
    if bigram[0] == unicode(query[-1]):
#         print bigram, probs_bg.prob(bigram)
        matches.append((bigram, rap_bpr.prob(bigram)))  

matches.sort(key=lambda x: x[1])
matches = sorted(set(matches), key=lambda x: x[1])

In [118]:
print "Most likely query: "
print ' '.join(query)
n = 1
for match in matches[:-6:-1]:
    print n ,':', match[0][1]
    n += 1

Most likely query: 
I would like a
1 : nigga
2 : soldier
3 : war
4 : little
5 : million


#### try with a trigram

In [36]:
rap_tgf = nltk.FreqDist(nltk.trigrams(rapcorpus))

In [40]:
rap_tgf[('like', 'a', 'million')]

1

In [41]:
rap_tpr = nltk.WittenBellProbDist(rap_tgf, rap_tgf.B() + 1) #why B+1

In [46]:
rap_tpr.prob(('like', 'a', 'million'))

2.427243379693682e-05

In [51]:
rap_tprML = nltk.MLEProbDist(rap_tgf)

#### Strange:

In [56]:
rap_tpr.prob(('like', 'a', 'millionZZZ'))

0.4579965533144008

In [55]:
rap_tprML.prob(('like', 'a', 'millionZZZ'))

0.0

In [67]:
query = "I would like a".split()
matches = []
for trigram in nltk.trigrams(rapcorpus):
    if trigram[0] == unicode(query[-2]) and trigram[1] == unicode(query[-1]):
        print trigram, probs_bg.prob(bigram)
        matches.append((trigram, rap_tpr.prob(trigram)))  

matches = sorted(set(matches), key=lambda x: x[1])

# print "Most likely query: "
# print ' '.join(query)
# n = 1
# for match in matches[:-6:-1]:
#     print n ,':', match[0][1]
#     n += 1

('like', 'a', 'nine') 0.0
('like', 'a', 'ref') 0.0
('like', 'a', 'Tec..') 0.0
('like', 'a', 'three') 0.0
('like', 'a', 'fiend') 0.0
('like', 'a', 'well-trained') 0.0
('like', 'a', 'pair') 0.0
('like', 'a', 'Predator') 0.0
('like', 'a', 'willow') 0.0
('like', 'a', 'Yokohama') 0.0
('like', 'a', 'moped') 0.0
('like', 'a', 'twenty') 0.0
('like', 'a', 'crab') 0.0
('like', 'a', '75') 0.0
('like', 'a', 'window') 0.0
('like', 'a', 'strainer') 0.0
('like', 'a', 'Kirby') 0.0
('like', 'a', 'rattle') 0.0
('like', 'a', 'million') 0.0
('like', 'a', 'cut') 0.0
('like', 'a', 'live') 0.0


####  ////////////////////////////

In [72]:
?nltk.NgramTagger