In [1]:
# Classification tasks with different types of features
>>> import nltk

In [2]:
# classify part of speech based on sentence context
>>> from nltk.corpus import brown

In [3]:
# define features for the "i"th word in the sentence, including three types of suffix 
#     and one pre-word
# the pos features function takes the sentence of untagged words and the index of a word i
#   it creates features for word i, including the previous word i-1
def pos_features(sentence, i):
    """Return the feature dict used to tag the word at index ``i``.

    Args:
        sentence: sequence of untagged word strings.
        i: index of the word to describe.

    Returns:
        dict with the last one, two and three characters of the word
        ("suffix(1)" .. "suffix(3)") and the preceding word ("prev-word").
        The first word of a sentence gets the marker "<START>" instead of a
        preceding word.
    """
    word = sentence[i]
    # No word precedes position 0, so use an explicit start-of-sentence marker.
    previous = "<START>" if i == 0 else sentence[i - 1]
    return {
        "suffix(1)": word[-1:],
        "suffix(2)": word[-2:],
        "suffix(3)": word[-3:],
        "prev-word": previous,
    }

In [5]:
# pick one word of one sentence to use as a worked example for pos_features
# sentence0 is element 0 of brown.sents() (called with no category filter)
sentence0 = brown.sents()[0]
print(sentence0)
# token at index 8 of sentence 0; as the last expression of the cell it is displayed
sentence0[8]

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


'investigation'

In [6]:
# POS features of the token at index 8 of sentence0; its "prev-word" is sentence0[7]
pos_features(sentence0, 8)

{'prev-word': 'an', 'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion'}

In [11]:
# get the POS-tagged sentences of the Brown corpus restricted to the 'news' category
tagged_sents = brown.tagged_sents(categories='news')
# keep the first tagged sentence as a small example; it prints as a list of (word, tag) pairs
tag_sent0 = tagged_sents[0]
print(tag_sent0)

[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]


In [12]:
# nltk.tag.untag strips the tags from a tagged sentence, leaving only the list of words
print(nltk.tag.untag(tag_sent0))

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [9]:
# Python's enumerate pairs each item of a list with its index number;
# here each item is itself a (word, tag) pair, unpacked in the loop header
for i,(word,tag) in enumerate(tag_sent0):
    print (i, word, tag)

0 The AT
1 Fulton NP-TL
2 County NN-TL
3 Grand JJ-TL
4 Jury NN-TL
5 said VBD
6 Friday NR
7 an AT
8 investigation NN
9 of IN
10 Atlanta's NP$
11 recent JJ
12 primary NN
13 election NN
14 produced VBD
15 `` ``
16 no AT
17 evidence NN
18 '' ''
19 that CS
20 any DTI
21 irregularities NNS
22 took VBD
23 place NN
24 . .


In [13]:
# get feature sets of words appearing in the corpus, from untagged sentences.
# and then get their tags from corresponding tagged sentence
# use the Python function enumerate to pair the index numbers with sentence words 
#   for the pos features function
featuresets = []
for tagged_sent in tagged_sents:
    # pos_features works on plain words, so strip the tags once per sentence
    words = nltk.tag.untag(tagged_sent)
    # pair the features of the word at each position with the tag found at that position
    featuresets.extend(
        (pos_features(words, position), tag)
        for position, (_word, tag) in enumerate(tagged_sent)
    )

In [14]:
# look at the first 10 entries of featuresets; each is a (feature dict, tag) pair
for f in featuresets[:10]:
	print (f)

({'suffix(1)': 'e', 'suffix(2)': 'he', 'suffix(3)': 'The', 'prev-word': '<START>'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ton', 'prev-word': 'The'}, 'NP-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ty', 'suffix(3)': 'nty', 'prev-word': 'Fulton'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'nd', 'suffix(3)': 'and', 'prev-word': 'County'}, 'JJ-TL')
({'suffix(1)': 'y', 'suffix(2)': 'ry', 'suffix(3)': 'ury', 'prev-word': 'Grand'}, 'NN-TL')
({'suffix(1)': 'd', 'suffix(2)': 'id', 'suffix(3)': 'aid', 'prev-word': 'Jury'}, 'VBD')
({'suffix(1)': 'y', 'suffix(2)': 'ay', 'suffix(3)': 'day', 'prev-word': 'said'}, 'NR')
({'suffix(1)': 'n', 'suffix(2)': 'an', 'suffix(3)': 'an', 'prev-word': 'Friday'}, 'AT')
({'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}, 'NN')
({'suffix(1)': 'f', 'suffix(2)': 'of', 'suffix(3)': 'of', 'prev-word': 'investigation'}, 'IN')


In [15]:
# use a naive Bayes classifier
# split the data 90%/10%: the first 10% of featuresets is held out as the test set
# and the remaining 90% is used for training
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
# sanity check: the two sizes should add up to len(featuresets)
print(len(train_set))
print(len(test_set))

90499
10055


In [16]:
# train a naive Bayes classifier on the training set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy on the held-out test set (this will take a little while);
# as the last expression of the cell, the value is displayed
nltk.classify.accuracy(classifier, test_set)
# the result is reasonable for POS features without the previous tag

0.7891596220785678

In [20]:
### sentence segmentation
# load the treebank_raw corpus as a sequence of sentences, each a list of tokens
sents = nltk.corpus.treebank_raw.sents()
# a bare expression in the middle of a cell is evaluated but never displayed,
# so print the sentence count explicitly
print(len(sents))
# show the first few sentences
for sent in sents[:7]:
    print(sent)

['.', 'START']
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.']
['Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START']
['Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.']
['.', 'START']
['A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.']


In [21]:
# Flatten the corpus sentences into one token list and record, for every
# sentence, the index (into that flat list) of its final token.
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens += sent
    # offset is the number of tokens collected so far, so the sentence that
    # was just added ends at index offset - 1
    offset = len(tokens)
    boundaries.add(offset - 1)

In [22]:
# inspect the flattened tokens and the recorded sentence-final indexes
print(tokens[:40])
print(len(boundaries))
# spot-check a few individual indexes
for position in (0, 1, 19, 20):
    print(position in boundaries)
# list the first 40 tokens together with whether each one ends a sentence
for num, tok in enumerate(tokens[:40]):
    print(num, tok, '\t', num in boundaries)

['.', 'START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', '.', 'START', 'Rudolph']
4193
False
True
False
True
0 . 	 False
1 START 	 True
2 Pierre 	 False
3 Vinken 	 False
4 , 	 False
5 61 	 False
6 years 	 False
7 old 	 False
8 , 	 False
9 will 	 False
10 join 	 False
11 the 	 False
12 board 	 False
13 as 	 False
14 a 	 False
15 nonexecutive 	 False
16 director 	 False
17 Nov 	 False
18 . 	 False
19 29 	 False
20 . 	 True
21 Mr 	 False
22 . 	 False
23 Vinken 	 False
24 is 	 False
25 chairman 	 False
26 of 	 False
27 Elsevier 	 False
28 N 	 False
29 . 	 False
30 V 	 False
31 ., 	 False
32 the 	 False
33 Dutch 	 False
34 publishing 	 False
35 group 	 False
36 . 	 True
37 . 	 False
38 START 	 True
39 Rudolph 	 False


In [23]:
# feature extraction function
# tokens is a list of words; we get the features of the token at offset i
def punct_features(tokens, i):
    """Return the features describing the punctuation token at index ``i``.

    Args:
        tokens: list of token strings.
        i: non-negative index of the candidate punctuation token.

    Returns:
        dict with four features:
          'next-word-capitalized': whether the following token starts with an
              uppercase character (False when there is no following token),
          'prevword': the preceding token, lower-cased ('' when there is none),
          'punct': the token at index ``i`` itself,
          'prev-word-is-one-char': whether the preceding token has length 1.

    Raises:
        IndexError: if ``i`` itself is outside ``tokens``.
    """
    # At i == 0 there is no preceding token; tokens[i-1] would silently wrap
    # around to the LAST token of the list, so substitute an empty string.
    prev_token = tokens[i - 1] if i != 0 else ''
    # At the final position there is no following token; tokens[i+1] would
    # raise IndexError, so substitute an empty string there as well.
    next_token = tokens[i + 1] if i + 1 < len(tokens) else ''
    # next_token[:1] (rather than [0]) also tolerates an empty token.
    return {'next-word-capitalized': next_token[:1].isupper(),
            'prevword': prev_token.lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(prev_token) == 1}

In [25]:
# feature dictionary for the token at index 20 (printed first to confirm it is a period)
print(tokens[20])
punct_features(tokens,20)

.


{'next-word-capitalized': True,
 'prev-word-is-one-char': False,
 'prevword': '29',
 'punct': '.'}

In [26]:
# Build the labelled feature sets for every candidate punctuation token.
# The first and last positions are excluded, so each candidate has a token on both sides.
# Note: `in '.?!'` is a substring test against the string '.?!', so it accepts
# the single characters '.', '?' and '!' as well as any other substring of it.
candidate_positions = [i for i in range(1, len(tokens) - 1)
                       if tokens[i] in '.?!']
# The label is True when the candidate is the last token of a sentence.
Sfeaturesets = [(punct_features(tokens, i), (i in boundaries))
                for i in candidate_positions]

In [27]:
# look at the first 10 entries of Sfeaturesets; each is a (feature dict, is-boundary) pair
for sf in Sfeaturesets[:10]:
	print (sf)

({'next-word-capitalized': False, 'prevword': 'nov', 'punct': '.', 'prev-word-is-one-char': False}, False)
({'next-word-capitalized': True, 'prevword': '29', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword': 'mr', 'punct': '.', 'prev-word-is-one-char': False}, False)
({'next-word-capitalized': True, 'prevword': 'n', 'punct': '.', 'prev-word-is-one-char': True}, False)
({'next-word-capitalized': False, 'prevword': 'group', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)
({'next-word-capitalized': False, 'prevword': 'conglomerate', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword': '.', 'punct': '.', 'prev-word-is-one-char': True}, False)
({'next-word-capitalized': True, 'prevword': 'reported', 'punct': '.', 'prev-word-is-one-char': False}, True)
({'next-word-capitalized': True, 'prevword

In [28]:
# separate into training and test sets with a 90/10 split:
# size is the number of feature sets (10%) that will be held out for testing
# NOTE: this rebinds the `size` variable used for the POS split earlier
size = int(len(Sfeaturesets) * 0.1)
size

594

In [29]:
# hold out the first `size` feature sets for testing and train on the rest
Strain_set, Stest_set = Sfeaturesets[size:], Sfeaturesets[:size]
# train the sentence-boundary classifier (naive Bayes)
Sclassifier = nltk.NaiveBayesClassifier.train(Strain_set)
# accuracy on the held-out set; displayed as the cell's last expression
nltk.classify.accuracy(Sclassifier, Stest_set)

0.936026936026936

In [30]:
# define function to use the trained classifier to label sentences
def segment_sentences(words):
    """Split a flat list of tokens into sentences using the trained classifier.

    Relies on two notebook-level names: ``Sclassifier`` (the trained
    sentence-boundary classifier) and ``punct_features`` (its feature
    extractor).

    Args:
        words: list of token strings.

    Returns:
        list of sentences, each a slice of ``words``. Every token of ``words``
        ends up in exactly one sentence; an empty input gives an empty list.
    """
    sents = []
    start = 0
    last = len(words) - 1
    for i, word in enumerate(words):
        # The final token is never sent to the classifier: punct_features
        # looks at words[i+1], which does not exist there, and the result would
        # be the same either way because the remaining tail is flushed below.
        # The classifier's labels are the booleans built from `i in boundaries`,
        # so the label can be used directly as the condition.
        if (i < last and word in '.?!'
                and Sclassifier.classify(punct_features(words, i))):
            sents.append(words[start:i+1])
            start = i+1
    # whatever follows the last detected boundary forms the final sentence
    if start < len(words):
        sents.append(words[start:])
    return sents

In [31]:
# try it out on a subset of the tokens from the treebank
# first show how many tokens there are in total and what the first 50 look like
print(len(tokens))
print(tokens[:50])

# segment only the first 1000 tokens and print each predicted sentence on its own line
tinytokens = tokens[:1000]
for s in segment_sentences(tinytokens):
    print (s)

101797
['.', 'START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', '.', 'START', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated']
['.']
['START', 'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.']
['.', 'START', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', '.', 'START', 'A', 'form', 'of', 'asbestos', 'once', 'used', '

In [32]:
# compare to the NLTK default sentence tokenizer, which works on raw text instead of tokens
from nltk.tokenize import sent_tokenize

In [33]:
# this sentence segmenter starts with raw text, instead of tokens
rawtext = 'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.  Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.'
# NOTE(review): the tokenizer is reached through the nltk package here; the
# `sent_tokenize` name imported in the previous cell is not used by this cell.
# NOTE: `sents` previously held the treebank_raw sentences and is rebound here.
sents = nltk.sent_tokenize(rawtext)
sents

['Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.',
 'Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.']

In [34]:
## classify documents based on keywords
from nltk.corpus import movie_reviews
import random

In [35]:
# movie reviews are labeled either positive or negative (by human annotators);
# list the category names used by the corpus
movie_reviews.categories()

['neg', 'pos']

In [36]:
# Collect one (word list, category) pair per review file, category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # materialize the words of this review as a plain list
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))
len(documents)

2000

In [38]:
# shuffle the documents in place so both categories are mixed before splitting
# NOTE(review): no random seed is set in the visible cells, so the order (and
# everything derived from it) differs from run to run, and re-running this
# cell reshuffles the already shuffled list.
random.shuffle(documents)
# look at the first document - consists of a list of all the words in the review
# followed by the category
print(documents[0])

(['hey', ',', 'i', "'", 've', 'got', 'a', 'great', 'idea', 'for', 'a', 'movie', '!', 'ok', ',', 'here', 'it', 'is', ':', 'we', "'", 'll', 'get', 'tim', 'allen', 'to', 'pull', 'angry', 'faces', 'for', 'about', 'two', 'hours', 'or', 'so', '!', 'what', "'", 's', 'that', '?', 'sounds', 'too', 'boring', '?', 'ok', ',', 'how', 'about', 'this', 'then', ':', 'we', 'still', 'have', 'tim', 'allen', 'pull', 'faces', 'for', 'about', 'two', 'hours', ',', 'but', 'half', 'of', 'them', 'are', 'angry', 'looks', 'and', 'half', 'of', 'them', 'are', '"', 'i', "'", 'm', 'in', 'pain', '!', '"', 'looks', '!', 'what', "'", 's', 'that', '?', 'that', 'still', 'doesn', "'", 't', 'sound', 'funny', 'to', 'you', '?', 'how', 'about', 'if', 'we', 'through', 'in', 'some', '"', 'ewwwww', '!', '"', 'looks', '?', 'still', 'not', 'funny', '?', 'if', 'you', 'answered', '"', 'no', ',', 'that', 'doesn', "'", 't', 'sound', 'funny', '"', ',', 'then', '(', 'ding', 'ding', 'ding', '!', ')', 'you', "'", 're', 'absolutely', 'corre

In [39]:
## use words from all documents to define the word vector for features
# lower-case every word of movie_reviews and count occurrences in a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# number of distinct (lower-cased) words
len(all_words)

39768

In [41]:
# get the 2000 most frequently appearing words in the corpus as (word, frequency) pairs
word_items = all_words.most_common(2000)
word_features = [word for (word, freq) in word_items]   # keep only the words, drop the counts

# look at the first 100 words (the output shows they include punctuation and function words)
print(word_features[:100])

[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']


In [42]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
    """Describe a document by which of the given keywords it contains.

    Args:
        document: iterable of the words in the document.
        word_features: iterable of keywords to test for.

    Returns:
        dict mapping 'contains(<keyword>)' to True or False, one entry per
        keyword, in the order the keywords are given.
    """
    # a set makes each membership test cheap regardless of document length
    vocabulary = set(document)
    return {'contains(%s)' % keyword: (keyword in vocabulary)
            for keyword in word_features}

In [43]:
# build one (keyword feature dict, category) pair per document
# NOTE: this rebinds `featuresets`, which earlier held the POS-tagging feature sets
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]

# each feature dict has one entry per keyword (2000 entries), so displaying one is optional
featuresets[0]

({'contains(,)': True,
  'contains(the)': True,
  'contains(.)': True,
  'contains(a)': True,
  'contains(and)': True,
  'contains(of)': True,
  'contains(to)': True,
  "contains(')": True,
  'contains(is)': True,
  'contains(in)': True,
  'contains(s)': True,
  'contains(")': True,
  'contains(it)': True,
  'contains(that)': True,
  'contains(-)': True,
  'contains())': True,
  'contains(()': True,
  'contains(as)': True,
  'contains(with)': False,
  'contains(for)': True,
  'contains(his)': True,
  'contains(this)': True,
  'contains(film)': True,
  'contains(i)': True,
  'contains(he)': False,
  'contains(but)': True,
  'contains(on)': True,
  'contains(are)': True,
  'contains(t)': True,
  'contains(by)': True,
  'contains(be)': True,
  'contains(one)': True,
  'contains(movie)': True,
  'contains(an)': True,
  'contains(who)': True,
  'contains(not)': True,
  'contains(you)': True,
  'contains(from)': True,
  'contains(at)': True,
  'contains(was)': True,
  'contains(have)': True,

In [45]:
# train a naive Bayes classifier with a 90/10 split
# The held-out size is derived from the data (10% of the feature sets) rather
# than hard-coded, in the same way as the POS and sentence-segmentation splits;
# for the 2000 documents loaded above this is the same 200 as before.
test_size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[test_size:], featuresets[:test_size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# evaluate the accuracy of the classifier on the held-out documents
print (nltk.classify.accuracy(classifier, test_set))
# the accuracy result may vary since we randomized the documents

0.815


In [46]:
# show the 30 most informative features of the classifier; each output row gives
# a feature value and the ratio between the two categories for that value
classifier.show_most_informative_features(30)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.8 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.1 : 1.0
         contains(mulan) = True              pos : neg    =      7.6 : 1.0
        contains(seagal) = True              neg : pos    =      7.5 : 1.0
          contains(lame) = True              neg : pos    =      6.0 : 1.0
        contains(poorly) = True              neg : pos    =      5.6 : 1.0
           contains(era) = True              pos : neg    =      5.5 : 1.0
        contains(wasted) = True              neg : pos    =      5.2 : 1.0
         contains(awful) = True              neg : pos    =      5.0 : 1.0
          contains(jedi) = True              pos : neg    =      5.0 : 1.0
         contains(damon) = True              pos : neg    =      5.0 : 1.0
    contains(ridiculous) = True              neg : pos    =      4.6 : 1.0
         contains(waste) = True              neg : pos    =      4.6 : 1.0