Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
72 lines (54 sloc) 2.95 KB
=========================
Treebank Tagged Sentences
=========================

# Show the first sentence of NLTK's treebank corpus as (word, tag) pairs.

>>> from nltk.corpus import treebank
>>> treebank.tagged_sents()[0]
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]

# Load the pickled tagger that tag._POS_TAGGER points to (presumably NLTK's
# default POS tagger -- confirm) and run it on the untagged tokens of that
# same sentence; here its output matches the corpus tags shown above.

>>> import nltk.data
>>> from nltk import tag
>>> tagger = nltk.data.load(tag._POS_TAGGER)
>>> sent = treebank.sents()[0]
>>> sent
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
>>> tagger.tag(sent)
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
>>> tagger.evaluate(treebank.tagged_sents())
0.9956891414041082

# NOTE: this score is "self-accuracy" -- presumably the tagger was trained on
# this same treebank data (confirm) -- so it doesn't reflect real-world
# performance.

# simplify_tags=True returns the same sentences with a coarser tagset
# (compare 'NNP' -> 'NP', 'CD' -> 'NUM', 'NNS' -> 'N' in the output below).

>>> simple_sents = treebank.tagged_sents(simplify_tags=True)
>>> simple_sents[0]
[('Pierre', 'NP'), ('Vinken', 'NP'), (',', ','), ('61', 'NUM'), ('years', 'N'), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), ('the', 'DET'), ('board', 'N'), ('as', 'P'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM'), ('.', '.')]
===================================
Training Sequential Backoff Taggers
===================================

# Baseline: a DefaultTagger that is constructed with the single tag 'N'.
# Its score shows how far that one tag alone gets on simple_sents.

>>> default = tag.DefaultTagger('N')
>>> default.evaluate(simple_sents)
0.19083992212642537

# Train a unigram tagger on simple_sents, with the default tagger as its
# backoff.

>>> u = tag.UnigramTagger(simple_sents, backoff=default)
>>> u.tag(sent)
[('Pierre', 'NP'), ('Vinken', 'NP'), (',', ','), ('61', 'NUM'), ('years', 'N'), ('old', 'ADJ'), (',', ','), ('will', 'MOD'), ('join', 'V'), ('the', 'DET'), ('board', 'N'), ('as', 'P'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'N'), ('Nov.', 'NP'), ('29', 'NUM'), ('.', '.')]
>>> u.evaluate(simple_sents)
0.9656621240414796

# NOTE: these scores are self-accuracy -- each tagger is evaluated on the same
# simple_sents it was trained on -- so they won't reflect real-world
# performance. For meaningful accuracy measurements, split simple_sents into
# separate training and testing lists.

# Extend the chain: the bigram tagger backs off to the unigram tagger, and the
# trigram tagger backs off to the bigram tagger.

>>> ub = tag.BigramTagger(simple_sents, backoff=u)
>>> ub.evaluate(simple_sents)
0.9861635345067344
>>> ubt = tag.TrigramTagger(simple_sents, backoff=ub)
>>> ubt.evaluate(simple_sents)
0.991914656919226
===============
Saving a Tagger
===============
>>> import os, os.path, pickle
>>> path = os.path.expanduser('~/nltk_data/taggers/')
>>> os.makedirs(path)
>>> f = open(os.path.join(path, 'ubt.pickle'), 'wb')
>>> pickle.dump(ubt, f)
==================================
Training a Classifier Based Tagger
==================================

# Train a ClassifierBasedPOSTagger on simple_sents and score it against those
# same sentences (so this is again a self-accuracy figure).

>>> c = tag.ClassifierBasedPOSTagger(train=simple_sents)
>>> c.evaluate(simple_sents)
0.9773530930907068

# NOTE: a classifier-based tagger is also much slower than the sequential
# backoff taggers.