forked from elishowk/cablegate_semnet
-
Notifications
You must be signed in to change notification settings - Fork 0
/
buildtagger.py
142 lines (123 loc) · 5.11 KB
/
buildtagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Copyright (C) 2010 elishowk
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
__author__="Elias Showk"
# warning : nltk imports it's own copy of pyyaml
import nltk.corpus
import nltk.tag
import itertools
import string
#from nltk.tag import brill
from nltk import pos_tag
import cPickle as pickle
import logging
logging.basicConfig(level=logging.DEBUG, format="%(levelname)-8s %(message)s")
# Module-level logger. The methods below always referenced ``_logger`` but it
# was never defined anywhere in the file, so every exception/fallback path
# raised NameError instead of logging.
_logger = logging.getLogger(__name__)


class SequentialPosTagger():
    """
    Integration of nltk Part-Of-Speech taggers.

    The static posTag method uses nltk's default MaxentClassifier tagger
    (needs nltk data 'taggers/maxent_treebank_pos_tagger/english.pickle').
    Otherwise an instance of this class provides a self-trained composite
    (backoff-chained) tagger, following
    http://streamhacker.com/2010/04/12/pos-tag-nltk-brill-classifier/
    (needs nltk data 'corpus/brown' and 'corpus/conll2000').
    """
    # Default word patterns for the RegexpTagger used as the final backoff.
    # Fixes vs. the original list: the decimal point is escaped (an unescaped
    # '.' made e.g. "3a14" match as a number) and the duplicated '.*ic$'
    # entry is dropped (the second copy was unreachable dead weight).
    word_patterns = [
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers, incl. decimals
        (r'.*ould$', 'MD'),                 # modals: could, would, should
        (r'.*ing$', 'VBG'),                 # gerunds
        (r'.*ed$', 'VBD'),                  # past-tense verbs
        (r'.*ness$', 'NN'),
        (r'.*ment$', 'NN'),
        (r'.*ful$', 'JJ'),
        (r'.*ious$', 'JJ'),
        (r'.*ble$', 'JJ'),
        (r'.*ic$', 'JJ'),
        (r'.*ive$', 'JJ'),
        (r'.*est$', 'JJ'),
        (r'^a$', 'PREP'),
        (r'['+string.punctuation+']', 'PUN'),
    ]

    def __init__(self, training_corpus_size, trained_pickle):
        """
        Prepares the hybrid tagger.

        :param training_corpus_size: max number of tagged sentences taken
            from each corpus, or None for the full corpora
        :param trained_pickle: path of a pickled tagger to load; if None,
            a new tagger is trained (and, note, cannot be saved)
        """
        if trained_pickle is not None:
            self._loading(training_corpus_size, trained_pickle)
        else:
            self._training(training_corpus_size, trained_pickle)

    def _loading(self, training_corpus_size, trained_pickle):
        """
        Loads a previously pickled tagger into self.tagger, falling back to
        training a fresh one if the pickle is missing or corrupt.
        """
        try:
            # 'rb' + context manager: pickle data is binary and the original
            # file() handle was never closed (and file() is a py2-only builtin)
            with open(trained_pickle, 'rb') as picklefile:
                self.tagger = pickle.load(picklefile)
        except pickle.PickleError as pickerr:
            _logger.warning("%s" % pickerr)
            self._training(training_corpus_size, trained_pickle)
        except IOError as ioerr:
            _logger.warning("Pickled tagger %s not found, will train its own tagger" % trained_pickle)
            self._training(training_corpus_size, trained_pickle)

    def _training(self, training_corpus_size, trained_pickle):
        """
        Trains a chain of backoff taggers on brown + conll2000 sentences,
        stores it in self.tagger, and pickles it to trained_pickle.
        """
        _logger.debug("Training and saving a new tagger")
        # compose the training sentences, optionally capped per corpus
        if training_corpus_size is None:
            brown_train = list(nltk.corpus.brown.tagged_sents())
            conll_train = list(nltk.corpus.conll2000.tagged_sents())
        else:
            brown_train = list(nltk.corpus.brown.tagged_sents()[:training_corpus_size])
            conll_train = list(nltk.corpus.conll2000.tagged_sents()[:training_corpus_size])
        train_sents = list(itertools.chain(brown_train, conll_train))
        # base tagger classes for the initial backoff chain
        tagger_classes = [nltk.tag.AffixTagger, nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger]
        backoff = nltk.tag.RegexpTagger(self.word_patterns)
        self.tagger = self._backoff(train_sents, tagger_classes, backoff=backoff)
        # 'wb': pickle data is binary; the original 'w+' text-mode handle
        # was also never closed
        with open(trained_pickle, 'wb') as picklefile:
            pickle.dump(self.tagger, picklefile)

    def _backoff(self, tagged_sents, tagger_classes, backoff=None):
        """
        Chains each tagger class onto the previous one as its backoff and
        returns the last (outermost) tagger. Takes a lot of time to train.

        Works on a copy of tagger_classes: the original mutated the caller's
        list in place (del tagger_classes[0]).
        """
        remaining = list(tagger_classes)
        if not backoff:
            backoff = remaining.pop(0)(tagged_sents)
        for cls in remaining:
            backoff = cls(tagged_sents, backoff=backoff)
        return backoff

    @staticmethod
    def posTag(tokens):
        """
        ALTERNATE TAGGER
        gives access to the default nltk pos tagger (slower but smarter)
        each token becomes ['token','TAG']
        """
        # list comprehension (not map) so a list is returned on py3 as well
        return [list(tagged) for tagged in pos_tag(tokens)]

    def _normalize(self, tagtok):
        """Returns ['token','?'] when no tag was found for the token."""
        tagtok = list(tagtok)
        if tagtok[1] is None:
            tagtok[1] = '?'
        return tagtok

    def tag(self, tokens):
        """Returns the tagged sentence using the trained self.tagger."""
        # list comprehension keeps the py2 map() list semantics on py3 too
        return [self._normalize(tagtok) for tagtok in self.tagger.tag(tokens)]

    @staticmethod
    def getContent(sentence):
        """Return words from a tagged list like [["the","DET"],["python","NN"]]."""
        return [tagged[0] for tagged in sentence]

    @staticmethod
    def getTag(sentence):
        """Return TAGS from a tagged list like [["the","DET"],["python","NN"]]."""
        return [tagged[1] for tagged in sentence]