-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Python interface to the mappings (works in Python 2.7)
- Loading branch information
neatnate
committed
May 7, 2011
1 parent
62d946a
commit f72b57a
Showing
1 changed file
with
100 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
''' | ||
Interface for converting POS tags from various treebanks | ||
to the universal tagset of Petrov, Das, & McDonald. | ||
The tagset consists of the following 12 coarse tags: | ||
VERB - verbs (all tenses and modes) | ||
NOUN - nouns (common and proper) | ||
PRON - pronouns | ||
ADJ - adjectives | ||
ADV - adverbs | ||
ADP - adpositions (prepositions and postpositions) | ||
CONJ - conjunctions | ||
DET - determiners | ||
NUM - cardinal numbers | ||
PRT - particles or other function words | ||
X - other: foreign words, typos, abbreviations | ||
. - punctuation | ||
@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/ | ||
@author: Nathan Schneider (nschneid) | ||
@since: 2011-05-06 | ||
''' | ||
|
||
# Strive towards Python 3 compatibility | ||
from __future__ import print_function, unicode_literals, division | ||
from future_builtins import map, filter | ||
|
||
import re, glob | ||
from collections import defaultdict | ||
|
||
MAP_DIR = 'universal_pos_tags.1.01' | ||
|
||
COARSE_TAGS = ('VERB','NOUN','PRON','ADJ','ADV','ADP','CONJ','DET','NUM','PRT','X','.') | ||
|
||
_MAPS = defaultdict(dict) | ||
|
||
def readme(): | ||
with open(MAP_DIR+'/README') as f: | ||
return f.read() | ||
|
||
def fileids(lang=''): | ||
''' | ||
Optionally given a two-letter ISO language code, returns names of files | ||
containing mappings from a tagset from a treebank in that language to the | ||
universal tagset. | ||
>>> fileids('en') | ||
[u'en-ptb'] | ||
>>> fileids('zh') | ||
[u'zh-ctb6', u'zh-sinica'] | ||
''' | ||
return [re.match(r'.*[/]([^/\\]+)[.]map', p).group(1) for p in glob.glob(MAP_DIR + '/{}-*.map'.format(lang.lower()))] | ||
|
||
def _read(fileid): | ||
with open(MAP_DIR+'/'+fileid+'.map') as f: | ||
for ln in f: | ||
ln = ln.strip() | ||
if ln=='': continue | ||
fine, coarse = ln.split('\t') | ||
assert coarse in COARSE_TAGS,'Unexpected coarse tag: {}'.format(coarse) | ||
assert fine not in _MAPS[fileid],'Multiple entries for original tag: {}'.format(fine) | ||
_MAPS[fileid][fine] = coarse | ||
|
||
def mapping(fileid): | ||
''' | ||
Retrieves the mapping from original tags to universal tags for the | ||
treebank in question. | ||
>>> mapping('ru-rnc')=={'!': '.', 'A': 'ADJ', 'AD': 'ADV', 'C': 'CONJ', 'COMP': 'CONJ', 'IJ': 'X', 'NC': 'NUM', 'NN': 'NOUN', 'P': 'PRON', 'PTCL': 'PRT', 'V': 'VERB', 'VG': 'VERB', 'VI': 'VERB', 'VP': 'VERB', 'YES_NO_SENT': 'X', 'Z': 'X'} | ||
True | ||
''' | ||
if fileid not in _MAPS: | ||
_read(fileid) | ||
return _MAPS[fileid] | ||
|
||
def convert(fileid, originalTag): | ||
''' | ||
Produces the (coarse) universal tag given an original POS tag from the | ||
treebank in question. | ||
>>> convert('en-ptb', 'VBZ') | ||
u'VERB' | ||
>>> convert('en-ptb', 'VBP') | ||
u'VERB' | ||
>>> convert('en-ptb', '``') | ||
u'.' | ||
''' | ||
return mapping(fileid)[originalTag] | ||
|
||
|
||
def test(): | ||
for fileid in fileids(): | ||
mapping(fileid) | ||
import doctest | ||
doctest.testmod() | ||
|
||
if __name__=='__main__': | ||
test() |