% Tsurgeon script. Each rule is a Tregex pattern, a blank line, one or more
% Tsurgeon operations, and another blank line. A percent sign starts a comment.
%
% Context-sensitive mapping of PTB POS tags to
% Universal POS tags.
%
% Author: Sebastian Schuster
% Author: Christopher Manning
%
% The original Penn Treebank WSJ contains 45 POS tags (but almost certainly # for British pound currency is a bad idea!)
% {#=173, $=9,039, ''=8,658, ,=60,489, -LRB-=1,672, -RRB-=1,689, .=48,733, :=6,087, CC=29,462, CD=44,937, DT=101,190,
% EX=1,077, FW=268, IN=121,903, JJ=75,266, JJR=4,042, JJS=2,396, LS=64, MD=11,997, NN=163,935, NNP=114,053,
% NNPS=3,087, NNS=73,964, PDT=441, POS=10,801, PRP=21,357, PRP$=10,241, RB=38,197, RBR=2,175, RBS=555, RP=3,275,
% SYM=70, TO=27,449, UH=117, VB=32,565, VBD=37,493, VBG=18,239, VBN=24,865, VBP=15,377, VBZ=26,436, WDT=5,323,
% WP=2,887, WP$=219, WRB=2,625, ``=8,878}
%
% The Web Treebank corpus adds 6 tags, but doesn't have #, yielding 50 POS tags:
% ADD, AFX, GW, HYPH, NFP, XX
%
% OntoNotes 4.0 has 53 tags. It doesn't have # but adds: -LSB-, -RSB- [both mistakes!], ADD, AFX, CODE, HYPH, NFP,
% X [mistake!], XX.
%
%
% ------------------------------
% Context-sensitive mappings
%
% TO: infinitival "to" becomes PART, every other "to" becomes ADP.
% The last rule is the "(otherwise)" fallback, so it must stay after the
% PART rules: it only matches TO nodes that were not already relabelled.
%
% TO -> PART (in CONJP phrases: the CONJP has both a TO child and a VB child)
@CONJP < TO=target < VB

relabel target PART

% TO -> PART (the VP that contains TO also has a VP child)
@VP < @VP < (/^TO$/=target <... {/.*/})

relabel target PART

% TO -> PART (TO is the only child of its VP)
@VP <: (/^TO$/=target <... {/.*/})

relabel target PART

% TO -> ADP (otherwise)
TO=target <... {/.*/}

relabel target ADP

% Disabled rule, kept for reference.
% Don't do this, we are now treating these as copular constructions
% VB.* -> AUX (for passives where main verb is part of an ADJP)
%@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase|get|got|getting|gets|gotten)$/ ) < (@ADJP [ < VBN|VBD | < (@VP|ADJP < VBN|VBD) < CC ] )
%
%relabel target AUX
%
% VB.* -> AUX (for cases with fronted main VPs)
% Under SINV: a form of "be" (matched case-insensitively) whose VP is a right
% sibling of a VP that has a VBD or VBN child.
@SINV < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ ) $-- (@VP < VBD|VBN))

relabel target AUX

% VB.* -> AUX (another, rarer case of fronted VPs)
% Same as the rule above, but the form of "be" sits one VP level deeper.
@SINV < (@VP < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ )) $-- (@VP < VBD|VBN))

relabel target AUX

% Disabled rule, kept for reference.
% VB.* -> AUX (passive, case 2)
%SQ|SINV < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ $++ (VP < VBD|VBN))
%
%relabel target AUX
%
% VB.* -> AUX (active, case 1)
% A VP that has a VP child and a VB.* child whose only child is one of the
% listed word forms (case-insensitive). The list also contains misspellings
% and tokenization fragments of auxiliaries.
% NOTE(review): this pattern uses plain VP while the TO rules use @VP, so a
% VP label carrying a function tag would not match here - confirm intended.
VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)

relabel target AUX

% VB.* -> AUX (active, case 2)
% A VB.* directly under SQ or SINV with a later sibling whose label starts
% with VP.
@SQ|SINV < (/^VB/=target $++ /^(?:VP)/ <... {/.*/})

relabel target AUX

% VB.* -> VERB (otherwise; must stay after all VB.* -> AUX rules)
/^VB.*/=target <... {/.*/}

relabel target VERB

% IN -> SCONJ (subordinating conjunctions)
% IN under an SBAR (with or without a "-XXX" suffix on the label) that has a
% later sibling S, FRAG, SBAR or SINV.
/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})

relabel target SCONJ

% IN -> SCONJ (subordinating conjunctions II)
% IN under a PP, immediately followed by a sibling SBAR or S.
@PP < (IN=target $+ @SBAR|S)

relabel target SCONJ

% IN -> ADP (otherwise; must stay after the SCONJ rules)
IN=target < __

relabel target ADP

% NN -> SYM (in case of the percent sign)
% The percent sign in the pattern is backslash-escaped because a bare one
% would start a comment.
NN=target <... {/\\%/}

relabel target SYM

% NN -> PRON (fused det-noun pronouns, matched case-insensitively)
NN=target < (/^(?i:(somebody|something|someone|anybody|anything|anyone|everybody|everything|everyone|nobody|nothing))$/)

relabel target PRON

% NN -> NOUN (otherwise; must stay after the SYM and PRON rules)
NN=target <... {/.*/}

relabel target NOUN

% NFP -> PUNCT (the token consists only of repeated tildes, only of repeated
% asterisks, or only of repeated hyphens)
NFP=target <... {/^(~+|\*+|\-+)$/}

relabel target PUNCT

% NFP -> SYM (otherwise; must stay after the PUNCT rule)
NFP=target <... {/.*/}

relabel target SYM

% RB -> PART when it is verbal negation ("not" or its reductions, matched
% case-insensitively) directly under VP, SINV, SQ, FRAG or ADVP
@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)

relabel target PART

% RB -> ADV (otherwise; must stay after the PART rule)
RB=target <... {/.*/}

relabel target ADV

% DT -> PRON (pronominal this/that/these/those: the DT is the only child of
% its NP; the word is matched case-insensitively)
@NP <: (DT=target < /^(?i:th(is|at|ose|ese))$/)

relabel target PRON

% DT -> DET (otherwise; must stay after the PRON rule)
DT=target < __

relabel target DET

% WDT -> PRON (pronominal that/which: the WDT is the only child of a WHNP
% or NP)
@WHNP|NP <: (WDT=target < /^(?i:(that|which))$/)

relabel target PRON

% WDT -> SCONJ (incorrectly tagged subordinating conjunctions: that/which
% directly under SBAR)
@SBAR < (WDT=target < /^(?i:(that|which))$/)

relabel target SCONJ

% WDT -> DET (otherwise; must stay after the PRON and SCONJ rules)
WDT=target <... {/.*/}

relabel target DET

% ------------------------------
% 1 to 1 mappings
%
% Each rule below relabels one PTB tag unconditionally. The pattern only
% requires that the tag node has a child.
%
% CC -> CONJ
% NOTE(review): Universal Dependencies v2 names this tag CCONJ; this file
% emits CONJ - confirm which tag set consumers expect.
CC=target <... {/.*/}

relabel target CONJ

% CD -> NUM
CD=target <... {/.*/}

relabel target NUM

% EX -> PRON
EX=target <... {/.*/}

relabel target PRON

% FW -> X
FW=target <... {/.*/}

relabel target X

% JJ.* -> ADJ (covers JJ, JJR and JJS)
/^JJ.*$/=target < __

relabel target ADJ

% LS -> X
LS=target <... {/.*/}

relabel target X

% MD -> AUX
MD=target <... {/.*/}

relabel target AUX

% NNS -> NOUN
NNS=target <... {/.*/}

relabel target NOUN

% NNP -> PROPN
NNP=target <... {/.*/}

relabel target PROPN

% NNPS -> PROPN
NNPS=target <... {/.*/}

relabel target PROPN

% PDT -> DET
PDT=target <... {/.*/}

relabel target DET

% POS -> PART
POS=target <... {/.*/}

relabel target PART

% PRP -> PRON
PRP=target <... {/.*/}

relabel target PRON

% PRP$ -> PRON (written as a regex so that the dollar sign can be escaped)
/^PRP\$$/=target <... {/.*/}

relabel target PRON

% RBR -> ADV
RBR=target <... {/.*/}

relabel target ADV

% RBS -> ADV
RBS=target <... {/.*/}

relabel target ADV

% RP -> ADP
RP=target <... {/.*/}

relabel target ADP

% UH -> INTJ
UH=target <... {/.*/}

relabel target INTJ

% WP -> PRON
WP=target <... {/.*/}

relabel target PRON

% WP$ -> PRON (written as a regex so that the dollar sign can be escaped)
/^WP\$$/=target <... {/.*/}

relabel target PRON

% WRB -> ADV
WRB=target <... {/.*/}

relabel target ADV

% Punctuation tags -> PUNCT. Tags made of special characters are written as
% anchored regexes so that they match the whole label literally.
%
% `` -> PUNCT
/^``$/=target <... {/.*/}

relabel target PUNCT

% '' -> PUNCT
/^''$/=target < __

relabel target PUNCT

% ( -> PUNCT
/^\($/=target <... {/.*/}

relabel target PUNCT

% ) -> PUNCT
/^\)$/=target <... {/.*/}

relabel target PUNCT

% -LRB- -> PUNCT
/^-LRB-$/=target <... {/.*/}

relabel target PUNCT

% -RRB- -> PUNCT
/^-RRB-$/=target <... {/.*/}

relabel target PUNCT

% , -> PUNCT
/^,$/=target <... {/.*/}

relabel target PUNCT

% . -> PUNCT
/^\.$/=target <... {/.*/}

relabel target PUNCT

% : -> PUNCT
/^:$/=target <... {/.*/}

relabel target PUNCT

% HYPH -> PUNCT
HYPH=target <... {/.*/}

relabel target PUNCT

% # -> SYM
/^#$/=target <... {/.*/}

relabel target SYM

% $ -> SYM. Also note that there is a no-op rule of SYM -> SYM!
% (That is, the PTB tag SYM needs no rule here and is left unchanged.)
/^\$$/=target <... {/.*/}

relabel target SYM

% ADD -> X
ADD=target <... {/.*/}

relabel target X

% AFX -> X
AFX=target <... {/.*/}

relabel target X

% GW -> X
GW=target <... {/.*/}

relabel target X

% XX -> X
XX=target <... {/.*/}

relabel target X
