Analyzer units for initials. Fixes GH-26.
kmike committed Apr 22, 2014
Parent: f539f92 · Commit: 8141ada
Showing 5 changed files with 256 additions and 16 deletions.
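
The effect of the change is easiest to see from the caller's side. A minimal usage sketch (illustrative only, not part of the commit; it assumes pymorphy2 with its default Russian dictionaries is installed): after this change a single uppercase Cyrillic letter such as «И» also receives abbreviated first-name and patronymic readings.

# Illustrative only; not part of the commit.
# Assumes pymorphy2 and its default dictionaries are installed.
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

# 'И' keeps its usual readings (e.g. the conjunction) and additionally
# gets Name/Patr + Abbr readings from the new analyzer units.
for p in morph.parse('И'):
    if {'Abbr'} in p.tag:
        print(p.tag, '->', p.normalized.word, p.normalized.tag)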
16 changes: 13 additions & 3 deletions pymorphy2/analyzer.py
@@ -143,17 +143,27 @@ class MorphAnalyzer(object):

ENV_VARIABLE = 'PYMORPHY2_DICT_PATH'
DEFAULT_UNITS = [
units.DictionaryAnalyzer,
[
units.DictionaryAnalyzer,
units.AbbreviatedFirstNameAnalyzer,
units.AbbreviatedPatronymicAnalyzer,
],

units.NumberAnalyzer,
units.PunctuationAnalyzer,
[units.RomanNumberAnalyzer, units.LatinAnalyzer],
[
units.RomanNumberAnalyzer,
units.LatinAnalyzer
],

units.HyphenSeparatedParticleAnalyzer,
units.HyphenAdverbAnalyzer,
units.HyphenatedWordsAnalyzer,
units.KnownPrefixAnalyzer,
[units.UnknownPrefixAnalyzer, units.KnownSuffixAnalyzer],
[
units.UnknownPrefixAnalyzer,
units.KnownSuffixAnalyzer
],
]

def __init__(self, path=None, result_type=Parse, units=None,
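
For reference, the constructor signature shown just above already accepts a custom unit list, so the new grouped defaults can be overridden per analyzer instance. A hedged sketch (it assumes only the units= keyword visible in the signature above and the unit classes referenced in DEFAULT_UNITS; not part of the commit):

# Hedged sketch: build an analyzer with an explicit unit pipeline.
import pymorphy2
from pymorphy2 import units

morph = pymorphy2.MorphAnalyzer(units=[
    [
        units.DictionaryAnalyzer,
        units.AbbreviatedFirstNameAnalyzer,
        units.AbbreviatedPatronymicAnalyzer,
    ],
    units.NumberAnalyzer,
    units.PunctuationAnalyzer,
])

print(morph.parse('Д')[0].tag)   # some reading of the single letter 'Д'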
4 changes: 4 additions & 0 deletions pymorphy2/units/__init__.py
@@ -18,3 +18,7 @@
NumberAnalyzer,
RomanNumberAnalyzer
)
from .abbreviations import (
AbbreviatedFirstNameAnalyzer,
AbbreviatedPatronymicAnalyzer
)
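
A trivial check of what the added re-exports make importable (illustrative, not part of the commit):

# Not part of the commit; just shows the new public imports.
from pymorphy2.units import (
    AbbreviatedFirstNameAnalyzer,
    AbbreviatedPatronymicAnalyzer,
)

print(AbbreviatedFirstNameAnalyzer.TAG_PATTERN)
print(AbbreviatedPatronymicAnalyzer.TAG_PATTERN)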
79 changes: 79 additions & 0 deletions pymorphy2/units/abbreviations.py
@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
"""
Analyzer units for abbreviated words
------------------------------------
"""
from __future__ import absolute_import, unicode_literals, division
from pymorphy2.units.base import BaseAnalyzerUnit


class _InitialsAnalyzer(BaseAnalyzerUnit):
SCORE = 0.1
LETTERS = set('АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯ')
TAG_PATTERN = None

def __init__(self, morph):
super(_InitialsAnalyzer, self).__init__(morph)
if self.TAG_PATTERN is None:
raise ValueError("Define TAG_PATTERN in a subclass")
self._tags = self._get_gender_case_tags(self.TAG_PATTERN)

def _get_gender_case_tags(self, pattern):
return [
self.morph.TagClass(pattern % {'gender': gender, 'case': case})
for gender in ['masc', 'femn']
for case in ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']
]

def parse(self, word, word_lower, seen_parses):
if word not in self.LETTERS:
return []
return [
(word_lower, tag, word_lower, self.SCORE, ((self, word),))
for tag in self._tags
]

def tag(self, word, word_lower, seen_tags):
if word not in self.LETTERS:
return []
return self._tags[:]


class AbbreviatedFirstNameAnalyzer(_InitialsAnalyzer):
TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Name,Fixd,Abbr sing,%(case)s'

def __init__(self, morph):
super(AbbreviatedFirstNameAnalyzer, self).__init__(morph)
self._tags_masc = [tag for tag in self._tags if 'masc' in tag]
self._tags_femn = [tag for tag in self._tags if 'femn' in tag]
assert self._tags_masc + self._tags_femn == self._tags

def get_lexeme(self, form):
# 2 lexemes: masc and femn
fixed_word, form_tag, normal_form, score, methods_stack = form
tags = self._tags_masc if 'masc' in form_tag else self._tags_femn
return [
(fixed_word, tag, normal_form, score, methods_stack)
for tag in tags
]

def normalized(self, form):
# don't normalize female names to male names
fixed_word, form_tag, normal_form, score, methods_stack = form
tags = self._tags_masc if 'masc' in form_tag else self._tags_femn
return fixed_word, tags[0], normal_form, score, methods_stack


class AbbreviatedPatronymicAnalyzer(_InitialsAnalyzer):
TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Patr,Fixd,Abbr sing,%(case)s'

def get_lexeme(self, form):
fixed_word, _, normal_form, score, methods_stack = form
return [
(fixed_word, tag, normal_form, score, methods_stack)
for tag in self._tags
]

def normalized(self, form):
fixed_word, _, normal_form, score, methods_stack = form
return fixed_word, self._tags[0], normal_form, score, methods_stack
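
A standalone sketch of what _get_gender_case_tags expands TAG_PATTERN into, with morph.TagClass replaced by plain strings so it runs without an analyzer (illustrative, not part of the commit):

# Plain-string version of the expansion performed in _get_gender_case_tags.
TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Name,Fixd,Abbr sing,%(case)s'

tags = [
    TAG_PATTERN % {'gender': gender, 'case': case}
    for gender in ['masc', 'femn']
    for case in ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']
]
assert len(tags) == 12                 # 2 genders x 6 cases
print(tags[0])   # NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,nomn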
52 changes: 52 additions & 0 deletions tests/test_analyzer.py
@@ -272,3 +272,55 @@ class TetsPunctuationPredictor:
def test_tag(self):
assert morph.tag('…') == [morph.TagClass('PNCT')]


class TestInitials:

def assertHasFirstName(self, tags):
assert any(set(['Name', 'Abbr']) in tag for tag in tags), tags

def assertHasPatronymic(self, tags):
assert any(set(['Patr', 'Abbr']) in tag for tag in tags), tags

def _filter_parse(self, word, grammemes):
return [p for p in morph.parse(word) if set(grammemes) in p.tag]

def test_tag(self):
tags = morph.tag('Д')
self.assertHasFirstName(tags)
self.assertHasPatronymic(tags)

def test_tag_conj(self):
tags = morph.tag('И')
self.assertHasFirstName(tags)
self.assertHasPatronymic(tags)
assert any('CONJ' in tag for tag in tags), tags

def test_parse(self):
tags = [p.tag for p in morph.parse('И')]
self.assertHasFirstName(tags)
self.assertHasPatronymic(tags)

def test_normalize_name_masc(self):
parse = self._filter_parse('И', ['Name', 'accs', 'masc'])[0]
assert parse.normalized.word == 'и'
assert parse.normalized.tag.case == 'nomn'
assert parse.normalized.tag.gender == 'masc'

def test_normalize_patr_masc(self):
parse = self._filter_parse('И', ['Patr', 'accs', 'masc'])[0]
assert parse.normalized.word == 'и'
assert parse.normalized.tag.case == 'nomn'
assert parse.normalized.tag.gender == 'masc'

def test_normalize_name_femn(self):
parse = self._filter_parse('И', ['Name', 'accs', 'femn'])[0]
assert parse.normalized.word == 'и'
assert parse.normalized.tag.case == 'nomn'
assert parse.normalized.tag.gender == 'femn'

def test_normalize_patr_femn(self):
parse = self._filter_parse('И', ['Patr', 'accs', 'femn'])[0]
assert parse.normalized.word == 'и'
assert parse.normalized.tag.case == 'nomn'
assert parse.normalized.tag.gender == 'masc'
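
The assertions above rely on pymorphy2's grammeme-containment idiom: a set on the left-hand side of the in operator tests that every grammeme in the set is present in the tag. A small illustration (assumes installed dictionaries; not part of the commit):

# Illustration of the set-in-tag idiom used by TestInitials.
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
tags = morph.tag('Д')
print(any({'Name', 'Abbr'} in tag for tag in tags))   # expected: True
print(any({'Patr', 'Abbr'} in tag for tag in tags))   # expected: True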

121 changes: 108 additions & 13 deletions tests/test_lexemes.py
@@ -3,10 +3,31 @@
import pytest
from .utils import morph


# lexemes are divided by blank lines;
# lines that start with "#" are comments;
# lines that start with "XFAIL" exclude the lexeme from testing.
LEXEMES = """

def parse_lexemes(lexemes_txt):
lexemes_txt = "".join(
line for line in lexemes_txt.strip().splitlines(True)
if not line.startswith("#")
)
return lexemes_txt.split("\n\n")

def get_lexeme_words(lexeme):
lexeme_words = tuple(lexeme.split())
if lexeme_words[0].startswith('XFAIL'):
pytest.xfail()
return lexeme_words


def parse_full_lexeme(lexeme):
forms = lexeme.strip().splitlines()
return [form.split(None, 1) for form in forms]


LEXEMES = parse_lexemes("""
# =========== noun
кот кота коту кота котом коте
коты котов котам котов котами котах
@@ -27,7 +48,7 @@
экс-лжекот экс-лжекота экс-лжекоту экс-лжекота экс-лжекотом экс-лжекоте
экс-лжекоты экс-лжекотов экс-лжекотам экс-лжекотов экс-лжекотами экс-лжекотах
# =========== noun with two knoen prefixes
# =========== noun with two known prefixes
экслжекот экслжекота экслжекоту экслжекота экслжекотом экслжекоте экслжекоты
экслжекотов экслжекотам экслжекотов экслжекотами экслжекотах
@@ -88,26 +109,80 @@
лес-колдун леса-колдуна лесу-колдуну лес-колдуна лесом-колдуном лесе-колдуне
леса-колдуны лесов-колдунов лесам-колдунам леса-колдунов лесами-колдунами лесах-колдунах
""".strip()
""")

LEXEMES = "".join(l for l in LEXEMES.splitlines(True) if not l.startswith("#")).split("\n\n")

LEXEMES_FULL = parse_lexemes("""
# ============ noun, a sanity check
кот NOUN,anim,masc sing,nomn
кота NOUN,anim,masc sing,gent
коту NOUN,anim,masc sing,datv
кота NOUN,anim,masc sing,accs
котом NOUN,anim,masc sing,ablt
коте NOUN,anim,masc sing,loct
коты NOUN,anim,masc plur,nomn
котов NOUN,anim,masc plur,gent
котам NOUN,anim,masc plur,datv
котов NOUN,anim,masc plur,accs
котами NOUN,anim,masc plur,ablt
котах NOUN,anim,masc plur,loct
# =========== adverb
театрально ADVB
по-театральному ADVB
# =========== pronoun with a particle
он-то NPRO,masc,3per,Anph sing,nomn
его-то NPRO,masc,3per,Anph sing,gent
него-то NPRO,masc,3per,Anph sing,gent,Af-p
ему-то NPRO,masc,3per,Anph sing,datv
нему-то NPRO,masc,3per,Anph sing,datv,Af-p
его-то NPRO,masc,3per,Anph sing,accs
него-то NPRO,masc,3per,Anph sing,accs,Af-p
им-то NPRO,masc,3per,Anph sing,ablt
ним-то NPRO,masc,3per,Anph sing,ablt,Af-p
нём-то NPRO,masc,3per,Anph sing,loct,Af-p
# ========== initials
И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,nomn
И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,gent
И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,datv
И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,accs
И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,ablt
И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,loct
И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,nomn
И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,gent
И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,datv
И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,accs
И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,ablt
И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,loct
И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,nomn
И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,gent
И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,datv
И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,accs
И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,ablt
И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,loct
И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,nomn
И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,gent
И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,datv
И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,accs
И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,ablt
И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,loct
""")

def _parse_lexeme(lexeme):
lexeme_words = tuple(lexeme.split())
if lexeme_words[0].startswith('XFAIL'):
pytest.xfail()
return lexeme_words


# ============ Tests:

@pytest.mark.parametrize("lexeme", LEXEMES)
def test_has_proper_lexemes(lexeme):
"""
Check if the lexeme of first word in the lexeme is the same lexeme.
Check if the lexeme of the first word in the lexeme is the same lexeme.
"""
lexeme_words = _parse_lexeme(lexeme)
lexeme_words = get_lexeme_words(lexeme)

variants = _lexemes_for_word(lexeme_words[0])
if lexeme_words not in variants:
@@ -120,7 +195,7 @@ def test_lexemes_sanity(lexeme):
"""
Check if parse.lexeme works properly by applying it several times.
"""
lexeme_words = _parse_lexeme(lexeme)
lexeme_words = get_lexeme_words(lexeme)

for word in lexeme_words:
for p in morph.parse(word):
@@ -132,7 +207,7 @@ def test_normalized_is_first(lexeme):
"""
Test that parse.normalized is a first form in lexeme.
"""
lexeme_words = _parse_lexeme(lexeme)
lexeme_words = get_lexeme_words(lexeme)

first_parse = morph.parse(lexeme_words[0])[0]
normal_form = (first_parse.word, first_parse.tag.POS)
@@ -142,6 +217,26 @@ def test_normalized_is_first(lexeme):
normalized = [(p.normalized.word, p.normalized.tag.POS) for p in parses]
assert normal_form in normalized


@pytest.mark.parametrize("lexeme", LEXEMES_FULL)
def test_full_lexemes(lexeme):
"""
Test that full lexemes are correct.
"""
forms = parse_full_lexeme(lexeme)
forms_lower = [(w.lower(), tag) for w, tag in forms]
for word, tag in forms:
assert_has_full_lexeme(word, forms_lower)


def assert_has_full_lexeme(word, forms):
for p in morph.parse(word):
lexeme_forms = [(f.word, str(f.tag)) for f in p.lexeme]
if lexeme_forms == forms:
return
raise AssertionError("Word %s doesn't have lexeme %s" % (word, forms))


def _lexemes_for_word(word):
res = []
for p in morph.parse(word):
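
For clarity, this is what the parse_full_lexeme helper added above produces for one small lexeme; the snippet repeats the same splitting logic on a made-up two-line input (not part of the commit):

# Standalone rerun of the parse_full_lexeme logic on a tiny sample.
lexeme = """
театрально ADVB
по-театральному ADVB
"""
forms = [form.split(None, 1) for form in lexeme.strip().splitlines()]
print(forms)   # [['театрально', 'ADVB'], ['по-театральному', 'ADVB']]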
