class _InitialsAnalyzer(BaseAnalyzerUnit):
    """
    Base unit that treats a single capital Cyrillic letter as an initial.

    Subclasses must set ``TAG_PATTERN`` to a tag template containing
    ``%(gender)s`` and ``%(case)s`` placeholders; one tag per
    gender/case combination is built when the unit is created.
    """
    SCORE = 0.1
    LETTERS = set('АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЭЮЯ')
    TAG_PATTERN = None

    def __init__(self, morph):
        super(_InitialsAnalyzer, self).__init__(morph)
        pattern = self.TAG_PATTERN
        if pattern is None:
            raise ValueError("Define TAG_PATTERN in a subclass")
        self._tags = self._get_gender_case_tags(pattern)

    def _get_gender_case_tags(self, pattern):
        """
        Return tags for every gender/case pair.

        All ``masc`` tags come first (in case order), then all ``femn``
        tags; subclasses rely on this ordering.
        """
        genders = ['masc', 'femn']
        cases = ['nomn', 'gent', 'datv', 'accs', 'ablt', 'loct']
        tags = []
        for gender in genders:
            for case in cases:
                grammemes = pattern % {'gender': gender, 'case': case}
                tags.append(self.morph.TagClass(grammemes))
        return tags

    def parse(self, word, word_lower, seen_parses):
        """
        Return one parse per known tag when ``word`` is a letter
        from ``LETTERS``; otherwise return an empty list.
        """
        if word in self.LETTERS:
            return [
                (word_lower, tag, word_lower, self.SCORE, ((self, word),))
                for tag in self._tags
            ]
        return []

    def tag(self, word, word_lower, seen_tags):
        """
        Return a fresh list of all known tags when ``word`` is a letter
        from ``LETTERS``; otherwise return an empty list.
        """
        if word in self.LETTERS:
            return list(self._tags)
        return []


class AbbreviatedFirstNameAnalyzer(_InitialsAnalyzer):
    """Parses a single capital letter as an abbreviated first name."""
    TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Name,Fixd,Abbr sing,%(case)s'

    def __init__(self, morph):
        super(AbbreviatedFirstNameAnalyzer, self).__init__(morph)
        masc_tags = []
        femn_tags = []
        for tag in self._tags:
            if 'masc' in tag:
                masc_tags.append(tag)
            if 'femn' in tag:
                femn_tags.append(tag)
        self._tags_masc = masc_tags
        self._tags_femn = femn_tags
        assert self._tags_masc + self._tags_femn == self._tags

    def _same_gender_tags(self, form_tag):
        # Masculine forms map to the masculine tags; everything else
        # is treated as feminine.
        if 'masc' in form_tag:
            return self._tags_masc
        return self._tags_femn

    def get_lexeme(self, form):
        """Return all case forms that share the gender of ``form``."""
        # 2 lexemes: masc and femn
        fixed_word, form_tag, normal_form, score, methods_stack = form
        return [
            (fixed_word, tag, normal_form, score, methods_stack)
            for tag in self._same_gender_tags(form_tag)
        ]

    def normalized(self, form):
        """Return the first tag (by case order) of the same gender as ``form``."""
        # don't normalize female names to male names
        fixed_word, form_tag, normal_form, score, methods_stack = form
        first_tag = self._same_gender_tags(form_tag)[0]
        return fixed_word, first_tag, normal_form, score, methods_stack


class AbbreviatedPatronymicAnalyzer(_InitialsAnalyzer):
    """Parses a single capital letter as an abbreviated patronymic."""
    TAG_PATTERN = 'NOUN,anim,%(gender)s,Sgtm,Patr,Fixd,Abbr sing,%(case)s'

    def get_lexeme(self, form):
        """Return a single lexeme containing every known tag, both genders."""
        fixed_word, _, normal_form, score, methods_stack = form
        lexeme = []
        for tag in self._tags:
            lexeme.append((fixed_word, tag, normal_form, score, methods_stack))
        return lexeme

    def normalized(self, form):
        """Return the form re-tagged with the first known tag, whatever its gender."""
        fixed_word, _, normal_form, score, methods_stack = form
        first_tag = self._tags[0]
        return fixed_word, first_tag, normal_form, score, methods_stack
class TestInitials:
    """Checks that single capital letters are analyzed as initials."""

    def assertHasFirstName(self, tags):
        wanted = set(['Name', 'Abbr'])
        assert any(wanted in tag for tag in tags), tags

    def assertHasPatronymic(self, tags):
        wanted = set(['Patr', 'Abbr'])
        assert any(wanted in tag for tag in tags), tags

    def _filter_parse(self, word, grammemes):
        # Keep only the parses whose tag contains all the given grammemes.
        matching = []
        for p in morph.parse(word):
            if set(grammemes) in p.tag:
                matching.append(p)
        return matching

    def _check_normalized(self, grammemes, expected_gender):
        # Shared body of the test_normalize_* tests: take the first parse
        # of 'И' matching the grammemes and inspect its normalized form.
        parse = self._filter_parse('И', grammemes)[0]
        assert parse.normalized.word == 'и'
        assert parse.normalized.tag.case == 'nomn'
        assert parse.normalized.tag.gender == expected_gender

    def test_tag(self):
        tags = morph.tag('Д')
        self.assertHasFirstName(tags)
        self.assertHasPatronymic(tags)

    def test_tag_conj(self):
        # 'И' must keep its conjunction reading alongside the initials.
        tags = morph.tag('И')
        self.assertHasFirstName(tags)
        self.assertHasPatronymic(tags)
        assert any('CONJ' in tag for tag in tags), tags

    def test_parse(self):
        tags = []
        for p in morph.parse('И'):
            tags.append(p.tag)
        self.assertHasFirstName(tags)
        self.assertHasPatronymic(tags)

    def test_normalize_name_masc(self):
        self._check_normalized(['Name', 'accs', 'masc'], 'masc')

    def test_normalize_patr_masc(self):
        self._check_normalized(['Patr', 'accs', 'masc'], 'masc')

    def test_normalize_name_femn(self):
        self._check_normalized(['Name', 'accs', 'femn'], 'femn')

    def test_normalize_patr_femn(self):
        # A feminine patronymic is expected to normalize to masculine.
        self._check_normalized(['Patr', 'accs', 'femn'], 'masc')
def parse_lexemes(lexemes_txt):
    """
    Split a lexeme fixture text into a list of lexeme chunks.

    Lines that start with "#" are comments and are removed; the
    remaining text is split on blank lines. Chunks that contain only
    whitespace are discarded: they appear when the input is empty or
    comment-only, or when a removed comment had blank lines on both
    sides, and an empty lexeme cannot be tested.
    """
    without_comments = "".join(
        line for line in lexemes_txt.strip().splitlines(True)
        if not line.startswith("#")
    )
    return [
        chunk for chunk in without_comments.split("\n\n")
        if chunk.strip()
    ]


def get_lexeme_words(lexeme):
    """
    Return the whitespace-separated words of ``lexeme`` as a tuple.

    If the first word starts with "XFAIL", the calling test is marked
    as an expected failure via ``pytest.xfail()`` instead.
    """
    lexeme_words = tuple(lexeme.split())
    if lexeme_words[0].startswith('XFAIL'):
        pytest.xfail()
    return lexeme_words


def parse_full_lexeme(lexeme):
    """
    Parse a lexeme written one form per line.

    Each line is split on its first run of whitespace, giving
    ``[word, tag]`` lists; the result is a list of them.
    """
    forms = lexeme.strip().splitlines()
    return [form.split(None, 1) for form in forms]
NOUN,anim,masc plur,ablt +котах NOUN,anim,masc plur,loct + +# =========== adverb +театрально ADVB + +по-театральному ADVB + +# =========== pronoun with a particle +он-то NPRO,masc,3per,Anph sing,nomn +его-то NPRO,masc,3per,Anph sing,gent +него-то NPRO,masc,3per,Anph sing,gent,Af-p +ему-то NPRO,masc,3per,Anph sing,datv +нему-то NPRO,masc,3per,Anph sing,datv,Af-p +его-то NPRO,masc,3per,Anph sing,accs +него-то NPRO,masc,3per,Anph sing,accs,Af-p +им-то NPRO,masc,3per,Anph sing,ablt +ним-то NPRO,masc,3per,Anph sing,ablt,Af-p +нём-то NPRO,masc,3per,Anph sing,loct,Af-p + +# ========== initials +И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,nomn +И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,gent +И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,datv +И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,accs +И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,ablt +И NOUN,anim,masc,Sgtm,Name,Fixd,Abbr sing,loct + +И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,nomn +И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,gent +И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,datv +И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,accs +И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,ablt +И NOUN,anim,femn,Sgtm,Name,Fixd,Abbr sing,loct + +И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,nomn +И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,gent +И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,datv +И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,accs +И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,ablt +И NOUN,anim,masc,Sgtm,Patr,Fixd,Abbr sing,loct +И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,nomn +И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,gent +И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,datv +И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,accs +И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,ablt +И NOUN,anim,femn,Sgtm,Patr,Fixd,Abbr sing,loct +""") -def _parse_lexeme(lexeme): - lexeme_words = tuple(lexeme.split()) - if lexeme_words[0].startswith('XFAIL'): - pytest.xfail() - return lexeme_words # ============ Tests: @@ -105,9 +180,9 @@ def _parse_lexeme(lexeme): 
@pytest.mark.parametrize("lexeme", LEXEMES_FULL)
def test_full_lexemes(lexeme):
    """
    Every form listed in a full lexeme must produce that exact lexeme
    (lowercased words with their tags) from at least one of its parses.
    """
    forms = parse_full_lexeme(lexeme)
    expected = [(form_word.lower(), form_tag) for form_word, form_tag in forms]
    for word, _tag in forms:
        assert_has_full_lexeme(word, expected)


def assert_has_full_lexeme(word, forms):
    """
    Raise AssertionError unless some parse of ``word`` has a lexeme
    whose (word, tag string) pairs equal ``forms`` exactly, in order.
    """
    found = any(
        [(f.word, str(f.tag)) for f in p.lexeme] == forms
        for p in morph.parse(word)
    )
    if not found:
        raise AssertionError("Word %s doesn't have lexeme %s" % (word, forms))