In [1]:
import re
import pymystem3
import pymorphy2
import numpy as np
import opencorpora

from collections import defaultdict
from abc import ABCMeta, abstractmethod
from operator import itemgetter, attrgetter

# Opencorpora corpus

In [2]:
# Load the annotated corpus from an XML file located one directory above
# the notebook; `corpus.docs` is sampled further down for the evaluation.
corpus = opencorpora.load('../annot.opcorpora.xml')

# Analyzers

In [3]:
class Analyzer(metaclass=ABCMeta):
    """Common interface for the morphological analyzers compared in this notebook.

    Concrete subclasses must implement ``parse``; the class itself cannot be
    instantiated because ``parse`` is abstract.
    """

    @abstractmethod
    def parse(self, sentence):
        """Analyze a sentence given as a single string.

        Args:
            sentence: text of the sentence to analyze.

        Returns:
            In subclasses, a list of ``(lemma, pos_candidates)`` pairs, one per
            analyzed token, where ``pos_candidates`` is a list of POS tags.
            The abstract implementation itself returns ``None``.
        """
        pass

## Mystem

In [4]:
class MystemAnalyzer(Analyzer):
    """Analyzer backed by Mystem through ``pymystem3``.

    ``parse`` yields one ``(lemma, pos_candidates)`` pair for every item of
    Mystem's output that carries an ``'analysis'`` key; items without that key
    are dropped.
    """

    # Mystem POS tag -> list of POS tags it is accepted as when the prediction
    # is compared with the gold tag (the values look like OpenCorpora-style
    # tags such as NOUN / ADJF -- TODO confirm against the corpus tagset).
    # An empty list means "never counted as a match".
    POS = {
        'A': ['ADJF', 'ADJS'],
        'ADV': ['ADVB'],
        'ADVPRO': ['ADVB'],
        'ANUM': ['ADJF', 'ADJS'],
        'APRO': ['ADJF', 'ADJS'],
        'COM': [],
        'CONJ': ['CONJ'],
        'INTJ': ['INTJ'],
        'NUM': ['NUMR'],
        'PART': ['PRCL'],
        'PR': ['PREP'],
        'S': ['NOUN'],
        'SPRO': ['NPRO'],
        'V': ['VERB', 'INFN']
    }

    def __init__(self):
        self._analyzer = pymystem3.Mystem()
        # Leading run of Latin letters/underscores of the 'gr' string; that
        # prefix is what gets looked up in POS.
        self._re_pos = re.compile('[a-zA-Z_]*')

    @staticmethod
    def _not_empty(words):
        """Return the items of ``words`` that contain an ``'analysis'`` key."""
        return list(filter(lambda it: 'analysis' in it, words))

    @staticmethod
    def _get_parse(words):
        """Pair each item's text with its first analysis.

        Returns an iterator of ``(text, analysis)`` where ``analysis`` is the
        first element of ``item['analysis']`` or ``None`` when that list is
        empty.
        """
        return zip(map(itemgetter('text'), words),
                   map(lambda it: it['analysis'][0] if it['analysis'] else None, words))

    def _get_pos(self, parse):
        """Map the POS prefix of ``parse['gr']`` to the list of accepted tags.

        Returns an empty list when the prefix is not a key of ``POS`` (this
        includes the empty prefix, which the regex can match), so an
        unexpected tag counts as a miss instead of raising ``KeyError`` and
        aborting the whole evaluation.
        """
        tag = self._re_pos.match(parse['gr']).group(0)
        return self.POS.get(tag, [])

    @staticmethod
    def _get_lexema(parse):
        """Return the lemma stored under ``'lex'`` in a single analysis."""
        return parse['lex']

    def parse(self, sentence):
        """Analyze ``sentence`` and return ``(lemma, pos_candidates)`` pairs.

        Args:
            sentence: text passed as-is to ``Mystem.analyze``.

        Returns:
            A list with one pair per item that has an ``'analysis'`` key.
            When the item's analysis list is empty, the pair is
            ``(original_text, [])``.
        """
        analyzed = self._not_empty(self._analyzer.analyze(sentence))
        return [
            (self._get_lexema(parse), self._get_pos(parse)) if parse else (text, [])
            for text, parse in self._get_parse(analyzed)
        ]

## Pymorphy2

In [5]:
class MorphAnalyzer(Analyzer):
    """Analyzer backed by ``pymorphy2.MorphAnalyzer``.

    The sentence is split on whitespace and every word is analyzed on its
    own; only the first parse returned for each word is used.
    """

    def __init__(self):
        self._analyzer = pymorphy2.MorphAnalyzer()

    @staticmethod
    def _not_empty(words):
        """Lazily keep the items of ``words`` that contain an ``'analysis'`` key.

        Not called by ``parse`` in this class.
        """
        return filter(lambda word: 'analysis' in word, words)

    def parse(self, sentence):
        """Return ``(normal_form, [POS])`` for each whitespace-separated word.

        Args:
            sentence: text to analyze; tokenized with ``str.split()``.

        Returns:
            A list of pairs whose second element is a one-item list holding
            the ``tag.POS`` value of the word's first parse.
        """
        first_parses = (self._analyzer.parse(word)[0] for word in sentence.split())
        return [(best.normal_form, [best.tag.POS]) for best in first_parses]

# Evaluation

In [6]:
def process_sentence(sentence):
    """Extract the surface words and gold ``(lemma, tag)`` pairs of a sentence.

    Tokens whose ``parse.grammemes`` contain ``'PNCT'`` are skipped.

    Args:
        sentence: object exposing ``tokens``; each token needs ``source`` and
            ``parse`` with ``lemma`` and an indexable ``grammemes``.

    Returns:
        ``(words, parses)`` -- two lists of equal length. ``words`` holds each
        kept token's ``source``; ``parses`` holds ``(lemma, grammemes[0])``.
    """
    kept = [token for token in sentence.tokens
            if 'PNCT' not in token.parse.grammemes]
    words = [token.source for token in kept]
    # The first grammeme is taken as the token's gold tag.
    parses = [(token.parse.lemma, token.parse.grammemes[0]) for token in kept]
    return words, parses

def sentences(docs):
    """Yield every sentence of every document, in document order.

    Args:
        docs: iterable of objects exposing an iterable ``sentences`` attribute.
    """
    for document in docs:
        yield from document.sentences

def evaluate(docs, analyzers):
    """Score analyzers on lemma and POS agreement with the gold annotation.

    For every sentence the non-punctuation words are joined with spaces,
    lower-cased and handed to each analyzer. Predictions are compared with the
    gold parses position by position.

    Args:
        docs: iterable of documents, consumed through ``sentences``.
        analyzers: iterable of objects with ``parse(text)`` returning a
            sequence of ``(lemma, pos_candidates)`` items.

    Returns:
        ``(lemma_precision, pos_precision)`` -- two ``defaultdict(int)``
        keyed by analyzer class name. Each value is the number of hits divided
        by the total number of gold words; analyzers with no hits have no key
        (reading one yields 0).
    """
    total_words = 0
    lemma_hits = defaultdict(int)
    pos_hits = defaultdict(int)

    for sentence in sentences(docs):
        words, gold_parses = process_sentence(sentence)
        text = ' '.join(words).lower()
        total_words += len(gold_parses)

        for analyzer in analyzers:
            predicted = analyzer.parse(text)
            # NOTE: zip stops at the shorter sequence, so when an analyzer
            # returns a different number of items than there are gold words
            # the pairs may be misaligned and the surplus is never compared.
            for gold, guess in zip(gold_parses, predicted):
                lemma_matches = gold[0] == guess[0]
                if lemma_matches:
                    lemma_hits[analyzer.__class__.__name__] += 1

                pos_matches = gold[1] in guess[1]
                if pos_matches:
                    pos_hits[analyzer.__class__.__name__] += 1

    # Turn raw hit counts into fractions of all gold words (in place).
    for hits in (lemma_hits, pos_hits):
        for name in hits:
            hits[name] = hits[name] / total_words

    return lemma_hits, pos_hits

In [7]:
# Sample a fixed-size subset of documents without replacement. A dedicated,
# seeded RandomState keeps the sample -- and therefore the scores reported
# below -- identical on every Restart & Run All, without touching NumPy's
# global random state.
SEED = 42
N_DOCS = 200
docs = np.random.RandomState(SEED).choice(corpus.docs, size=N_DOCS, replace=False)

In [8]:
# Instantiate both analyzers once and score them on the sampled documents.
# The returned mappings are keyed by analyzer class name.
analyzers = [MystemAnalyzer(), MorphAnalyzer()]
lemma_precision, pos_precision = evaluate(docs, analyzers)

In [9]:
# Print each analyzer's lemma score as a percentage with one decimal place.
print('Lemma precision:')
for analyzer in analyzers:
    name = type(analyzer).__name__
    percent = 100 * lemma_precision[name]
    print('{}: {:0.1f}%'.format(name, percent))

Lemma precision:
MystemAnalyzer: 63.2%
MorphAnalyzer: 81.5%


In [10]:
# Print each analyzer's POS score as a percentage with one decimal place.
print('POS precision:')
for analyzer in analyzers:
    name = type(analyzer).__name__
    percent = 100 * pos_precision[name]
    print('{}: {:0.1f}%'.format(name, percent))

POS precision:
MystemAnalyzer: 72.3%
MorphAnalyzer: 87.3%
