Вариант 2. Написать pos-тегер (на основании pymorphy) внутри nltk

In [1]:
from nltk.tag import SequentialBackoffTagger
import nltk
import pymorphy2
import opencorpora
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
class MorphTagger(SequentialBackoffTagger):
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        self.morph = pymorphy2.MorphAnalyzer()
            
    def choose_tag(self, tokens, index, history):
        word = tokens[index]
        return self.morph.parse(word)[0].tag.POS

In [3]:
mt = MorphTagger()

In [4]:
mt.tag(['Это', 'тестовое', 'предложение'])

[('Это', 'PRCL'), ('тестовое', 'ADJF'), ('предложение', 'NOUN')]

Чтобы оценить точность тегера можно использовать корпус, на котором он основан (имея в виду, что на этих данных результаты, вероятно, будут искажены в большую сторону).

In [5]:
!opencorpora download

Creating annot.opcorpora.xml from http://opencorpora.org/files/export/annot/annot.opcorpora.xml.bz2
.............................................................................................................................
Done.


In [6]:
corpus = opencorpora.load('annot.opcorpora.xml')

In [7]:
def prepare_sent(sentence):
    result = []
    for token in sentence.tokens:
        result.append((token.source, token.parses[0].grammemes[0]))
    result = [token for token in result if token[1] != 'PNCT']
    return result

In [8]:
data = [prepare_sent(sentence) for sentence in corpus.sentences[:1000]]

In [9]:
mt.evaluate(data)

0.9032584738190266

In [10]:
test_tagged = mt.tag_sents([[token for token, tag in sent] for sent in data])
gold = [str(tag) for sentence in data for token, tag in sentence]
pred = [str(tag) for sentence in test_tagged for token, tag in sentence]

In [12]:
print(classification_report(gold, pred))

              precision    recall  f1-score   support

        ADJF       0.95      0.90      0.93      2782
        ADJS       0.68      0.65      0.67       112
        ADVB       0.77      0.87      0.82       625
        COMP       0.73      0.77      0.75        61
        CONJ       0.93      0.90      0.91      1506
        GRND       0.97      0.95      0.96        62
        INFN       0.99      0.99      0.99       383
        INTJ       0.08      0.14      0.10        22
        LATN       0.00      0.00      0.00        99
        NOUN       0.94      0.98      0.96      5979
        NPRO       0.76      0.84      0.80       347
        NUMB       0.00      0.00      0.00       224
        NUMR       0.97      0.70      0.81        56
        None       0.00      0.00      0.00         0
        PRCL       0.68      0.82      0.75       495
        PRED       0.92      0.93      0.92        59
        PREP       0.98      0.99      0.99      1977
        PRTF       0.93    