In [1]:
# %pip install russian-tagsets
# %pip install sklearn-crfsuite
# %pip install pymorphy2
# %pip install conllu
# %pip install scikit-learn==0.23.2

In [2]:
from russian_tagsets import converters
from pymorphy2 import MorphAnalyzer
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
import conllu

In [3]:
# Shared pymorphy2 analyzer; process_sentence() reads it as the global `m`.
m = MorphAnalyzer()
# Converter from the OpenCorpora tagset to UD 2.0 tags; process_sentence()
# splits its string output into a UPOS part and a features part.
to_ud = converters.converter('opencorpora-int', 'ud20')

In [4]:
# Directory of the UD_Russian-SynTagRus treebank (relative path).
# Kept as a plain string because the split file names are appended with `+`.
ud_path = "Universal Dependencies 2.12/ud-treebanks-v2.12/ud-treebanks-v2.12/UD_Russian-SynTagRus"

In [5]:
# Open the training split and wrap the handle in conllu's incremental parser;
# the resulting iterator is consumed by the training-data loop further down.
# NOTE(review): the handle is not closed in this cell — it presumably has to
# stay open until the iterator has been consumed; confirm it is closed afterwards.
infile = open(ud_path+"/ru_syntagrus-ud-train.conllu", "rt", encoding="UTF8")
train_conllu_iter = conllu.parse_incr(infile)

# Тренирую модель

In [26]:
def process_sentence(sent, top_n=3, analyzer=None, converter=None):
    """Run morphological analysis on every token and convert the tags to UD.

    Args:
        sent: iterable of tokens. A token is either a plain string (the word
            form itself) or a mapping exposing the word form under the
            ``'form'`` key (e.g. a token produced by the conllu parser).
        top_n: number of leading parses (in the order the analyzer returns
            them) to keep for each token.
        analyzer: object with a ``parse(word)`` method returning parses that
            expose ``.tag`` and ``.normal_form``. Defaults to the
            notebook-level ``m``.
        converter: callable turning an OpenCorpora tag string into a
            whitespace-separated ``"<UPOS> <FEATS>"`` string. Defaults to the
            notebook-level ``to_ud``.

    Returns:
        A list with one ``(word, parses)`` tuple per token, where ``parses``
        is a list of at most ``top_n`` dicts with the keys ``'lemma'``,
        ``'upos'`` and ``'feats'``.

    Raises:
        KeyError: if a non-string token has no ``'form'`` entry.
    """
    # Resolve the notebook-level defaults lazily so the function can also be
    # called with explicit (e.g. stubbed) collaborators.
    if analyzer is None:
        analyzer = m
    if converter is None:
        converter = to_ud

    result = []
    for token in sent:
        # Explicit type check instead of a bare `except:`: raw strings are
        # used as-is, everything else must provide the 'form' field.
        word = token if isinstance(token, str) else token['form']

        ud_parses = []
        for parse in analyzer.parse(word)[:top_n]:
            fields = converter(str(parse.tag)).split()
            ud_parses.append({
                'lemma': parse.normal_form,
                'upos': fields[0],
                # Fall back to the CoNLL-U "empty" marker when the converter
                # returns a bare POS tag without a features part.
                'feats': fields[1] if len(fields) > 1 else '_',
            })
        result.append((word, ud_parses))
    return result

In [8]:
def create_features(processed_sentence):
    """Turn a processed sentence into one CRF feature dict per token.

    Args:
        processed_sentence: list of ``(word, parses)`` tuples where each
            parse is a dict with at least ``'lemma'`` and ``'upos'``.

    Returns:
        A list of dicts, one per token. Keys are
        ``"<offset>_<parse rank>_lemma"`` / ``"<offset>_<parse rank>_upos"``
        for the token itself (offset 0) and its immediate left (-1) and
        right (1) neighbours, when those exist. The ``'feats'`` value of a
        parse is not used.
    """
    n_tokens = len(processed_sentence)
    features_per_token = []
    for center in range(n_tokens):
        features = {}
        for offset in (-1, 0, 1):
            neighbour = center + offset
            # Skip window positions that fall outside the sentence.
            if neighbour < 0 or neighbour >= n_tokens:
                continue
            _, candidates = processed_sentence[neighbour]
            for rank, candidate in enumerate(candidates):
                key = f'{offset}_{rank}_'
                features[key + 'lemma'] = candidate['lemma']
                features[key + 'upos'] = candidate['upos']
        features_per_token.append(features)
    return features_per_token

In [9]:
# Per-sentence feature sequences and gold label sequences for the training split.
X_train, y_train = [], []

In [10]:
# Build per-token CRF features and gold UPOS labels for every training sentence.
# The training file opened earlier is closed once the lazy parser has been
# drained (or if processing fails), so the file handle is not leaked.
try:
    for sent in train_conllu_iter:
        sent_features = create_features(process_sentence(sent))
        sent_labels = [token['upos'] for token in sent]
        # Append both together so X_train and y_train stay aligned.
        X_train.append(sent_features)
        y_train.append(sent_labels)
finally:
    infile.close()

In [11]:
X_train[0]

[{'0_0_lemma': 'анкета',
  '0_0_upos': 'NOUN',
  '1_0_lemma': '.',
  '1_0_upos': 'PUNCT'},
 {'-1_0_lemma': 'анкета',
  '-1_0_upos': 'NOUN',
  '0_0_lemma': '.',
  '0_0_upos': 'PUNCT'}]

In [12]:
y_train[0]

['NOUN', 'PUNCT']

In [13]:
# Open the test split and wrap the handle in conllu's incremental parser;
# the resulting iterator is consumed by the test-data loop further down.
# NOTE(review): the handle is not closed in this cell — it presumably has to
# stay open until the iterator has been consumed; confirm it is closed afterwards.
testfile = open(ud_path+"/ru_syntagrus-ud-test.conllu", "rt", encoding="UTF8")
test_conllu_iter = conllu.parse_incr(testfile)

In [14]:
# Per-sentence feature sequences and gold label sequences for the test split.
X_test = []
y_test = []

In [15]:
# Build per-token CRF features and gold UPOS labels for every test sentence.
# The test file opened earlier is closed once the lazy parser has been
# drained (or if processing fails), so the file handle is not leaked.
try:
    for sent in test_conllu_iter:
        sent_features = create_features(process_sentence(sent))
        sent_labels = [token['upos'] for token in sent]
        # Append both together so X_test and y_test stay aligned.
        X_test.append(sent_features)
        y_test.append(sent_labels)
finally:
    testfile.close()

In [16]:
# Create the CRF model instance.
# Trained with L-BFGS; c1 and c2 are the L1 and L2 regularization coefficients
# (see the sklearn-crfsuite CRF documentation).
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [17]:
# for sent_features, sent_labels in zip(X_train, y_train):
#     for word_features, word_label in zip(sent_features, sent_labels):
#         if None in word_features.values() or word_label is None:
#             print("Found None value:", word_features, word_label)
#             break
#         break

In [18]:
# Train the model on the training features and gold UPOS labels.
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [19]:
# Predict labels for the test data (example).
y_pred = crf.predict(X_test)

In [20]:
# Per-label precision / recall / F1 on the test split, for every label the
# model saw during training.
# NOTE(review): the report includes a `_` label — presumably treebank lines that
# carry no UPOS (e.g. multiword-token ranges); confirm whether they should be
# filtered out before training and evaluation.
print(metrics.flat_classification_report(
    y_test, y_pred, labels=crf.classes_, digits=3
))



              precision    recall  f1-score   support

        NOUN      0.972     0.987     0.979     36568
       PUNCT      1.000     1.000     1.000     29463
         ADJ      0.949     0.964     0.956     14471
       PROPN      0.931     0.812     0.867      5883
         AUX      0.935     0.943     0.939      1518
        VERB      0.984     0.984     0.984     18146
         ADP      0.999     0.999     0.999     15062
         ADV      0.940     0.948     0.944      8085
       CCONJ      0.962     0.978     0.969      5736
        PART      0.933     0.898     0.916      4921
        PRON      0.956     0.953     0.954      8015
         DET      0.928     0.897     0.912      4094
       SCONJ      0.899     0.948     0.923      2992
         NUM      0.946     0.931     0.938      2528
           _      1.000     1.000     1.000       271
        INTJ      0.824     0.609     0.700        23
           X      0.368     0.292     0.326        48
         SYM      0.994    

# Снятие омонимии

In [33]:
def disambiguate_sentence(sentence, crf):
    """Predict a UPOS tag for each whitespace-separated word of a sentence.

    Args:
        sentence: raw text; it is tokenised with ``str.split()`` only.
        crf: fitted model exposing ``predict_single(features)``.

    Returns:
        A list of ``(word, predicted_label)`` tuples in sentence order.
    """
    words = sentence.split()
    features = create_features(process_sentence(words))
    labels = crf.predict_single(features)
    return list(zip(words, labels))

In [34]:
# Example input; disambiguate_sentence() tokenises it by splitting on whitespace.
sentence = "Мама мыла раму"

In [35]:
# Tag the example sentence with the trained CRF; the bare last expression
# displays the resulting (word, label) pairs.
disambiguated_sentence = disambiguate_sentence(sentence, crf)
disambiguated_sentence

[('Мама', 'NOUN'), ('мыла', 'VERB'), ('раму', 'NOUN')]