This solution is based on the notebook by IlyaGusev:
https://github.com/IlyaGusev/nlp-practice/blob/master/rupos.ipynb

In [1]:
from tqdm import tqdm

In [2]:
# Locations of the input data files.
from os.path import join

PATH_DATA = 'data'
# Both corpora live directly inside PATH_DATA.
TRAIN_FILENAME, TEST_FILENAME = (
    join(PATH_DATA, basename) for basename in ('train.csv', 'test.csv')
)

In [3]:
# Reading the data files.
from collections import namedtuple

# One annotated training token: surface form, part-of-speech tag, grammeme string.
WordForm = namedtuple('WordForm', ['word', 'pos', 'gram'])

def get_sentences(filename, is_train):
    """Lazily yield sentences from a tab-separated corpus file.

    Sentences are separated by blank lines; consecutive blank lines do not
    produce empty sentences. The word is taken from the third tab-separated
    column. When ``is_train`` is true the fourth column is split on ``'#'``
    into a POS tag and a grammeme string and each token is yielded as a
    ``WordForm``; otherwise each token is the bare word string.

    :param filename: path of the UTF-8 encoded corpus file.
    :param is_train: whether the file carries the ``pos#gram`` column.
    :return: generator of lists (one list per sentence).
    """
    with open(filename, 'r', encoding='utf-8') as source:
        current = []
        for raw_line in source:
            stripped = raw_line.strip()
            if not stripped:
                # Sentence boundary: flush whatever has been collected so far.
                if current:
                    yield current
                    current = []
                continue
            columns = stripped.split('\t')
            word = columns[2]
            if not is_train:
                current.append(word)
                continue
            pos, gram = columns[3].split('#')
            current.append(WordForm(word, pos, gram))
        # The file may not end with a blank line.
        if current:
            yield current

In [4]:
# Materialise both corpora in memory: `train` holds lists of WordForm,
# `test` holds lists of plain word strings.
train, test = (
    list(get_sentences(path, labelled))
    for path, labelled in ((TRAIN_FILENAME, True), (TEST_FILENAME, False))
)

In [5]:
# Класс для удобной векторизации грамматических значений.
import jsonpickle
import os
from collections import defaultdict
from typing import Dict, List, Set

def process_gram_tag(gram: str):
    """Normalise a ``'|'``-separated grammeme string by sorting its parts.

    Surrounding whitespace is stripped first, so two strings listing the same
    grammemes in a different order normalise to the same value.
    """
    parts = gram.strip().split('|')
    parts.sort()
    return '|'.join(parts)


def get_empty_category():
    """Return a fresh value set for a new category, pre-seeded with the unknown value.

    It is a named module-level function because it serves as the
    ``defaultdict`` factory of ``GrammemeVectorizer.all_grammemes``.
    """
    seed = set()
    seed.add(GrammemeVectorizer.UNKNOWN_VALUE)
    return seed


class GrammemeVectorizer(object):
    """Registry of ``POS#grammemes`` tags with an index and a 0/1 vector for each.

    Tags are registered with :meth:`add_grammemes`; once all tags are known,
    :meth:`init_possible_vectors` builds one vector per tag. A vector has one
    slot per ``category=value`` pair (see :meth:`get_ordered_grammemes`); a
    category the tag does not mention activates that category's ``Unknown``
    slot. If ``dump_filename`` already exists when the object is constructed,
    the state is restored from it.
    """
    UNKNOWN_VALUE = 'Unknown'

    def __init__(self, dump_filename: str):
        """Create an empty vectorizer, or restore it from ``dump_filename`` if that file exists."""
        # category name -> set of values seen for it (each set starts with UNKNOWN_VALUE).
        self.all_grammemes = defaultdict(get_empty_category)  # type: Dict[str, Set]
        # vectors[i] is the vector of the tag whose index is i.
        self.vectors = []  # type: List[List[int]]
        # 'POS#sorted|grammemes' -> index, assigned in order of first appearance.
        self.name_to_index = {}  # type: Dict[str, int]
        self.dump_filename = dump_filename  # type: str
        if os.path.exists(self.dump_filename):
            self.load()

    def add_grammemes(self, pos_tag: str, gram: str) -> int:
        """Register a tag (if new) and return its index.

        :param pos_tag: part-of-speech tag.
        :param gram: ``'|'``-separated ``category=value`` pairs, or ``'_'`` for none.
        :return: index of the normalised ``pos_tag#gram`` name.
        """
        gram = process_gram_tag(gram)
        vector_name = pos_tag + '#' + gram
        if vector_name not in self.name_to_index:
            self.name_to_index[vector_name] = len(self.name_to_index)
            self.all_grammemes['POS'].add(pos_tag)
            # '_' denotes "no grammemes".
            for grammeme in (gram.split('|') if gram != '_' else []):
                category, value = grammeme.split('=')
                self.all_grammemes[category].add(value)
        return self.name_to_index[vector_name]

    def init_possible_vectors(self) -> None:
        """(Re)build ``vectors`` for every registered tag, ordered by tag index."""
        self.vectors = []
        for grammar_val, _ in sorted(self.name_to_index.items(), key=lambda x: x[1]):
            pos_tag, grammemes = grammar_val.split('#')
            grammemes = grammemes.split('|') if grammemes != '_' else []
            self.vectors.append(self.__build_vector(pos_tag, grammemes))

    def _zero_vector(self) -> List[int]:
        """Return an all-zero vector of the current vector width.

        Falls back to the number of known grammemes while ``vectors`` is still
        empty, so lookups do not fail with IndexError before initialisation.
        """
        width = len(self.vectors[0]) if self.vectors else self.grammemes_count()
        return [0] * width

    def get_vector(self, vector_name: str) -> List[int]:
        """Return the vector for ``vector_name``, or a zero vector if the name is unknown.

        A name that is registered but whose vector has not been built yet
        still raises IndexError.
        """
        if vector_name not in self.name_to_index:
            return self._zero_vector()
        return self.vectors[self.name_to_index[vector_name]]

    def get_vector_by_index(self, index: int) -> List[int]:
        """Return the vector stored at ``index``, or a zero vector if it is out of range."""
        if 0 <= index < len(self.vectors):
            return self.vectors[index]
        return self._zero_vector()

    def get_ordered_grammemes(self) -> List[str]:
        """Return every ``category=value`` pair, sorted by category and then by value."""
        flat = []
        for category, values in sorted(self.all_grammemes.items(), key=lambda x: x[0]):
            flat.extend(category + '=' + value for value in sorted(values))
        return flat

    def save(self) -> None:
        """Serialise the whole object to ``dump_filename`` with jsonpickle."""
        with open(self.dump_filename, 'w', encoding='utf-8') as f:
            # encode() takes only the object here; its second positional
            # parameter is the `unpicklable` flag, not an output stream.
            f.write(jsonpickle.encode(self))

    def load(self):
        """Replace this object's state with the one stored in ``dump_filename``."""
        # NOTE(review): jsonpickle.decode can construct arbitrary objects named
        # in the file - only load dumps produced by this notebook.
        with open(self.dump_filename, 'r', encoding='utf-8') as f:
            vectorizer = jsonpickle.decode(f.read())
            self.__dict__.update(vectorizer.__dict__)

    def size(self) -> int:
        """Number of built vectors."""
        return len(self.vectors)

    def grammemes_count(self) -> int:
        """Number of ``category=value`` pairs currently known."""
        return len(self.get_ordered_grammemes())

    def is_empty(self) -> bool:
        """True while no vectors have been built or loaded."""
        return len(self.vectors) == 0

    def get_name_by_index(self, index):
        """Return the tag name registered under ``index``.

        :raises KeyError: if no tag has that index.
        """
        # Scan instead of building a complete reverse dict on every call.
        for name, name_index in self.name_to_index.items():
            if name_index == index:
                return name
        raise KeyError(index)

    def get_index_by_name(self, name):
        """Return the index of ``name`` (``POS#grammemes``, grammemes in any order).

        :raises KeyError: if the normalised name was never registered.
        """
        parts = name.split('#')
        return self.name_to_index[parts[0] + '#' + process_gram_tag(parts[1])]

    def __build_vector(self, pos_tag: str, grammemes: List[str]) -> List[int]:
        """Build the 0/1 vector for one tag.

        For each category (sorted by name) exactly the slot of the tag's value
        is set; categories absent from the tag select ``UNKNOWN_VALUE``.
        """
        gram_tags = {}
        for pair in grammemes:
            parts = pair.split('=')
            gram_tags[parts[0]] = parts[1]
        gram_tags['POS'] = pos_tag
        vector = []
        for category, values in sorted(self.all_grammemes.items(), key=lambda x: x[0]):
            value_correct = gram_tags.get(category, GrammemeVectorizer.UNKNOWN_VALUE)
            vector.extend(1 if value == value_correct else 0 for value in sorted(values))
        return vector

In [6]:
from pymorphy2 import MorphAnalyzer
from russian_tagsets import converters

# Shared pymorphy2 analyzer; morph.parse(word) supplies the candidate parses used below.
morph = MorphAnalyzer()
# Tag converter requested for the 'opencorpora-int' -> 'ud14' tagset pair.
to_ud = converters.converter('opencorpora-int', 'ud14')

def convert_from_opencorpora_tag(tag, text):
    """Convert an analyzer tag to a ``(pos, gram)`` pair via ``to_ud``.

    The converter output is split on whitespace and must consist of exactly
    two parts, otherwise the unpacking raises ValueError.
    """
    pos, gram = to_ud(str(tag), text).split()
    return pos, gram

def fill_all_variants(word, vectorizer):
    """Register the tag of every parse that ``morph`` proposes for ``word`` in ``vectorizer``."""
    for candidate in morph.parse(word):
        converted = convert_from_opencorpora_tag(candidate.tag, candidate.word)
        vectorizer.add_grammemes(*converted)

# Input-side vectorizer: covers every tag the analyzer can propose for any
# word of either corpus. It is rebuilt only when no dump was loaded.
vectorizer = GrammemeVectorizer('vectorizer.json')
if vectorizer.is_empty():
    # (progress message, corpus, how to obtain the word string from an item)
    corpora = (
        ('Add train sentences to vectorizer', train, lambda form: form.word),
        ('Add test sentences to vectorizer', test, lambda word: word),
    )
    for message, corpus, to_word in corpora:
        print(message)
        for sentence in tqdm(corpus):
            for item in sentence:
                fill_all_variants(to_word(item), vectorizer)
    print('Init vectors in vectorizer')
    vectorizer.init_possible_vectors()
    vectorizer.save()

In [7]:
# Output-side vectorizer: assigns a class index to every gold tag that occurs
# in the training corpus. It is rebuilt only when no dump was loaded.
vectorizer_output = GrammemeVectorizer('vectorizer_output.json')
if vectorizer_output.is_empty():
    for sentence in tqdm(train):
        for form in sentence:
            # Use the token's own grammemes; a bare `gram` is not defined in
            # this cell and would be an unrelated leftover global at best.
            vectorizer_output.add_grammemes(form.pos, form.gram)
    vectorizer_output.init_possible_vectors()
    vectorizer_output.save()

In [8]:
# Получение признаков для конкретного контекста.
def get_context_features(i, parse_sentence, context_len):
    """Build the flat feature vector for the token at position ``i``.

    The window spans ``(context_len - 1) // 2`` tokens to the left and
    ``context_len // 2`` tokens to the right of ``i``. Each token in the window
    contributes its grammeme vector from the global ``vectorizer``; window
    positions that fall outside the sentence contribute zero blocks.

    :param i: index of the target token within ``parse_sentence``.
    :param parse_sentence: one parse per token; each parse exposes ``.tag`` and ``.word``.
    :param context_len: total window size in tokens.
    :return: list of length ``context_len * vectorizer.grammemes_count()``.
    """
    # Take the length from the argument itself rather than from whatever
    # `sentence` variable the calling cell happens to leave in globals.
    sentence_len = len(parse_sentence)
    # grammemes_count() re-sorts all grammemes on every call, so compute it once.
    width = vectorizer.grammemes_count()
    left = i - (context_len - 1) // 2
    right = i + context_len // 2

    sample = []
    if left < 0:
        # Padding for window positions before the sentence start.
        sample.extend([0] * (width * -left))
    for parse in parse_sentence[max(left, 0): min(right + 1, sentence_len)]:
        pos, gram = convert_from_opencorpora_tag(parse.tag, parse.word)
        sample.extend(vectorizer.get_vector(pos + '#' + process_gram_tag(gram)))
    if right > sentence_len - 1:
        # Padding for window positions past the sentence end.
        sample.extend([0] * (width * (right - sentence_len + 1)))
    assert len(sample) == context_len * width
    return sample

In [9]:
# Загрузка обучающей выборки.
from collections import Counter
import numpy as np
import os

# Window size (in tokens) used for the context features.
context_len = 5

TRAIN_SAMPLES_PATH = 'samples.npy'
ANSWERS_PATH = 'answers.npy'
if os.path.exists(TRAIN_SAMPLES_PATH) and os.path.exists(ANSWERS_PATH):
    # Both cache files are present: reuse them.
    samples = np.load(TRAIN_SAMPLES_PATH)
    answers = np.load(ANSWERS_PATH)
else:
    # One row per training token.
    n = sum(len(tokens) for tokens in train)
    samples = np.zeros((n, context_len * vectorizer.grammemes_count()), dtype='bool_')
    answers = np.zeros((n, ), dtype='int')
    index = 0
    for sentence in tqdm(train):
        # Only the first parse returned by the analyzer is used for each word.
        parse_sentence = [morph.parse(form.word)[0] for form in sentence]
        for i, form in enumerate(sentence):
            samples[index] = get_context_features(i, parse_sentence, context_len)
            gram = process_gram_tag(form.gram)
            # The label is the index of the gold tag in the output vectorizer.
            answers[index] = vectorizer_output.get_index_by_name(form.pos + '#' + gram)
            index += 1
    np.save(TRAIN_SAMPLES_PATH, samples)
    np.save(ANSWERS_PATH, answers)
# Number of distinct labels present, and how often each one occurs.
class_count = len(np.unique(answers))
class_distribution = Counter(answers)

In [10]:
# Summary of the training matrices followed by the per-class counts.
print(
    f'samples.shape = {samples.shape}',
    f'answers.shape = {answers.shape}',
    f'class_count = {class_count}',
    sep='\n',
)
print(class_distribution)

samples.shape = (850689, 310)
answers.shape = (850689,)
class_count = 581
Counter({9: 156227, 2: 78937, 5: 41767, 0: 30474, 1: 26688, 11: 16517, 30: 14928, 12: 14702, 38: 14560, 94: 11774, 49: 11393, 25: 10240, 3: 10017, 7: 9993, 54: 9489, 40: 9195, 20: 8873, 36: 8850, 53: 7927, 22: 7409, 47: 7359, 33: 7288, 88: 7044, 14: 7029, 17: 6773, 16: 6742, 8: 6735, 67: 6499, 55: 6332, 50: 6272, 23: 5806, 45: 5782, 69: 5716, 95: 5447, 52: 4986, 58: 4941, 26: 4327, 64: 4205, 19: 4101, 29: 3925, 72: 3819, 141: 3819, 18: 3808, 44: 3667, 162: 3655, 48: 3639, 4: 3512, 27: 3281, 119: 3262, 175: 3259, 106: 3201, 85: 3182, 37: 3176, 34: 3130, 32: 3127, 144: 2908, 90: 2806, 51: 2804, 184: 2597, 149: 2554, 131: 2551, 70: 2503, 122: 2501, 78: 2498, 128: 2489, 43: 2482, 99: 2365, 93: 2271, 142: 2257, 75: 2242, 82: 2240, 42: 2227, 83: 2126, 21: 2097, 60: 2094, 115: 2069, 86: 2034, 146: 1968, 98: 1882, 136: 1849, 41: 1842, 76: 1829, 143: 1715, 193: 1698, 73: 1671, 28: 1662, 191: 1615, 84: 1596, 102: 1515, 126

In [11]:
# # Oversampling
# CLASS_LIMIT = 1500

# pairs_sorted = sorted(zip(samples, answers), key=lambda pair: pair[1])
# samples_balanced, answers_balanced = [], []
# answer_previous = -1
# for sample, answer in tqdm(pairs_sorted):
#     if answer != answer_previous:
#         answer_previous = answer
#         count_without_repeat = min(CLASS_LIMIT, class_distribution[answer])
#         count_repeat = CLASS_LIMIT // count_without_repeat
#         count_remainder = CLASS_LIMIT % count_without_repeat
#         entry_count = 0
#     if entry_count < CLASS_LIMIT:
#         repeats = count_repeat
#         if count_remainder > 0:
#             count_remainder -= 1
#             repeats += 1
#         samples_balanced.extend([sample] * repeats)
#         answers_balanced.extend([answer] * repeats)
#         entry_count += repeats
# samples, answers = np.array(samples_balanced), np.array(answers_balanced)
# indices = np.random.permutation(samples.shape[0])
# samples, answers = samples[indices], answers[indices]
# print(samples.shape)
# print(answers.shape)

In [12]:
# Выбор классификатора
from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.regularizers import l2

# Fully connected classifier: four ReLU layers (dropout after the second and
# the fourth), followed by a softmax over the classes.
inp = Input(shape=(samples.shape[1],))

# (units, dropout rate applied right after the layer, or None for no dropout)
hidden_layers = ((1024, None), (512, 0.3), (512, None), (256, 0.1))
x = inp
for units, dropout_rate in hidden_layers:
    x = Dense(units, activation='relu')(x)
    if dropout_rate is not None:
        x = Dropout(dropout_rate)(x)

x = Dense(class_count, activation='softmax')(x)

clf = Model(inputs=[inp], outputs=[x])
clf.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
clf.summary()

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 310)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              318464    
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dense_4 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
__________

In [13]:
# Build the feature matrix for the test set, or load it from the cache.
TEST_SAMPLES_PATH = 'test_samples.npy'
ANSWERS_PATH = 'answers.npy'
if os.path.exists(TEST_SAMPLES_PATH):
    test_samples = np.load(TEST_SAMPLES_PATH)
else:
    # One row per test token.
    n = sum(len(tokens) for tokens in test)
    test_samples = np.zeros((n, context_len * vectorizer.grammemes_count()), dtype='bool_')
    index = 0
    for sentence in tqdm(test):
        # Only the first parse returned by the analyzer is used for each word.
        parse_sentence = [morph.parse(word)[0] for word in sentence]
        for i, word in enumerate(sentence):
            test_samples[index] = get_context_features(i, parse_sentence, context_len)
            index += 1
    np.save(TEST_SAMPLES_PATH, test_samples)

In [14]:
from keras.utils import to_categorical
import itertools

# Training inputs as floats and labels as one-hot rows.
X = samples.astype(np.float64)
y = to_categorical(answers, num_classes=class_count)


def repeat_func(func, times=None, *args):
    """Return an iterator that calls ``func(*args)`` again for every item.

    With ``times=None`` the iterator never ends; otherwise it produces exactly
    ``times`` results.
    """
    if times is None:
        argument_stream = itertools.repeat(args)
    else:
        argument_stream = itertools.repeat(args, times)
    return itertools.starmap(func, argument_stream)


def repeat_infinitely(func, *args):
    """Chain the iterables returned by endlessly repeated ``func(*args)`` calls.

    ``func`` is called again each time the previous iterable is exhausted, so
    the resulting iterator cycles through freshly created iterables forever.
    """
    fresh_iterables = itertools.starmap(func, itertools.repeat(args))
    return itertools.chain.from_iterable(fresh_iterables)


def collect_batches(gen, batch_size=32, randomize=True, probability=0.5):
    """Group ``(x, y)`` pairs from ``gen`` into batches of numpy arrays.

    :param gen: iterator yielding ``(x, y)`` pairs.
    :param batch_size: number of pairs per yielded batch.
    :param randomize: when true, each pair is kept only with ``probability``.
    :param probability: keep probability used when ``randomize`` is true.
    :return: generator of ``(np.array(xs), np.array(ys))`` tuples. If ``gen``
        runs out, the pairs collected so far are yielded as a final, shorter
        batch and the generator stops.
    """
    while True:
        batch_x, batch_y = [], []
        while len(batch_x) < batch_size:
            try:
                x, y = next(gen)
            except StopIteration:
                # A StopIteration escaping a generator body becomes a
                # RuntimeError (PEP 479), so end explicitly and keep the tail.
                if batch_x:
                    yield np.array(batch_x), np.array(batch_y)
                return
            # The random draw happens only when randomize is enabled.
            if not randomize or np.random.rand() < probability:
                batch_x.append(x)
                batch_y.append(y)
        yield np.array(batch_x), np.array(batch_y)


def init_data_generator(xs, ys):
    """Return an endless batch generator over the aligned ``(xs, ys)`` pairs.

    The pairs are replayed in their original order again and again; no random
    skipping is applied and the default batch size of ``collect_batches`` is used.
    """
    endless_pairs = repeat_infinitely(zip, xs, ys)
    batches = collect_batches(endless_pairs, randomize=False)
    return batches

# Endless stream of (features, one-hot labels) batches used for training.
gen_data = init_data_generator(X, y)

In [15]:
# Обучение классификатора.

from keras.callbacks import ReduceLROnPlateau, LambdaCallback

# Training length: batches drawn from the generator per epoch, and number of epochs.
STEPS_PER_EPOCH = 10000
EPOCHS = 10
clf.fit_generator(gen_data, steps_per_epoch=STEPS_PER_EPOCH, epochs=EPOCHS)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fb5f6b6320>

In [16]:
# Predictions.
test_samples_float = test_samples.astype(np.float64)

# NOTE: from here on `answers` holds the predicted class index of every test
# token, replacing the training labels that carried this name earlier.
answers = []
batch_size = 1000
# Predict in slices to bound memory use. Stepping by batch_size also covers the
# final, shorter slice and never asks the model to predict on an empty array.
for start in range(0, len(test_samples_float), batch_size):
    # One forward pass per slice.
    predictions = clf.predict(test_samples_float[start:start + batch_size])
    # Most probable class of each row.
    answers.extend(np.argmax(predictions, axis=1))

print(len(answers))
print(answers[0])

217794
0


In [17]:
# Save the submission: one "Id,Prediction" row per test token.
# Invert the tag -> index mapping once rather than once per written row.
index_to_name = {tag_index: name for name, tag_index in vectorizer_output.name_to_index.items()}
with open('subm.csv', 'w') as f:
    f.write('Id,Prediction\n')
    for index, answer in enumerate(answers):
        # Same KeyError as get_name_by_index if a predicted index has no tag.
        f.write(str(index) + ',' + index_to_name[answer] + '\n')