In [72]:
import numpy as np
import pandas as pd
import re
import tqdm
import matplotlib.pyplot as plt

import os
from pathlib import Path
import gensim.downloader
import pymorphy3

In [73]:
# Load the poems corpus from the CSV next to the notebook's parent directory and
# display it (pandas truncates the rendered frame on its own).
df_poems = pd.read_csv(filepath_or_buffer=Path('../data/poems_dataset.csv'))
df_poems

Unnamed: 0,author,epoch,title,part,text
0,Агнивцев,символизм,брат антонио,0,"В монастырской тихой келье,\nПозабывши о весел..."
1,Агнивцев,символизм,грузовик 1317,0,"Весь машинный свой век, каждый день по утрам\n..."
2,Агнивцев,символизм,грузовик 1317,1,"Грузовик № 1317.\nНо открылись фронты! О, услы..."
3,Агнивцев,символизм,мэри пикфорд,0,"В Америке где-то\nСудя по газетам,\nЕсть город..."
4,Агнивцев,символизм,бильбокэ,0,"К дофину Франции, в печали,\nСкользнув тайком,..."
...,...,...,...,...,...
54232,Яшин,соцреализм,зеркальце,2,"В руки зеркальце взяла\nИ сказала:\n«Удружи,\n..."
54233,Яшин,соцреализм,очень много солнечного света,0,"Очень много солнечного света,\nНад землей стои..."
54234,Яшин,соцреализм,огонек,0,"Светлячок во мгле —\nОгонек в лесах.\nМожет, о..."
54235,Яшин,соцреализм,назови меня именем светлым,0,"Назови меня именем светлым,\nЧистым именем наз..."


In [260]:
# nltk is not imported in the notebook's import cell, so on a fresh kernel this cell
# failed with NameError. Import it here so the cell runs under Restart & Run All.
import nltk

# Russian stop-word list from the NLTK 'stopwords' corpus.
# NOTE(review): NLTK needs that corpus to be present locally
# (nltk.download('stopwords')) -- confirm the environment provides it.
stopwords = nltk.corpus.stopwords.words('russian')

In [261]:
# Keep the downloaded model inside the working directory.
# gensim.downloader resolves GENSIM_DATA_DIR into its module-level BASE_DIR when the
# module is imported. The import already happened in the first cell, so setting the
# environment variable here is too late on its own; BASE_DIR is overridden as well.
GENSIM_DATA_DIR = str(Path.cwd())
os.environ['GENSIM_DATA_DIR'] = GENSIM_DATA_DIR
gensim.downloader.BASE_DIR = GENSIM_DATA_DIR
gensim_embeddings = gensim.downloader.load('word2vec-ruscorpora-300')

# Model keys look like '<lemma>_<POS>'. Split on the LAST underscore only, so a key
# containing extra underscores still yields exactly two fields (a ragged list would
# not form a 2-D array).
gensim_words = np.array([
    element.rsplit('_', 1) for element in gensim_embeddings.index_to_key
])
# Distinct POS suffixes present in the model's vocabulary.
np.unique(gensim_words[:, 1])

array(['ADJ', 'ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART',
       'PRON', 'VERB'], dtype='<U49')

In [317]:
# Shared pymorphy3 analyzer instance; morph_parse() in this cell uses it to obtain
# the normal form and the tag of a word.
morph = pymorphy3.MorphAnalyzer()
# Analyzer tag -> POS suffix used in the embedding keys.
# Written as (target suffix, analyzer tags that collapse into it); a tag that is not
# listed here is passed through unchanged by morph_parse().
tags_to_transform = {
    analyzer_tag: target_tag
    for target_tag, analyzer_tags in (
        ('ADJ', ('ADJF', 'ADJS', 'COMP')),
        ('VERB', ('INFN', 'PRTF', 'PRTS', 'GRND')),
        ('NUM', ('NUMR',)),
        ('ADV', ('ADVB',)),
        ('ADP', ('PREP',)),
        ('PRON', ('NPRO',)),
        ('PART', ('PRCL',)),
        ('CCONJ', ('CONJ',)),
    )
    for analyzer_tag in analyzer_tags
}

def morph_parse(word):
    """Convert a word form into the ``<normal_form>_<tag>`` key format.

    Only the first parse returned by ``morph.parse`` is used. In the normal
    form every 'ё' is replaced by 'е'. The tag part is the first
    alphanumeric token of the parse's tag string, translated through
    ``tags_to_transform``; tokens missing from that mapping are kept as is.

    Parameters
    ----------
    word : str
        A single word form.

    Returns
    -------
    str
        The normal form and the (possibly translated) tag joined by ``'_'``.
    """
    # Analyse the word once and reuse the result; the analyzer used to be invoked
    # twice per word (once for the normal form, once for the tag).
    first_parse = morph.parse(word)[0]
    normal_form = first_parse.normal_form.replace('ё', 'е')
    # str(tag) is a delimited list of grammemes; the leading token is taken as the tag.
    part_tag = re.findall(r'\w+', str(first_parse.tag))[0]
    return normal_form + '_' + tags_to_transform.get(part_tag, part_tag)

def get_embedding_tagged(tagged):
    """Look up the embedding stored under an already tagged key.

    Parameters
    ----------
    tagged : str
        Key in the format produced by ``morph_parse``.

    Returns
    -------
    tuple
        ``(True, vector)`` when ``tagged`` is present in ``gensim_embeddings``;
        otherwise ``(False, zeros)`` where ``zeros`` is ``np.zeros(300)``.
    """
    found = tagged in gensim_embeddings
    vector = gensim_embeddings[tagged] if found else np.zeros(300)
    return found, vector

def get_embedding(word):
    """Return ``get_embedding_tagged`` applied to the ``morph_parse`` key of ``word``.

    The result is the same ``(found, vector)`` pair that
    ``get_embedding_tagged`` returns.
    """
    return get_embedding_tagged(morph_parse(word))

In [318]:
# Sanity check: dot product of the embeddings of two related nouns.
# NOTE(review): this equals cosine similarity only if the vectors are unit-length --
# not verified in this notebook.
np.multiply(*(get_embedding(word)[1] for word in ('стол', 'стул'))).sum()

0.70359695

In [395]:
class WordsEncoder:
    """Vocabulary of tagged words that have an embedding, with occurrence counts.

    Texts are split into lowercase words, each word is converted to a tagged
    key with ``morph_parse`` and looked up with ``get_embedding_tagged``.
    Keys with an embedding get a vocabulary index; keys without one are only
    counted.
    """

    def __init__(self):
        # tagged key -> index, assigned in order of first appearance
        self._vocab = dict()
        # tagged key -> number of occurrences, for keys that have an embedding
        self._vocab_occurances = dict()
        # tagged key -> number of occurrences, for keys without an embedding
        self._no_embedding_occurances = dict()

    @staticmethod
    def split(text):
        """Return the runs of Russian letters (а-я, ё) found in lowercased ``text``.

        Declared as a static method so that both ``WordsEncoder.split(text)``
        and ``encoder.split(text)`` work.
        """
        return re.findall(r'[а-яё]+', text.lower())

    @staticmethod
    def _percent(part, total):
        """Return ``part`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
        return part / total * 100.0 if total else 0.0

    @staticmethod
    def _occurances_frame(counts):
        """Build a ``word``/``occurances`` frame from a count dict, largest count first."""
        return pd.DataFrame(
            list(counts.items()),
            columns=['word', 'occurances']
        ).sort_values('occurances', ascending=False)

    def add_to_vocab(self, texts):
        """Update the vocabulary and the occurrence counters from ``texts``.

        Parameters
        ----------
        texts : iterable of str
            Texts to process; iterated through ``tqdm.tqdm`` for a progress bar.
        """
        for text in tqdm.tqdm(texts):
            for word in WordsEncoder.split(text):
                tagged = morph_parse(word)
                # Keys seen before only need their counter bumped; the embedding
                # lookup is done once, on first appearance.
                if tagged in self._vocab_occurances:
                    self._vocab_occurances[tagged] += 1
                    continue
                if tagged in self._no_embedding_occurances:
                    self._no_embedding_occurances[tagged] += 1
                    continue
                if get_embedding_tagged(tagged)[0]:
                    self._vocab[tagged] = len(self._vocab)
                    self._vocab_occurances[tagged] = 1
                else:
                    self._no_embedding_occurances[tagged] = 1

    def info(self):
        """Print coverage statistics and store the per-key counts as data frames.

        Prints the number of unique keys and of total occurrences, and the
        share of each that has an embedding. As a side effect sets
        ``self.vocab_occurances_list`` and ``self.no_embedding_occurances_list``
        (columns ``word`` and ``occurances``, sorted by count, descending).
        Percentages are reported as 0.00 when nothing has been processed yet,
        instead of failing on a division by zero.
        """
        words_vocab = len(self._vocab)
        words_total = words_vocab + len(self._no_embedding_occurances)
        print('Processed unique words:', words_total)
        print('Of them with embeddings:', words_vocab, '({:.2f} %)'.format(
            WordsEncoder._percent(words_vocab, words_total)
        ))
        self.vocab_occurances_list = WordsEncoder._occurances_frame(
            self._vocab_occurances
        )
        self.no_embedding_occurances_list = WordsEncoder._occurances_frame(
            self._no_embedding_occurances
        )
        occurances_vocab = self.vocab_occurances_list['occurances'].sum()
        occurances_total = occurances_vocab + self.no_embedding_occurances_list['occurances'].sum()
        print('Processed word occurances:', occurances_total)
        print('Of them with embeddings:', occurances_vocab, '({:.2f} %)'.format(
            WordsEncoder._percent(occurances_vocab, occurances_total)
        ))

    def to_embeddings(self, text):
        """Return the embeddings of the words of ``text`` that are in the vocabulary.

        Words whose tagged key is not in the vocabulary are skipped. The result
        is ``np.array`` over the list of found vectors, in text order; when no
        word is found this is an empty 1-D array.
        """
        embeddings = []
        for word in WordsEncoder.split(text):
            tagged = morph_parse(word)
            if tagged in self._vocab:
                embeddings.append(get_embedding_tagged(tagged)[1])
        return np.array(embeddings)

In [396]:
# Build the vocabulary from the first 1000 texts of the corpus only.
words_encoder = WordsEncoder()
words_encoder.add_to_vocab(df_poems['text'].head(1000))

100%|███████████████████████████████████████| 1000/1000 [00:11<00:00, 88.27it/s]


In [397]:
# Print how many unique words / word occurrences were processed and what share of
# them has an embedding.
words_encoder.info()

Processed unique words: 10561
Of them with embeddings: 8514 (80.62 %)
Processed word occurances: 89165
Of them with embeddings: 46232 (51.85 %)


In [405]:
# Embed a sample phrase; words whose tagged form is not in the vocabulary are skipped.
words_encoder.to_embeddings('человек собаке друг')

array([[-1.39451608e-01,  9.53103136e-03,  9.66905281e-02,
        -5.21333963e-02, -3.12726162e-02,  5.05528040e-02,
        -2.52584498e-02,  1.03141125e-02,  6.70733973e-02,
        -5.00828680e-03,  5.17660528e-02, -7.06560239e-02,
        -9.39404890e-02,  3.94431055e-02, -1.94523740e-03,
         6.24135835e-03,  3.44324149e-02, -6.78158645e-03,
         9.03536752e-02, -8.52523595e-02,  6.77594543e-03,
         7.92675912e-02, -4.68349978e-02,  1.66406110e-01,
         5.13189882e-02, -1.28330097e-01,  4.83062007e-02,
        -1.24150440e-02,  6.14518113e-02,  5.94375022e-02,
         3.58966626e-02,  4.60047200e-02,  9.96816829e-02,
        -2.71439832e-02, -8.44711140e-02,  1.57809872e-02,
         6.06710054e-02,  9.07780081e-02, -2.32378375e-02,
         1.06120734e-02, -2.46277750e-02,  1.51019944e-02,
        -2.39656940e-02,  4.68896236e-03, -1.91762242e-02,
         7.07543492e-02,  6.47879764e-02,  5.65436482e-03,
         8.41793343e-02, -8.19936395e-02,  5.08622378e-0