In [1]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle

import torch
import torch.nn as nn

import gensim
from transformers import get_linear_schedule_with_warmup

import sys
import os
import wget
import re
from ufal.udpipe import Model, Pipeline

In [2]:
model = gensim.models.KeyedVectors.load_word2vec_format('models/w2v/model.bin', binary=True)

In [3]:
weight = torch.FloatTensor(model.vectors)
embedding = nn.Embedding.from_pretrained(weight)

In [4]:
embedding

Embedding(249565, 300)

In [13]:
embedding = nn.Embedding.from_pretrained(torch.FloatTensor(gensim.models.KeyedVectors.load_word2vec_format('models/w2v/model.txt').vectors))

In [3]:
class DatasetClass(torch.utils.data.Dataset):
    """
    Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all
    sequences to the max length.
    The SentenceBertEncoder.smart_batching_collate is required for this to work.
    SmartBatchingDataset does *not* work without it.
    """
    def __init__(self, features: np.array, target: np.array, vocab: list):
        """
        Create a new Dataset with the tokenized texts and the labels as Tensor
        """
        self.tokenizer = Tokenizer()
        self.tok2id = {}
        for i, token in enumerate(vocab):
            self.tok2id.update({token: i})

        self.features = self.make_tokens(features)
        self.target = target

    def make_tokens(self, texts):
        tokens = []
        for text in tqdm(texts, desc='Tokenizing...'):
            encoded_text = self.tokenizer.encode(text)
            #tokens to indexes
            indexes = []
            for token in encoded_text:
                try:
                    indexes.append(self.tok2id[token])
                #map oov tokens
                except KeyError:
                    indexes.append(-1)

            tokens.append(indexes)

        return tokens

    def collate_fn(self, batch):
        """
        Transforms a batch from a Dataset to a batch of tensors for the model
        :param batch:
            a batch from a Dataset
        :return:
            a batch of tensors for the model
        """
        tokens, labels = [], []
        for token, label in batch:
            tokens.append(token)
            labels.append(label)

        max_len = 0
        for token in tokens:
            max_len = max(max_len, len(token))
        
        for i, token in enumerate(tokens):
            padding_length = max_len - len(token)
            if padding_length > 0:
                token = token + ([-2] * padding_length)

            tokens[i] = torch.tensor(token)

        tokens = torch.stack(tokens)

        return tokens, torch.argmax(torch.tensor(labels, dtype=torch.long), dim=1)

    def __getitem__(self, item):
        return self.features[item], self.target[item]

    def __len__(self):
        return len(self.features)

class Tokenizer(): 
    def __init__(self):
        udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
        udpipe_filename = udpipe_model_url.split('/')[-1]

        if not os.path.isfile(udpipe_filename):
            print('UDPipe model not found. Downloading...')
            wget.download(udpipe_model_url)

        print('\nLoading the model...')
        self.model = Model.load(udpipe_filename)
        self.process_pipeline = Pipeline(self.model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
        
    def num_replace(self, word):
        newtoken = 'x' * len(word)
        return newtoken

    def clean_token(self, token, misc):
        """
        :param token:  токен (строка)
        :param misc:  содержимое поля "MISC" в CONLLU (строка)
        :return: очищенный токен (строка)
        """
        out_token = token.strip().replace(' ', '')
        if token == 'Файл' and 'SpaceAfter=No' in misc:
            return None
        return out_token


    def clean_lemma(self, lemma, pos):
        """
        :param lemma: лемма (строка)
        :param pos: часть речи (строка)
        :return: очищенная лемма (строка)
        """
        out_lemma = lemma.strip().replace(' ', '').replace('_', '').lower()
        if '|' in out_lemma or out_lemma.endswith('.jpg') or out_lemma.endswith('.png'):
            return None
        if pos != 'PUNCT':
            if out_lemma.startswith('«') or out_lemma.startswith('»'):
                out_lemma = ''.join(out_lemma[1:])
            if out_lemma.endswith('«') or out_lemma.endswith('»'):
                out_lemma = ''.join(out_lemma[:-1])
            if out_lemma.endswith('!') or out_lemma.endswith('?') or out_lemma.endswith(',') \
                    or out_lemma.endswith('.'):
                out_lemma = ''.join(out_lemma[:-1])
        return out_lemma


    def list_replace(self, search, replacement, text):
        search = [el for el in search if el in text]
        for c in search:
            text = text.replace(c, replacement)
        return text


    def unify_sym(self, text):  # принимает строку в юникоде
        text = self.list_replace \
            ('\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022', text)

        text = self.list_replace \
            ('\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003', text)

        text = self.list_replace('\u2010\u2011', '\u002D', text)

        text = self.list_replace \
                (
                '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
                '\u2002', text)

        text = re.sub('\u2003\u2003', '\u2003', text)
        text = re.sub('\t\t', '\t', text)

        text = self.list_replace \
                (
                '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
                '.', text)

        text = self.list_replace('\u2217', '\u002A', text)

        text = self.list_replace('…', '...', text)

        text = self.list_replace('\u2241\u224B\u2E2F\u0483', '\u223D', text)

        text = self.list_replace('\u00C4', 'A', text)  # латинская
        text = self.list_replace('\u00E4', 'a', text)
        text = self.list_replace('\u00CB', 'E', text)
        text = self.list_replace('\u00EB', 'e', text)
        text = self.list_replace('\u1E26', 'H', text)
        text = self.list_replace('\u1E27', 'h', text)
        text = self.list_replace('\u00CF', 'I', text)
        text = self.list_replace('\u00EF', 'i', text)
        text = self.list_replace('\u00D6', 'O', text)
        text = self.list_replace('\u00F6', 'o', text)
        text = self.list_replace('\u00DC', 'U', text)
        text = self.list_replace('\u00FC', 'u', text)
        text = self.list_replace('\u0178', 'Y', text)
        text = self.list_replace('\u00FF', 'y', text)
        text = self.list_replace('\u00DF', 's', text)
        text = self.list_replace('\u1E9E', 'S', text)

        currencies = list \
                (
                '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
            )

        alphabet = list \
                (
                '\t\n\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ,.[]{}()=+-−*&^%$#@!?~;:0123456789§/\|"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')

        alphabet.append("'")

        allowed = set(currencies + alphabet)

        cleaned_text = [sym for sym in text if sym in allowed]
        cleaned_text = ''.join(cleaned_text)

        return cleaned_text


    def process(self, pipeline, text='Строка', keep_pos=True, keep_punct=False):
        # Если частеречные тэги не нужны (например, их нет в модели), выставьте pos=False
        # в этом случае на выход будут поданы только леммы
        # По умолчанию знаки пунктуации вырезаются. Чтобы сохранить их, выставьте punct=True

        entities = {'PROPN'}
        named = False
        memory = []
        mem_case = None
        mem_number = None
        tagged_propn = []

        # обрабатываем текст, получаем результат в формате conllu:
        processed = pipeline.process(text)

        # пропускаем строки со служебной информацией:
        content = [l for l in processed.split('\n') if not l.startswith('#')]

        # извлекаем из обработанного текста леммы, тэги и морфологические характеристики
        tagged = [w.split('\t') for w in content if w]

        for t in tagged:
            if len(t) != 10:
                continue
            (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
            token = self.clean_token(token, misc)
            lemma = self.clean_lemma(lemma, pos)
            if not lemma or not token:
                continue
            if pos in entities:
                if '|' not in feats:
                    tagged_propn.append('%s_%s' % (lemma, pos))
                    continue
                morph = {el.split('=')[0]: el.split('=')[1] for el in feats.split('|')}
                if 'Case' not in morph or 'Number' not in morph:
                    tagged_propn.append('%s_%s' % (lemma, pos))
                    continue
                if not named:
                    named = True
                    mem_case = morph['Case']
                    mem_number = morph['Number']
                if morph['Case'] == mem_case and morph['Number'] == mem_number:
                    memory.append(lemma)
                    if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\s\\n' in misc:
                        named = False
                        past_lemma = '::'.join(memory)
                        memory = []
                        tagged_propn.append(past_lemma + '_PROPN')
                else:
                    named = False
                    past_lemma = '::'.join(memory)
                    memory = []
                    tagged_propn.append(past_lemma + '_PROPN')
                    tagged_propn.append('%s_%s' % (lemma, pos))
            else:
                if not named:
                    if pos == 'NUM' and token.isdigit():  # Заменяем числа на xxxxx той же длины
                        lemma = self.num_replace(token)
                    tagged_propn.append('%s_%s' % (lemma, pos))
                else:
                    named = False
                    past_lemma = '::'.join(memory)
                    memory = []
                    tagged_propn.append(past_lemma + '_PROPN')
                    tagged_propn.append('%s_%s' % (lemma, pos))

        if not keep_punct:
            tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
        if not keep_pos:
            tagged_propn = [word.split('_')[0] for word in tagged_propn]
        return tagged_propn

    def encode(self, inp):
        res = self.unify_sym(inp.strip())
        output = self.process(self.process_pipeline, text=res)
        return output

In [4]:
text = '''Этот скрипт принимает на вход необработанный русский текст 
(одно предложение на строку или один абзац на строку).
Он токенизируется, лемматизируется и размечается по частям речи с использованием UDPipe.
На выход подаётся последовательность разделенных пробелами лемм с частями речи 
("зеленый_NOUN трамвай_NOUN").
Их можно непосредственно использовать в моделях с RusVectōrēs (https://rusvectores.org).
Примеры запуска:
echo 'Мама мыла раму.' | python3 rus_preprocessing_udpipe.py
zcat large_corpus.txt.gz | python3 rus_preprocessing_udpipe.py | gzip > processed_corpus.txt.gz'''

In [4]:
tok = Tokenizer()


Loading the model...


In [5]:
tok.encode(text)

['этот_DET',
 'скрипт_NOUN',
 'принимать_VERB',
 'на_ADP',
 'вход_NOUN',
 'необработать_ADJ',
 'русский_ADJ',
 'текст_NOUN',
 'один_ADJ',
 'предложение_NOUN',
 'на_ADP',
 'строка_NOUN',
 'или_CCONJ',
 'один_NUM',
 'абзац_NOUN',
 'на_ADP',
 'строка_NOUN',
 'он_PRON',
 'токенизируться_VERB',
 'лемматизируться_VERB',
 'и_CCONJ',
 'размечаться_VERB',
 'по_ADP',
 'часть_NOUN',
 'речь_NOUN',
 'с_ADP',
 'использование_NOUN',
 'udpipe_PROPN',
 'на_ADP',
 'выход_NOUN',
 'подаваться_VERB',
 'последовательность_NOUN',
 'разделять_VERB',
 'пробелай_NOUN',
 'лем_NOUN',
 'с_ADP',
 'часть_NOUN',
 'речь_NOUN',
 'зеленыйnoun_PROPN',
 'трамвайnoun_PROPN',
 'они_PRON',
 'можно_ADV',
 'непосредственно_ADV',
 'использовать_VERB',
 'в_ADP',
 'модель_NOUN',
 'с_ADP',
 'rusvectrs_PROPN',
 'https://rusvectores.org_X',
 'пример_NOUN',
 'запуск_NOUN',
 'echo_X',
 "'мам_PROPN",
 'мыть_VERB',
 "раму.'_NOUN",
 'python3_NUM',
 'ruspreprocessingudpipe.py_X',
 'zcat_X',
 'largecorpus.txt.gz_PROPN',
 'python3_NUM',
 'r

In [12]:
df_train = pd.read_parquet('data/val.parquet')

In [42]:
train_ds = DatasetClass(features=df_train.loc[:100, 'question'].to_numpy(), target=df_train.drop('question', axis=1).loc[:100, :].to_numpy(), 
                        vocab=model.index2word)

Tokenizing...:   1%|          | 1/101 [00:00<00:11,  8.66it/s]
Loading the model...
Tokenizing...: 100%|██████████| 101/101 [00:03<00:00, 30.10it/s]


In [44]:
loader = torch.utils.data.DataLoader(train_ds, collate_fn=train_ds.collate_fn,
                                     batch_size=2, num_workers=4, shuffle=True)

In [45]:
for batch in loader:
    print(batch)
    break

(tensor([[    -1,   5901,      2,    863,  11939,   1288,     -1,   2914,      2,
           1382,    538,     -1,     -1,    876,    632,     -1,     -1,   1864,
            445,   6713,   1707,   8092,   3556,  13220,   2315,    538,   4272,
           7798,     -1,     -1,  43734,     -1,   1288,     -1,    905,     -1,
           2315,  63059,     -1,  11738,     -1,     23,    702,     -1,     -1,
          31673,   2664,   3277,     -1, 204119,     -1,     69,     -1,     25,
          45452,    791,   1695,     -1,      9,   1288,     -1,  32984,     -1,
             28,     -1,   4570,    245,  80258,    159,     -1,    489,  31673,
          34651,     -1,     -1,     -1,   1695,     -1,   1288,     87,     12,
           1273,     -1,     25,      0,    712,  25307,     -1,     -1,     56,
              1,      2,    106,      9,   2914,     -1,    400,     -1,     -1,
            101,    661,     -1,     -1,     -1,     -1,  31673,   1288,     -1,
          10572,     -1,  1

In [26]:
batch = train_ds[:2]

In [29]:
tokens = batch[0]
labels = batch[1]

In [27]:
tokens, labels = [], []
for token, label in batch:
    print(label)
    tokens.append(token)
    labels.append(label)

[-1, -1, 702, -1, -1, -1, 1506, 2417, -1, 923, -1, 445, 51196, 923, -1, -1, -1, 32477]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0]


In [30]:
max_len = 0
for token in tokens:
    max_len = max(max_len, len(token))

In [31]:
max_len

210

In [38]:
for i, token in enumerate(tokens):
    padding_length = max_len - len(token)
    if padding_length > 0:
        token = token + ([-2] * padding_length)

    tokens[i] = torch.tensor(token)

print(tokens)


[tensor([    -1,   5901,    179,   3228,    101,     -1,      0,     -1,     15,
            -1,   1764,     -1, 105089,    101,     -1,    197,     -1,    631,
             2,    863,  69553,     -1,     91,    671,   1011,     -1,   1976,
            44,     -1,     10,  56739,     -1,     91,   1976,     -1,   1976,
            -1,    745,     -1,   6280,     -1,   5917,    304,    323,     -1,
            -1,     -1,    257,     -1,     89,     73,    596,     -1,   1538,
            -1,     -1,     70,     -1,     -1,    138,     -1,   1976,    671,
            -1,     50,    670,  74687,     -1,     66,   1976,   2997,    191,
            -1,     -1,  21695,     -1, 217725,     -1,    380,     -1,   1377,
            -1,    635,  15971,    686,     -1,     -1,  15971,     -1,     -1,
          1976,      0,     -1,  17727,   2030,     -1,      6,     -1,     -1,
          1542,     -1,   1976,    671,     -1,     50,    670,      1,    939,
            -1,  44886,   2823,     -1,

In [39]:
tokens = torch.stack(tokens)

In [40]:
tokens

tensor([[    -1,   5901,    179,   3228,    101,     -1,      0,     -1,     15,
             -1,   1764,     -1, 105089,    101,     -1,    197,     -1,    631,
              2,    863,  69553,     -1,     91,    671,   1011,     -1,   1976,
             44,     -1,     10,  56739,     -1,     91,   1976,     -1,   1976,
             -1,    745,     -1,   6280,     -1,   5917,    304,    323,     -1,
             -1,     -1,    257,     -1,     89,     73,    596,     -1,   1538,
             -1,     -1,     70,     -1,     -1,    138,     -1,   1976,    671,
             -1,     50,    670,  74687,     -1,     66,   1976,   2997,    191,
             -1,     -1,  21695,     -1, 217725,     -1,    380,     -1,   1377,
             -1,    635,  15971,    686,     -1,     -1,  15971,     -1,     -1,
           1976,      0,     -1,  17727,   2030,     -1,      6,     -1,     -1,
           1542,     -1,   1976,    671,     -1,     50,    670,      1,    939,
             -1,  44886,   2

In [49]:
class Embedding(torch.nn.Module): 
    def __init__(self, vectors): 
        super().__init__() 
        self.embedding = torch.nn.Embedding.from_pretrained(torch.FloatTensor(vectors))
        self.embedding.weight.requires_grad = False
        # vector for oov 
        self.dim = self.embedding.embedding_dim 
        self.oov = torch.nn.Parameter(data=torch.rand(1, self.dim)) 
        self.oov_index = -1
        self.pad = torch.nn.Parameter(data=torch.zeros(1, self.dim))
        self.pad.requires_grad = False
        self.pad_index = -2

    def forward(self, arr): 
        N, M = arr.shape
        mask =  (arr==self.oov_index).long() 
        mask_ = mask.unsqueeze(dim=2).expand(-1, -1, self.dim).float() 
        pad_mask = (arr==self.pad_index).long()
        pad_ = pad_mask.unsqueeze(dim=2).expand(-1, -1, self.dim).float()
        embed = (1-mask_)*(1-pad_)*self.embedding((1-mask)*((1-pad_mask)*arr))\
                + mask_*(self.oov.expand((N, M, self.dim))) + pad_*(self.pad.expand((N, M, self.dim)))

        return embed

In [50]:
emb = Embedding(model.vectors)

In [7]:
texts = text.split('\n')
texts

['Этот скрипт принимает на вход необработанный русский текст ',
 '(одно предложение на строку или один абзац на строку).',
 'Он токенизируется, лемматизируется и размечается по частям речи с использованием UDPipe.',
 'На выход подаётся последовательность разделенных пробелами лемм с частями речи ',
 '("зеленый_NOUN трамвай_NOUN").',
 'Их можно непосредственно использовать в моделях с RusVectōrēs (https://rusvectores.org).',
 'Примеры запуска:',
 "echo 'Мама мыла раму.' | python3 rus_preprocessing_udpipe.py",
 'zcat large_corpus.txt.gz | python3 rus_preprocessing_udpipe.py | gzip > processed_corpus.txt.gz']

In [8]:
ds = DatasetClass(features=texts, target=np.eye(len(texts), 2), vocab=model.index2word)
dl = torch.utils.data.DataLoader(ds, collate_fn=ds.collate_fn, batch_size=3, num_workers=4, shuffle=True)

Tokenizing...: 100%|██████████| 9/9 [00:00<00:00, 421.63it/s]
Loading the model...



In [10]:
for batch in dl:
    x = batch[0]
    

In [52]:
v = emb(x)

In [54]:
x

tensor([[   631,   9634,     -2,     -2,     -2,     -2],
        [186581,     -1,   4487,     -1,     -1,     -1],
        [    -1,     -1,     -1,     -1,     -1,     -1]])

In [55]:
v[1]

tensor([[ 0.1304,  0.0264, -0.4213,  ..., -0.0208,  0.4676, -0.1550],
        [ 0.9763,  0.8876,  0.8292,  ...,  0.8711,  0.0260,  0.2194],
        [ 0.0089,  0.0826, -0.2734,  ..., -0.0286,  0.6359, -0.3102],
        [ 0.9763,  0.8876,  0.8292,  ...,  0.8711,  0.0260,  0.2194],
        [ 0.9763,  0.8876,  0.8292,  ...,  0.8711,  0.0260,  0.2194],
        [ 0.9763,  0.8876,  0.8292,  ...,  0.8711,  0.0260,  0.2194]],
       grad_fn=<SelectBackward>)

In [12]:
arr=x

In [13]:
mask =  (arr==emb.oov_index).long() 
mask_ = mask.unsqueeze(dim=1).float() 
pad_mask = (arr==emb.pad_index).long()
pad_ = pad_mask.unsqueeze(dim=1).float()

In [16]:
mask_.shape

torch.Size([3, 1, 6])

In [19]:
arr

tensor([[   631,   9634,     -2,     -2,     -2,     -2],
        [186581,     -1,   4487,     -1,     -1,     -1],
        [    -1,     -1,     -1,     -1,     -1,     -1]])

In [21]:
mask

tensor([[0, 0, 0, 0, 0, 0],
        [0, 1, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 1]])

In [20]:
pad_mask

tensor([[0, 0, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]])

In [22]:
((1-mask)*((1-pad_mask)*arr))

tensor([[   631,   9634,      0,      0,      0,      0],
        [186581,      0,   4487,      0,      0,      0],
        [     0,      0,      0,      0,      0,      0]])

In [27]:
A = emb.embedding((1-mask)*((1-pad_mask)*arr))

In [36]:
N = arr.shape[0]
M = arr.shape[1]
B = emb.oov.expand((N, M, emb.dim))

In [37]:
C = emb.pad.expand((N, M, emb.dim))

In [45]:
mask.unsqueeze(2).expand(-1, -1, 300).size()

torch.Size([3, 6, 300])

In [38]:
A+B+C

tensor([[[ 1.0027,  0.3766,  0.1932,  ...,  0.9070,  0.7093,  0.0137],
         [ 1.0005,  0.4485,  1.1348,  ...,  0.7723,  0.6764,  0.0473],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998]],

        [[ 0.8453,  0.5549,  0.1379,  ...,  0.7751,  1.0436,  0.0220],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.7237,  0.6111,  0.2858,  ...,  0.7673,  1.2119, -0.1332],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998]],

        [[ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
         [ 0.4351,  0.0441,  0.6284,  ...,  0.7290,  0.5911,  0.0998],
  

In [17]:
(1-mask_)*(1-pad_)*emb.embedding((1-mask)*((1-pad_mask)*arr))

RuntimeError: The size of tensor a (6) must match the size of tensor b (300) at non-singleton dimension 2