In [12]:
from transformers import BertTokenizer, TFAutoModelForSequenceClassification, AutoConfig, TFAutoModel
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard 
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.initializers import TruncatedNormal
from sklearn.model_selection import train_test_split
from scipy.spatial import distance

In [2]:
import tensorflow as tf

# Switch every visible GPU to on-demand memory allocation.
# Iterating instead of indexing [0] keeps this cell runnable when the device
# list is empty (CPU-only host) and applies the same setting to all GPUs.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# Load the Geotrend/bert-base-ru-cased checkpoint as a TFBertModel; per the log
# printed below, the checkpoint's 'mlm___cls' layer is not used by this class.
model = TFAutoModel.from_pretrained('Geotrend/bert-base-ru-cased')

Some layers from the model checkpoint at Geotrend/bert-base-ru-cased were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at Geotrend/bert-base-ru-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [4]:
# Tokenizer loaded from the same checkpoint name as `model`.
tokenizer = BertTokenizer.from_pretrained('Geotrend/bert-base-ru-cased')

In [5]:
# Read the route descriptions, one list entry per line.
# The encoding is pinned because the lines are matched against Cyrillic text
# further down and the platform default encoding is not UTF-8 everywhere.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open('datasets/qa_route_specs.txt', 'r', encoding='utf-8') as source:
    text = source.readlines()

In [74]:
# Extract the quoted route name from every line containing `Маршрут "<name>"`.
# Lines without that fragment are skipped: a failed search returns None, and
# calling .group() on it would abort the whole cell with AttributeError.
pattern = re.compile(r'Маршрут "(.+?)"')
names = [found.group(1) for found in map(pattern.search, text) if found]

In [66]:
# Embed every route title once so that a query only needs a distance lookup.
# Titles are lower-cased before tokenization: the checkpoint name says it is
# cased, and match_sentence() lower-cases each query fragment, so indexing the
# titles in their original casing may keep otherwise identical strings apart.
titles_tokenized = tokenizer([name.lower() for name in names], max_length=20, truncation=True,
                   padding='max_length', return_token_type_ids=False, return_tensors='tf')

dataset = tf.data.Dataset.from_tensor_slices(titles_tokenized)
batched_dataset = dataset.batch(batch_size=128)
# Element [1] of each model output is kept as the per-title vector.
result = [model(batch, training=False)[1] for batch in batched_dataset]
titles_vectorized = tf.concat(result, axis=0)

### Инференс ускорили. Теперь попробуем разогнать расчет дистанций

In [67]:
import pymorphy2
def match(name):
    """Find the indexed title whose vector is closest to the vector of `name`.

    Args:
        name: Query string; tokenized with the same settings as the titles
            (max_length=20, padded to max length, truncated).

    Returns:
        (index, length): position of the nearest row of the global
        `titles_vectorized` and the Euclidean distance to it.
    """
    tokens2 = tokenizer([name], max_length=20, truncation=True,
                   padding='max_length', return_token_type_ids=False, return_tensors='tf')
    out2 = model(**tokens2)
    # A single query is encoded, so out2[1] has one row and broadcasts against
    # every row of titles_vectorized; tf.repeat would only allocate N copies of
    # it to produce the same element-wise differences.
    difference = out2[1] - titles_vectorized
    # Convert to NumPy once instead of once per reduction.
    distances = tf.sqrt(tf.reduce_sum(tf.square(difference), 1)).numpy()
    index = np.argmin(distances)
    length = distances[index]
    return index, length

def variants(sentence):
    """Enumerate every contiguous run of words in `sentence`.

    The sentence is split on whitespace. Runs are ordered by start position
    and, for a given start, from the shortest run to the longest.

    Returns:
        A list of word lists; empty when the sentence has no words.
    """
    words = sentence.split()
    total = len(words)
    return [
        words[start:stop]
        for start in range(total)
        for stop in range(start + 1, total + 1)
    ]
    
def match_sentence(sentence, lemmer=pymorphy2.MorphAnalyzer()):
    """Print the route title closest to any contiguous word run of `sentence`.

    Every run produced by variants() is optionally lemmatized, joined with
    spaces, lower-cased and looked up with match(); the run with the smallest
    distance wins.

    Args:
        sentence: Free-form query text.
        lemmer: Object whose parse(token)[0].normal_form yields a token's
            lemma. Pass None to look the words up as typed. The default
            analyzer is created once, when the function is defined, and is
            reused by every call.

    Returns:
        None. The result is printed as "<title>, <distance>".
    """
    perms = variants(sentence)
    if not perms:
        # Without this guard the -1 placeholder index would select names[-1],
        # i.e. the last title would be reported as the match of an empty query.
        print('no match: the query contains no words')
        return

    # Infinity is a sentinel that any real distance beats.
    best_match, best_score = -1, float('inf')
    for var in perms:
        processed = [lemmer.parse(token)[0].normal_form for token in var] if lemmer else var
        candidate = ' '.join(processed)
        index, length = match(candidate.lower())
        if length < best_score:
            best_match = index
            best_score = length

    print(f'{names[best_match]}, {best_score}')

In [68]:
%%time
match_sentence('какая длина трассы чемпион колхоза', lemmer=None)

чемпион колхоза, 0.0016712704673409462
CPU times: user 1.04 s, sys: 20.3 ms, total: 1.06 s
Wall time: 1.05 s


In [69]:
%%time
match_sentence('чемпион колхоза', lemmer=None)

чемпион колхоза, 0.0016712704673409462
CPU times: user 235 ms, sys: 67 µs, total: 235 ms
Wall time: 231 ms


In [71]:
%%time
match_sentence('какая категория у химии любви', lemmer=None)

любовь на кончиках пончика, 1.8347655534744263
CPU times: user 1.07 s, sys: 20.1 ms, total: 1.09 s
Wall time: 1.08 s


In [None]:
# Понятно... формы слов в русском языке подводят.

In [72]:
%%time
match_sentence('какая категория у химия любви', lemmer=None)

химия любви, 0.0016301103169098496
CPU times: user 1.05 s, sys: 16.3 ms, total: 1.07 s
Wall time: 1.06 s


In [73]:
# Окей, попробуем лемматизировать исходник

In [77]:
def preprocess(sentences, lemmer=pymorphy2.MorphAnalyzer()):
    """Lemmatize and lower-case every whitespace-separated token of each sentence.

    Args:
        sentences: Iterable of strings.
        lemmer: Object whose parse(token)[0].normal_form yields a token's
            lemma. Pass None to skip lemmatization and only lower-case, the
            same convention match_sentence() uses. The default analyzer is
            created once, when the function is defined.

    Returns:
        A list with one space-joined, normalized string per input sentence.
    """
    def normalize(token):
        # Fall back to the raw token when lemmatization is switched off.
        lemma = lemmer.parse(token)[0].normal_form if lemmer else token
        return lemma.lower()

    return [' '.join(normalize(token) for token in sent.split()) for sent in sentences]

In [78]:
# Rebuild the title index from the preprocess()-normalized titles.
# The module-level names of the earlier index are deliberately overwritten,
# so match() picks up the new vectors.
titles_tokenized = tokenizer(
    preprocess(names),
    max_length=20,
    truncation=True,
    padding='max_length',
    return_token_type_ids=False,
    return_tensors='tf',
)

dataset = tf.data.Dataset.from_tensor_slices(titles_tokenized)
batched_dataset = dataset.batch(batch_size=128)
# Element [1] of each model output is kept as the per-title vector.
result = [
    model(batch, training=False)[1]
    for batch in batched_dataset
]
titles_vectorized = tf.concat(result, axis=0)

In [79]:
%%time
match_sentence('какая категория у химии любви')

Химия любви, 0.00162822799757123
CPU times: user 1.09 s, sys: 20.3 ms, total: 1.11 s
Wall time: 1.1 s


In [81]:
%%time
match_sentence('какая длина у Химеры')

Химера, 0.001562767312861979
CPU times: user 710 ms, sys: 4.09 ms, total: 714 ms
Wall time: 708 ms


In [82]:
%%time
match_sentence('категория крымского геккона')

Крымский геккон, 0.0016894518630579114
CPU times: user 434 ms, sys: 12.2 ms, total: 446 ms
Wall time: 441 ms


In [84]:
%%time
match_sentence('сколько нужно веревки на маршрут вентовка')

Вентовка, 0.001521328929811716
CPU times: user 1.47 s, sys: 8.22 ms, total: 1.48 s
Wall time: 1.47 s


In [86]:
%%time
match_sentence('какая станция у японского хрена')

Японский хрен, 0.0014782077632844448
CPU times: user 1.02 s, sys: 24.1 ms, total: 1.04 s
Wall time: 1.04 s


In [93]:
match_sentence('сколько питчей у мисячне сяйва')

Мисячне сяйво, 0.9865515828132629
