* pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu
* pip install transformers[torch]
* pip install nltk
* https://huggingface.co/DeepPavlov/rubert-base-cased/tree/main
* https://huggingface.co/DeepPavlov/rubert-base-cased-conversational/tree/main

In [1]:
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import nltk
import torch
import tqdm
import os
os.getcwd()


model = BertModel.from_pretrained(os.getcwd()+'/rubert-base-cased-conversational',
                                  #os.getcwd()+'/rubert-base-cased',
                                  output_hidden_states = True,
                                  )
tokenizer = BertTokenizer.from_pretrained(os.getcwd()+'/rubert-base-cased-conversational')
#tokenizer = BertTokenizer.from_pretrained(os.getcwd()+'/rubert-base-cased')

def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings



import re
from pymorphy2 import MorphAnalyzer
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

morph = MorphAnalyzer()
patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
#stopwords_ru = stopwords.words("russian")
stopwords_ru=['другои', 'еи', 'какои', 'мои', 'неи', 'сеичас', 'такои', 'этои','и', 'в', 'во', 'не', 'что', 'он','на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне','было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между']

def lemmatize(doc):
    doc = re.sub(patterns, ' ', doc)
    tokens = []
    for token in doc.split():
        if token and token not in stopwords_ru:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            
            tokens.append(token.lower())
    if len(tokens) > 0:
        return tokens
    return None

df=pd.read_excel('catalog_v1.xlsx',sheet_name='1')
data=df['NAME'].apply(lemmatize).str.join(' ').tolist()

word_embeddings = []

for text in tqdm.tqdm(data):
    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
    word_embeddings.append(list_token_embeddings)

sentences_embeddings=[np.array(i).mean(axis=0) for i in word_embeddings]
import pickle
pickle.dump(sentences_embeddings,open('sentences_embeddings.pkl','wb'))

Some weights of the model checkpoint at C:\Users\shaim\PYTHON_MAIN\!Prime\api_catalog\catalog_bert/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████

In [32]:
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import pickle
import os
import re
import tqdm
from pymorphy2 import MorphAnalyzer

df=pd.read_excel('catalog_v1.xlsx',sheet_name='1')
sentences_embeddings=pickle.load(open('sentences_embeddings.pkl','rb'))

model = BertModel.from_pretrained(os.getcwd()+'/rubert-base-cased-conversational',
                                  #os.getcwd()+'/rubert-base-cased',
                                  output_hidden_states = True,
                                  )
tokenizer = BertTokenizer.from_pretrained(os.getcwd()+'/rubert-base-cased-conversational')
#tokenizer = BertTokenizer.from_pretrained(os.getcwd()+'/rubert-base-cased')

def bert_text_preparation(text, tokenizer):
    """Preparing the input for BERT
    
    Takes a string argument and performs
    pre-processing like adding special tokens,
    tokenization, tokens to ids, and tokens to
    segment ids. All tokens are mapped to seg-
    ment id = 1.
    
    Args:
        text (str): Text to be converted
        tokenizer (obj): Tokenizer object
            to convert text into BERT-re-
            adable tokens and ids
        
    Returns:
        list: List of BERT-readable tokens
        obj: Torch tensor with token ids
        obj: Torch tensor segment ids
    
    
    """
    marked_text = "[CLS] " + text + " [SEP]"
    tokenized_text = tokenizer.tokenize(marked_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = [1]*len(indexed_tokens)

    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors
    
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Get embeddings from an embedding model
    
    Args:
        tokens_tensor (obj): Torch tensor size [n_tokens]
            with token ids for each token in text
        segments_tensors (obj): Torch tensor size [n_tokens]
            with segment ids for each token in text
        model (obj): Embedding model to generate embeddings
            from token and segment ids
    
    Returns:
        list: List of list of floats of size
            [n_tokens, n_embedding_dimensions]
            containing embeddings for each token
    
    """
    
    # Gradient calculation id disabled
    # Model is in inference mode
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        # Removing the first hidden state
        # The first state is the input state
        hidden_states = outputs[2][1:]

    # Getting embeddings from the final BERT layer
    token_embeddings = hidden_states[-1]
    # Collapsing the tensor into 1-dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=0)
    # Converting torchtensors to lists
    list_token_embeddings = [token_embed.tolist() for token_embed in token_embeddings]

    return list_token_embeddings

morph = MorphAnalyzer()
patterns = "[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—\"\-]+"
#stopwords_ru = stopwords.words("russian")
stopwords_ru=['другои', 'еи', 'какои', 'мои', 'неи', 'сеичас', 'такои', 'этои','и', 'в', 'во', 'не', 'что', 'он','на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне','было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между']

def lemmatize(doc):
    doc = re.sub(patterns, ' ', doc)
    tokens = []
    for token in doc.split():
        if token and token not in stopwords_ru:
            token = token.strip()
            token = morph.normal_forms(token)[0]
            
            tokens.append(token.lower())
    if len(tokens) > 0:
        return tokens
    return None

def similar_product(search_text):
    #search_text='бумага'
    a=[search_text]
    a=' '.join(pd.Series(a).apply(lemmatize)[0])

    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(a, tokenizer)
    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)
    sentences_embeddings_one=np.array(list_token_embeddings).mean(axis=0)


    t=[cosine_similarity(sentences_embeddings_one.reshape(1, -1),sentences_embeddings[i].reshape(1, -1).reshape(1, -1))[0][0] for i in tqdm.tqdm(range(len(sentences_embeddings)))]
    t2=pd.DataFrame([t,df['CODE'],df['NAME']]).T.rename(columns={0:'SCORE',1:'CODE',2:'NAME'}).sort_values('SCORE',ascending=False).head(10)[['CODE','NAME','SCORE']]    

    return t2

Some weights of the model checkpoint at C:\Users\shaim\PYTHON_MAIN\!Prime\api_catalog\catalog_bert/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
similar_product('бумага')

100%|████████████████████████████████████████████████████████████████████████████| 3808/3808 [00:01<00:00, 2806.14it/s]


Unnamed: 0,CODE,NAME,SCORE
864,280110,Хлор,0.782148
2483,821210,Бритвы,0.77462
3204,851713,Смартфоны,0.766788
2989,846722,Пилы,0.74976
844,482020,Тетради,0.748307
1,40320,Йогурт,0.743987
679,630630,Паруса,0.735235
865,280120,Йод,0.716003
3552,900220,Фильтры,0.69942
1405,330510,Шампуни,0.697684


In [34]:
similar_product('бумага и картон')

100%|████████████████████████████████████████████████████████████████████████████| 3808/3808 [00:01<00:00, 3051.22it/s]


Unnamed: 0,CODE,NAME,SCORE
814,481099,Бумага и картон прочие,0.929895
780,480210,Бумага и картон ручного отлива,0.906092
851,482320,Бумага и картон фильтровальные,0.843468
795,480540,Бумага и картон фильтровальные,0.843468
782,480240,Бумага - основа для обоев,0.841466
1374,320820,Краски и лаки на основе акриловых или виниловы...,0.820727
1257,292610,Акрилонитрил,0.806375
1836,392059,"Прочие плиты, листы, пленка и полосы из акрило...",0.787333
791,480519,Прочая бумага для гофрирования,0.78306
2815,843920,Оборудование для изготовления бумаги или картона,0.764758
