In [21]:
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
from tqdm import tqdm
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
import time
import torch

import pymorphy2
import re
import spacy

import os
print(os.cpu_count())

from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/multilingual-e5-large')

64


## read data

In [2]:
df = pd.read_parquet("data/data_analogue/09180919.parquet.gzip")
print(df.shape)

titles = pd.read_csv("data/titles.csv")
titles.columns = ["title", "price"]
print(titles.shape)

(2362742, 16)
(825, 2)


## extracta unidades

In [3]:
def extract_units(v):
    pattern =  r'(\d+(?:[\.,]\d+)?)\s*(мл|гр|л.|шт|кг)'

    return re.findall(pattern, v)

titles["units"] = titles.title.apply(extract_units)
titles["value"] = titles.units.apply(lambda v: v[0][0] if len(v) > 0 else v)
titles["unit"] = titles.units.apply(lambda v: v[0][1] if len(v) > 0 else v)

In [4]:
df["units"] = df.title.apply(extract_units)
df["value"] = df.units.apply(lambda v: v[0][0] if len(v) > 0 else v)
df["unit"] = df.units.apply(lambda v: v[0][1] if len(v) > 0 else v)

In [5]:
df[df.unit.astype(str) != "[]"][["title", "units", "value", "unit"]].head(5)

Unnamed: 0,title,units,value,unit
30,"Перчатки нитриловые XS зеленые, 100шт","[(100, шт)]",100,шт
40,Забор ограждение коричневый Кирпич пластик 25с...,"[(2, шт)]",2,шт
52,Солнцезащитный увлажняющий крем для лица SPF30...,"[(50, мл)]",50,мл
83,"Краска для волос PERFORMANCE 6.22, 60 мл","[(60, мл)]",60,мл
108,"Light Ресницы чёрные шайн 16 лент микс D 0,07 ...","[(16, ле)]",16,ле


## clear title

In [8]:
def remove_units(row):
    title = row['title']
    val = row['value']
    unit = row["unit"]
    return title.replace(str(val), '').replace(str(unit), '').strip()

titles["title_clear"] = titles.apply(remove_units, axis=1)
titles.title_clear = titles.title_clear.str.replace(".", "")
titles.title_clear = titles.title_clear.str.replace("#", "")
titles.title_clear = titles.title_clear.str.replace("*", "")
titles.title_clear = titles.title_clear.str.replace("д/", "для ")
titles.title_clear = titles.title_clear.str.replace("/", " ")
titles.title_clear = titles.title_clear.apply(lambda v: re.sub(r"\d", "", v))

  titles.title_clear = titles.title_clear.str.replace(".", "")
  titles.title_clear = titles.title_clear.str.replace("*", "")


In [9]:
df["title_clear"] = df.apply(remove_units, axis=1)
df.title_clear = df.title_clear.str.replace(".", "")
df.title_clear = df.title_clear.str.replace("#", "")
df.title_clear = df.title_clear.str.replace("*", "")
df.title_clear = df.title_clear.str.replace("д/", "для ")
df.title_clear = df.title_clear.str.replace("/", " ")
df.title_clear = df.title_clear.apply(lambda v: re.sub(r"\d", "", v))

  df.title_clear = df.title_clear.str.replace(".", "")
  df.title_clear = df.title_clear.str.replace("*", "")


## transformer

In [11]:
texts = ["query: " + sent for sent in titles.title_clear.to_list()]
embeddings = model.encode(texts, normalize_embeddings=True)
embeddings_titles = dict(zip(titles.title_clear.to_list(), embeddings))

In [None]:
# texts = ["query: " + sent for sent in df.title_clear.to_list()]
# embeddings = model.encode(texts, normalize_embeddings=True)
# embeddings_df = dict(zip(titles.title_clear.to_list(), embeddings))

In [18]:
embeddings_dict = {}

In [None]:
batch_size = 100
total_batches = (len(df) + batch_size - 1) // batch_size

for i in tqdm(range(total_batches), desc="Processing Batches"):
    dfc = df[i*batch_size:(i+1)*batch_size].reset_index(drop=True)
    toEmbed = ["query: " + sent for sent in dfc.title_clear.to_list()]
    embeddings_dict.update((dict(zip(dfc.title_clear, model.encode(toEmbed)))))
    if i % 10 == 0:
        with open("data/data_analogue/embeddings_dict.pkl", "wb") as pickle_file:
            pickle.dump(embeddings_dict, pickle_file)
        with open(f"data/data_analogue/cnt/{i}.pkl", "wb") as pickle_file:
            pickle.dump(i, pickle_file)

Processing Batches:  16%|█▌        | 3801/23628 [1:44:11<12:56:31,  2.35s/it]

## similiarity

In [32]:
target_item = 'CALGON Порошок для стир машин   пакет  '
target_embedding = embeddings_titles[target_item]
target_embedding

array([ 0.01765255, -0.01208359, -0.04187283, ..., -0.02845209,
       -0.04686246, -0.02352369], dtype=float32)

In [50]:
# text_embeddings = {k:embeddings_dict[k] for k in list(embeddings_dict)[:3]}
# list(text_embeddings.values())

[array([-0.00190464, -0.01742457, -0.01704312, ..., -0.01980822,
        -0.00352882,  0.00061696], dtype=float32),
 array([ 0.00955767, -0.02369396, -0.02404957, ...,  0.01609664,
        -0.07823528,  0.00591879], dtype=float32),
 array([ 0.00413634, -0.01285618, -0.03304748, ..., -0.021833  ,
        -0.02210444, -0.00267873], dtype=float32)]

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [56]:
similarities = [cosine_similarity(
    [embeddings_titles[target_item]],
    [embedding]
)[0][0] for embedding in embeddings_dict.values()]
     
print(similarities)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [59]:
sorted_embeddings = [x for _, x in sorted(
    zip(similarities, embeddings_dict), reverse=True
)]

num_closest_embeddings = 10
sorted_embeddings[:num_closest_embeddings]

['Калгон порошок для стиральных машин  г',
 'Средство для стиральной машины Calgon порошок в,',
 'Calgon Порошок для смягчения воды Calgon',
 'Смягчитель воды для стиральных машин Calgon  в , порошок',
 'Порошок для смягчения воды Calgon  ',
 'Порошок от накипи Calgon',
 'Сalgon Порошок для смягчения воды г',
 'Смягчитель воды для стиральных машин Calgon в, порошок, …',
 'Смягчитель воды для стиральных машин Calgon  в , гель',
 'Соль для посудомоечных машин Calgonit,']