In [2]:
from pathlib import Path
import polars as pl
import duckdb
from datasets import Dataset
import numpy as np
from autofaiss import build_index
from FlagEmbedding import BGEM3FlagModel
from core_pro.ultilities import make_dir, make_sync_folder

In [3]:
# Locate the cleaned FMCG sample inside the item-matching dataset folder.
# NOTE(review): make_sync_folder is a project helper; presumably it returns the
# root folder as a path object -- confirm.
path = make_sync_folder('dataset/item_matching')
file = path / 'data_sample_FMCG_clean.parquet'

# Read every column of the parquet file through DuckDB and convert the result
# to a Polars DataFrame.
query = f"""select * from read_parquet('{file}')"""
df = duckdb.sql(query).pl()

# Preview the first five rows.
df.head(5)

index,item_id,item_name,shop_id,shop_name,level1_global_be_category,level2_global_be_category,level3_global_be_category,cluster,description,images,image_url,item_name_clean,file_path
u32,i64,str,i64,str,str,str,str,str,str,str,str,str,str
0,19092271907,"""LINEABON K2D3 nhập khẩu châu â…",1053944724,"""ChiChi.Kids17""","""Mom & Baby""","""Baby Healthcare""","""Baby Vitamins & Supplements""","""FMCG""","""Nguồn gốc xuất xứ K2 D3 Lineab…","""vn-11134207-7ras8-m49p5xv9ecb3…","""http://f.shopee.vn/file/vn-111…","""lineabon k2d3 nhập khẩu châu â…","""/media/kevin/data_4t/dataset/i…"
1,17398587723,"""[MKB Gift] Bộ 3 Khăn ướt Moony…",63522286,"""Moony - Gian Hàng Chính Hãng""","""Mom & Baby""","""Bath & Body Care""","""Wipes""","""FMCG""","""[{""t"":""Bộ 3 khăn ướt Moony 50 …","""eb26f55f7359e0de333f0c34e2619f…","""http://f.shopee.vn/file/eb26f5…","""bộ 3 khăn ướt moony 50 miếng/g…","""/media/kevin/data_4t/dataset/i…"
2,19111299596,"""Bàn Chải Đánh Răng Lipzo Sensi…",170502615,"""Niva Lipzo Official Store""","""Health""","""Personal Care""","""Oral Care""","""FMCG""","""[{""t"":""BÀN CHẢI ĐÁNH RĂNG LIPZ…","""vn-11134207-7ras8-m2kwpjg8vb46…","""http://f.shopee.vn/file/vn-111…","""bàn chải đánh răng lipzo sensi…","""/media/kevin/data_4t/dataset/i…"
3,25583412760,"""Rong biển ăn liền Bibizan siêu…",119247917,"""King 21""","""Food & Beverages""","""Snacks""","""Seaweed""","""FMCG""","""NGÀY IN TRÊN BAO BÌ LÀ NGÀY SẢ…","""vn-11134207-7r98o-lxqfhxwp3h0b…","""http://f.shopee.vn/file/vn-111…","""rong biển ăn liền bibizan siêu…","""/media/kevin/data_4t/dataset/i…"
4,15930150764,"""(MUA 2 SON TẶNG 1 TÚI, 1 CUSHI…",181790483,"""Lam Thảo Cosmetics""","""Beauty""","""Makeup""","""Lips""","""FMCG""","""Son Tint Bóng Espoir Couture L…","""vn-11134207-7r98o-lyrll7tp71y9…","""http://f.shopee.vn/file/vn-111…","""son tint bóng espoir couture l…","""/media/kevin/data_4t/dataset/i…"


In [10]:
# Embed the cleaned item names with BGE-M3 and cache the dense vectors on disk,
# so the model inference only runs when no cache file exists yet.
name = 'bge'
path_tmp_array = Path(path / f'tmp/array/{name}')
path_tmp_ds = Path(path / f'tmp/ds/{name}')
make_dir(path_tmp_ds)
make_dir(path_tmp_array)

file_embed = path_tmp_array / 'embed.npy'
if not file_embed.exists():
    model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
    # Cast once, right after encoding, so the in-memory array has the same
    # dtype as the array written to (and later re-loaded from) the cache.
    # Casting only the saved copy would leave `embeddings` in the model's
    # output dtype on a fresh run but float64 on a cached run.
    embeddings = model.encode(
        df['item_name_clean'].to_list(),
        batch_size=8,
        max_length=80,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False
    )['dense_vecs'].astype(np.float64)
    np.save(file_embed, embeddings)
else:
    embeddings = np.load(file_embed)
    # The cache file name does not depend on the input data, so a cache built
    # from a different `df` would be loaded silently. Fail early with a clear
    # message instead of a confusing length mismatch further down.
    if embeddings.shape[0] != len(df):
        raise ValueError(
            f'Cached embeddings in {file_embed} have {embeddings.shape[0]} rows '
            f'but df has {len(df)} rows; delete the cache file to re-encode.'
        )
print(embeddings.shape)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

pre tokenize: 100%|██████████| 12341/12341 [00:01<00:00, 6375.40it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 12341/12341 [01:07<00:00, 181.72it/s]


(98722, 1024)


In [8]:
# Sanity check: dtype of the embeddings once cast to float64.
embeddings.astype(np.float64, copy=False).dtype

dtype('float64')

In [4]:
# Attach the embedding matrix to the frame as an 'embed' column, then wrap the
# frame in a Hugging Face Dataset.
df = df.with_columns(pl.Series('embed', embeddings))
dataset = Dataset.from_polars(df)

# Return 'embed' in NumPy format while still outputting all other columns.
dataset.set_format('numpy', columns=['embed'], output_all_columns=True)

In [5]:
# Build an index over the embeddings with autofaiss using the inner-product
# metric, writing the index and its info JSON under <path>/tmp/index.
path_index = Path(path / 'tmp/index')
build_index(
    embeddings=embeddings,
    metric_type='ip',
    index_path=str(path_index.joinpath('ip.index')),
    index_infos_path=str(path_index.joinpath('index.json')),
    save_on_disk=True,
    verbose=30,
)

100%|██████████| 1/1 [00:00<00:00, 2743.17it/s]
100%|██████████| 1/1 [00:00<00:00,  6.35it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7b8858053ea0> >,
 {'index_key': 'HNSW15',
  'index_param': 'efSearch=5226',
  'index_path': '/home/kevin/Downloads/Item_Matching_Test/tmp/index/ip.index',
  'size in bytes': 42050218,
  'avg_search_speed_ms': 9.97823628438878,
  '99p_search_speed_ms': 10.557658101897687,
  'reconstruction error %': 0.0,
  'nb vectors': 9936,
  'vectors dimension': 1024,
  'compression ratio': 0.9678393581693203})

In [6]:
# Register the FAISS index file with the dataset under the index name 'embed'.
dataset.load_faiss_index(index_name='embed', file=path_index / 'ip.index')

In [7]:
# Query the 'embed' index with every row's own embedding and keep the 5 nearest
# examples per row.
# NOTE(review): the queries are the same vectors the index was built from, so
# each row presumably shows up among its own matches -- confirm this is intended.
score, result = dataset.get_nearest_examples_batch(
    index_name='embed',
    queries=np.asarray(dataset['embed']),
    k=5,
)

# One list of scores per query row.
dict_ = {'score': list(map(list, score))}
df_score = pl.DataFrame(dict_)

# Retrieved examples without their embeddings; every column gets a 'db_' prefix
# so it cannot clash with the query-side columns.
df_result = pl.DataFrame(result).drop(['embed']).select(pl.all().name.prefix('db_'))

In [8]:
# Place the query rows, their retrieved matches and the match scores side by
# side in one frame.
df_match = pl.concat([df, df_result, df_score], how='horizontal')

# Columns to explode: the retrieved-match columns plus 'score'. The match
# columns are the ones carrying the 'db_' prefix; filtering on the substring
# 'notebooks' matched no column at all, leaving only 'score' in the list.
col_explode = [i for i in df_match.columns if i.startswith('db_')] + ['score']

In [10]:
# Write the matched frame to <path>/text_match/<name>.parquet.
path_export = path / 'text_match'
make_dir(path_export)

file_export = path_export / f'{name}.parquet'
df_match.write_parquet(file_export)