In [1]:
from pathlib import Path
import polars as pl
import duckdb
from datasets import Dataset, concatenate_datasets, load_from_disk
import numpy as np
from autofaiss import build_index
from src.item_matching.build_index.func_img import PipelineImage
from src.item_matching.build_index.model import Model

## 1) Data 

Prepare 2 datasets: Database and Query

In [2]:
path = Path('/home/kevin/Downloads/cb')
path_db = path / 'cb_2024-03-07.parquet'

# db
query = f"""
select *
,concat('http://f.shopee.vn/file/', UNNEST(array_slice(string_split(images, ','), 1, 1))) image_url
from parquet_scan('{str(path_db)}')
order by item_id, images
"""
df_db = (
    duckdb.sql(query).pl()
    .select(pl.all().name.prefix(f'db_'))
    .head(10_000)
)
pipe = PipelineImage(path, col_image='image_url')
df_img_db = pipe.load_images('db')
df_db = (
    df_db.drop(['images'])
    .join(df_img_db, on='db_image_url', how='left')
    .filter(pl.col('db_exists'))
)


# q
df_q = df_db.clone()
df_q.columns = [f'q_{i.split('db_')[1]}' for i in df_db.columns]
df_q.head()

Loading json in folder: 100%|██████████| 862006/862006 [02:02<00:00, 7013.91it/s]
Loading jpg in folder: 100%|██████████| 862006/862006 [00:00<00:00, 5214514.93it/s]


[32m13:34:30[0m | [1mINFO[0m | [36mload_images[0m | [1m[Data] Load Images: (862006, 3)[0m


q_item_id,q_item_name,q_shop_id,q_images,q_level1_global_be_category,q_create_datetime,q_image_url,q_file_path,q_exists
i64,str,i64,str,str,date,str,str,bool
2998791564,"""🍎FREE SHIP🍎Lit…",619038499,"""020b2fb32c9e0b…","""Fashion Access…",2022-03-16,"""http://f.shope…","""/home/kevin/Do…",True
2999430969,"""Dụng cụ cắt ra…",619035621,"""cn-11134207-7q…","""Home & Living""",2022-05-26,"""http://f.shope…","""/home/kevin/Do…",True
2999568833,"""Bộ 50 Món Dụng…",779448044,"""sg-11134201-22…","""Beauty""",2022-07-20,"""http://f.shope…","""/home/kevin/Do…",True
2999714346,"""[Hàng HOT] Giá…",619019560,"""sg-11134201-22…","""Automobiles""",2022-09-27,"""http://f.shope…","""/home/kevin/Do…",True
2999787165,"""SUPERCUTE Túi …",881303265,"""e0c117e68b628d…","""Women Bags""",2022-11-18,"""http://f.shope…","""/home/kevin/Do…",True


## 2) Embeddings

Use datasets and clip to transform images to vectors

In [3]:
img_model, img_processor = Model().get_img_model(model_id='openai/clip-vit-base-patch32')

[32m13:34:31[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: cuda[0m
[32m13:34:36[0m | [1mINFO[0m | [36mget_img_model[0m | [1mImage model: openai/clip-vit-base-patch32[0m


In [4]:
dataset = Dataset.from_pandas(df_db.to_pandas())
fn_kwargs = {'col': f'db_file_path', 'processor': img_processor, 'model': img_model}
dataset = dataset.map(Model().pp_img, batched=True, batch_size=128, fn_kwargs=fn_kwargs)
dataset.set_format(type='numpy', columns=['img_embed'], output_all_columns=True)

# save to disk
path_tmp_array = Path('tmp/array')
path_tmp_ds = Path('tmp/ds')
np.save(path_tmp_array / 'array.npy', dataset['img_embed'])
dataset.save_to_disk(path_tmp_ds / 'ds')

[32m13:34:36[0m | [1mINFO[0m | [36m__init__[0m | [1m[Model] Run on: cuda[0m


Map:   0%|          | 0/10191 [00:00<?, ? examples/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 450.00 MiB. GPU 0 has a total capacity of 7.78 GiB of which 265.81 MiB is free. Process 1610 has 132.68 MiB memory in use. Process 8291 has 17.46 MiB memory in use. Process 135909 has 294.00 MiB memory in use. Process 147787 has 3.47 GiB memory in use. Including non-PyTorch memory, this process has 3.29 GiB memory in use. Of the allocated memory 1.97 GiB is allocated by PyTorch, and 44.76 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## 3) Indexing

Build index to search items

In [None]:
path_index = Path('tmp/index')
build_index(
    str(path_tmp_array),
    index_path=str(path_index / f'ip.index'),
    index_infos_path=str(path_index / f'index.json'),
    save_on_disk=True,
    metric_type='ip',
    verbose=30,
)

Load index into datasets

In [None]:
dataset_db = concatenate_datasets([
    load_from_disk(str(f)) for f in sorted(path_tmp_ds.glob('*'))
])

# add index
dataset_db.load_faiss_index('img_embed', path_index / f'ip.index')

## 4) Retrieve

Batch search top-k from datasets

In [None]:
score, result = dataset_db.get_nearest_examples_batch(
    'img_embed',
    np.asarray(dataset_db['img_embed']),
    k=5
)

dict_ = {'score_img': [list(i) for i in score]}
df_score = pl.DataFrame(dict_)
df_result = pl.DataFrame(result).drop(['img_embed'])

## 5) Post process

In [None]:
df_match = pl.concat([df_q, df_result, df_score], how='horizontal')
col_explode = [i for i in df_match.columns if 'db' in i] + ['score_img']
df_match = df_match.explode(col_explode)

In [None]:
df_match

In [10]:
# df_match.write_csv(path / 'match.csv')