In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import tqdm
import tqdm.notebook
from img2vec_pytorch import Img2Vec
from IPython.display import clear_output
from more_itertools import chunked
from PIL import Image
from qdrant_client import QdrantClient

import conf

In [None]:
# Load the article metadata table. article_id is forced to text because the
# ingestion loop later compares it against image file stems, which are strings.
articles_df = pd.read_csv(filepath_or_buffer="./data/articles.csv", dtype={"article_id": "str"})

In [2]:
# Eyeball five randomly chosen rows (no seed is set, so the rows differ per run).
articles_df.sample(5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
76949,787185003,787185,SOLNA crewneck,252,Sweater,Garment Upper body,1010014,Placement print,73,Dark Blue,...,Young Boy Jersey Fancy,I,Children Sizes 134-170,4,Baby/Children,47,Young Boy,1005,Jersey Fancy,Top in sweatshirt fabric made from a cotton bl...
47344,688662002,688662,2P SKINNY CHEAPO,272,Trousers,Garment Lower body,1010023,Denim,7,Grey,...,Kids Boy Denim,H,Children Sizes 92-140,4,Baby/Children,46,Kids Boy,1016,Trousers Denim,Slim-fit jeans in different colours of superst...
66581,748566023,748566,SARGASSO HW ankle tights,273,Leggings/Tights,Garment Lower body,1010001,All over pattern,19,Greenish Khaki,...,Ladies Sport Bottoms,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,"High-waisted, ankle-length sports tights in fa..."
47137,687934001,687934,Sister off shoulder,252,Sweater,Garment Upper body,1010016,Solid,9,Black,...,Tops Fancy Jersey,D,Divided,2,Divided,53,Divided Collection,1005,Jersey Fancy,Off-the-shoulder top in sweatshirt fabric with...
13162,556584001,556584,Christina patent boot,87,Boots,Shoes,1010016,Solid,9,Black,...,Divided Shoes,D,Divided,2,Divided,52,Divided Accessories,1020,Shoes,Boots in imitation patent leather with lacing ...


In [3]:
# Article attributes that get exported as filter definitions.
# One column name per line; the order here is the order in which the filter
# entries are built and written to filters.json.
columns = """
    colour_group_name
    department_name
    garment_group_name
    graphical_appearance_name
    index_group_name
    index_name
    perceived_colour_master_name
    perceived_colour_value_name
    product_group_name
    product_type_name
    section_name
""".split()

In [6]:
# Build one filter definition per selected column: its name, a display name
# (currently the raw column name) and the distinct values found in the data.
filters = [
    {
        "name": col_name,
        "display_name": col_name,
        # dropna(): a missing value would otherwise be written by json.dump as
        # the non-standard token NaN, which strict JSON parsers reject.
        # tolist(): converts numpy scalars to plain Python objects so that the
        # values are always JSON-serialisable.
        "values": articles_df[col_name].dropna().unique().tolist(),
    }
    for col_name in columns
]

In [7]:
import json

In [10]:
# Persist the filter definitions as indented JSON in the working directory.
with open("filters.json", mode="w") as filters_file:
    json.dump(obj=filters, fp=filters_file, indent=2)

In [None]:
# Image embedding model. Use the GPU when one is available and fall back to
# the CPU otherwise, instead of failing on machines without CUDA.
img2vec = Img2Vec(
    cuda=torch.cuda.is_available(),
    model=conf.MODEL_NAME,
    layer_output_size=conf.VECTOR_SIZE,
)

In [None]:
# Connect to the Qdrant instance named in the project configuration.
client = QdrantClient(
    host=conf.QDRANT_HOST,
    port=conf.QDRANT_PORT,
)

In [None]:
# (Re)create the target collection with the configured vector size and
# distance function.
# NOTE(review): recreate_collection presumably drops an existing collection of
# the same name, so re-running this cell discards previously uploaded points -
# confirm against the qdrant-client version in use.
client.recreate_collection(
    collection_name=conf.COLLECTION_NAME,
    vector_size=conf.VECTOR_SIZE,
    distance=conf.DISTANCE_FUNCTION,
)

In [None]:
# Collect every .jpg below ./data/images, searching sub-directories recursively.
image_paths = [*Path("./data/images").rglob("*.jpg")]

In [None]:
from qdrant_client.http.models import PointsBatch, Batch

In [None]:
# Embed every product image and upload it to Qdrant, conf.BATCH_SIZE images
# per request. Images that cannot be processed are reported and skipped.
chunks = chunked(image_paths, conf.BATCH_SIZE)
# list(...) materialises the batches so the progress bar has a known length.
for image_batch in tqdm.notebook.tqdm(list(chunks)):
    ids = []
    payloads = []
    vectors = []

    for path in image_batch:
        try:
            # The file name without its extension is the article id; its
            # integer form is used as the point id.
            article_id = path.stem
            point_id = int(article_id)

            article_rows = articles_df[articles_df["article_id"] == article_id]
            if article_rows.empty:
                # Previously .iloc[0] raised an uncaught IndexError here and
                # aborted the whole upload.
                print(f"{path}: no row in articles_df for article_id {article_id!r}, skipping")
                continue
            # NaN is swapped for None before building the payload dict
            # (presumably because NaN cannot be sent as JSON - confirm).
            article = article_rows.iloc[0].replace({np.nan: None}).to_dict()

            # The context manager closes the image file as soon as the RGB
            # copy has been made.
            with Image.open(path) as raw_image:
                img = raw_image.convert("RGB")
            vector = img2vec.get_vec(img, tensor=False).tolist()
        except (ValueError, OSError) as e:
            # ValueError: non-numeric file name or a failure while embedding.
            # OSError: unreadable or corrupt image file.
            print(f"{path}: {e}")
            continue

        # Append only after every step succeeded so the three lists always
        # stay index-aligned.
        ids.append(point_id)
        payloads.append(article)
        vectors.append(vector)

    if not ids:
        # Every image in this batch was skipped; nothing to upload.
        continue

    client.http.points_api.upsert_points(
        collection_name=conf.COLLECTION_NAME,
        wait=True,
        point_insert_operations=PointsBatch(
            batch=Batch(
                ids=ids,
                payloads=payloads,
                vectors=vectors,
            )
        ),
    )