In [82]:
import json
import typing as t

# data prep
import pandas as pd
import numpy as np

# for creating image vector embeddings
from PIL import Image
from img2vec_pytorch import Img2Vec

# for creating semantic (text-based) vector embeddings
from sentence_transformers import SentenceTransformer



In [86]:
# Read the product catalogue (malformed CSV lines are skipped), keep only
# rows with no missing values, and store the year as an integer.
metadata = (
    pd.read_csv("./styles.csv", on_bad_lines='skip')
    .dropna()
    .astype({"year": int})
)
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44077 entries, 0 to 44423
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  44077 non-null  int64 
 1   gender              44077 non-null  object
 2   masterCategory      44077 non-null  object
 3   subCategory         44077 non-null  object
 4   articleType         44077 non-null  object
 5   baseColour          44077 non-null  object
 6   season              44077 non-null  object
 7   year                44077 non-null  int64 
 8   usage               44077 non-null  object
 9   productDisplayName  44077 non-null  object
dtypes: int64(2), object(8)
memory usage: 3.7+ MB


In [87]:
# Lower-cased sentence per product; this is the text that gets embedded later.
metadata["product_text"] = [
    f"name {name} category {category} subcategory {subcategory} color {colour} gender {gender}".lower()
    for name, category, subcategory, colour, gender in zip(
        metadata["productDisplayName"],
        metadata["masterCategory"],
        metadata["subCategory"],
        metadata["baseColour"],
        metadata["gender"],
    )
]
# The rest of the notebook refers to the identifier column as "product_id".
metadata = metadata.rename(columns={"id": "product_id"})

metadata.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 44077 entries, 0 to 44423
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   product_id          44077 non-null  int64 
 1   gender              44077 non-null  object
 2   masterCategory      44077 non-null  object
 3   subCategory         44077 non-null  object
 4   articleType         44077 non-null  object
 5   baseColour          44077 non-null  object
 6   season              44077 non-null  object
 7   year                44077 non-null  int64 
 8   usage               44077 non-null  object
 9   productDisplayName  44077 non-null  object
 10  product_text        44077 non-null  object
dtypes: int64(2), object(9)
memory usage: 4.0+ MB


In [88]:
# check out one of the texts we will use to create semantic embeddings
# (positional lookup: row label 0 is not guaranteed to survive the earlier dropna)
metadata["product_text"].iloc[0]

'name turtle check men navy blue shirt category apparel subcategory topwear color navy blue gender men'

In [89]:
# Image embedder (Resnet-18), running on CPU
img2vec = Img2Vec(cuda=False)

# Text embedder: a BERT-variant sentence-transformers checkpoint
text_model_name = 'sentence-transformers/all-distilroberta-v1'
model = SentenceTransformer(text_model_name)

In [90]:
def get_batch(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    ``seq`` only needs to support ``len()`` and slicing. The last slice is
    shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    # Start offsets are computed up front, so an invalid ``size`` (e.g. 0)
    # fails at call time rather than on first iteration.
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

def generate_image_vectors(products, image_base_path, batch_size=1000):
    """Embed product images and return a ``{product_id: vector}`` dict.

    Args:
        products: DataFrame with a ``product_id`` column. The image for each
            product is read from ``<image_base_path>/<product_id>.jpg``.
        image_base_path: Directory containing the product images.
        batch_size: Number of products embedded per ``img2vec.get_vec`` call.

    Returns:
        Dict mapping each product id whose image could be loaded to the
        vector returned by ``img2vec`` for it. Products whose image is
        missing or unreadable are left out.
    """
    output_dict = {}

    for batch in get_batch(products, batch_size):
        product_ids = batch['product_id'].values.tolist()
        images = []
        converted = []
        skipped = 0

        for _id in product_ids:
            img_path = f"{image_base_path}/{_id}.jpg"
            try:
                # Context manager closes the underlying file handle; the
                # converted/resized copy lives in memory independently.
                with Image.open(img_path) as raw_img:
                    img = raw_img.convert('RGB').resize((224, 224))
            except Exception:
                # Best effort: skip missing/corrupt images. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit still stop the run.
                skipped += 1
                continue
            images.append(img)
            converted.append(_id)

        # Generate vectors for all images in this batch. Skip the model call
        # when nothing could be loaded so it never receives an empty list.
        if images:
            # Relies on get_vec returning one vector per image, in input order.
            vec_list = img2vec.get_vec(images)
            output_dict.update(zip(converted, vec_list))

        if skipped:
            print(f"Skipped {skipped} unreadable product images")
        # Report the real batch length; the final batch may be shorter.
        print(f"Processed {len(batch)} product images")

    return output_dict

def generate_text_vectors(products_df):
    """Embed each product's ``product_text`` and return ``{product_id: vector}``.

    Args:
        products_df: DataFrame with ``product_id`` and ``product_text`` columns.

    Returns:
        Dict mapping product id to the ``float32`` array produced by
        ``model.encode`` for that product's text.
    """
    text_vectors = {}
    product_ids = products_df["product_id"].tolist()
    product_texts = products_df["product_text"].tolist()

    # Count rows positionally: the DataFrame index has gaps after dropna, so
    # index labels do not reflect how many rows have been processed.
    for count, (_id, text) in enumerate(zip(product_ids, product_texts), start=1):
        text_vectors[_id] = model.encode(text).astype(np.float32)
        if count % 1000 == 0:
            print(f"Processed {str(count)} product text fields")
    return text_vectors

# combine text and image vectors into one list of per-product records
def combine_vector_dicts(txt_vectors, img_vectors, products):
    """Join text and image vectors into one JSON-serialisable record per product.

    Args:
        txt_vectors: Dict mapping product id to a text vector (array-like
            with ``.tolist()``).
        img_vectors: Dict mapping product id to an image vector (array-like
            with ``.tolist()``).
        products: DataFrame with a ``product_id`` column; its row order
            determines the order of the result.

    Returns:
        List of ``{"text_vector", "img_vector", "product_id"}`` dicts for the
        products that have BOTH vectors. Products missing either are skipped.

    Raises:
        KeyError: if ``products`` has no ``product_id`` column.
    """
    product_vectors = []
    # Iterate the id column directly: ``.tolist()`` yields plain Python
    # scalars, whereas ``iterrows`` coerces each row to a single dtype and can
    # hand back numpy scalars (or floats) that ``json.dumps`` rejects.
    for _id in products["product_id"].tolist():
        if _id not in txt_vectors or _id not in img_vectors:
            continue
        product_vectors.append({
            "text_vector": txt_vectors[_id].tolist(),
            "img_vector": img_vectors[_id].tolist(),
            "product_id": _id,
        })
    return product_vectors

def write_product_vector_json(vector_dict, output_path="./product_vectors.json"):
    """Serialise the product vector records to a JSON file.

    Args:
        vector_dict: JSON-serialisable object to write (the notebook passes
            the list built by ``combine_vector_dicts``).
        output_path: Destination file; defaults to ``./product_vectors.json``.
    """
    # Serialise before opening the file so a serialisation error cannot leave
    # a truncated or empty file behind.
    product_vector_json = json.dumps(vector_dict)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(product_vector_json)

def write_product_metadata_json(metadata_df, output_path="./product_metadata.json"):
    """Write one ``{"product_id", "product_metadata"}`` record per row to JSON.

    Args:
        metadata_df: DataFrame containing ``product_id`` plus the catalogue
            columns listed in ``field_columns`` below.
        output_path: Destination file; defaults to ``./product_metadata.json``.

    Raises:
        KeyError: if any required column is missing from ``metadata_df``.
    """
    # JSON key -> source column, in the order the keys are written out.
    field_columns = {
        "name": "productDisplayName",
        "gender": "gender",
        "master_category": "masterCategory",
        "sub_category": "subCategory",
        "article_type": "articleType",
        "base_color": "baseColour",
        "season": "season",
        "year": "year",
        "usage": "usage",
    }
    columns = ["product_id", *field_columns.values()]

    products = []
    # itertuples keeps each column's own dtype and yields plain Python
    # scalars, so the values serialise without per-row dtype coercion.
    for product_id, *values in metadata_df[columns].itertuples(index=False, name=None):
        products.append({
            "product_id": product_id,
            "product_metadata": dict(zip(field_columns, values)),
        })

    # Serialise before opening the file so a failure cannot truncate it.
    products_json = json.dumps(products)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(products_json)


In [91]:
data_path = "../app/vecsim_app/static/images"
num_products = 10000

# Slice once so every step (including the combine step, which previously
# walked the full catalogue) works on exactly the same products.
products_subset = metadata[:num_products]

image_vectors = generate_image_vectors(products_subset, data_path, batch_size=1000)
text_vectors = generate_text_vectors(products_subset)
vector_dict = combine_vector_dicts(text_vectors, image_vectors, products_subset)
write_product_vector_json(vector_dict)
write_product_metadata_json(products_subset)



Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 1000 product images
Processed 0 product text fields
Processed 1000 product text fields
Processed 2000 product text fields
Processed 3000 product text fields
Processed 4000 product text fields
Processed 5000 product text fields
Processed 6000 product text fields
Processed 7000 product text fields
Processed 8000 product text fields
Processed 9000 product text fields
Processed 10000 product text fields
