In [None]:
from tqdm import tqdm
from time import sleep

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
from datasets import load_dataset
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from pinecone_text.sparse import BM25Encoder

In [None]:
import torch
import os

In [None]:
# Project-local helper; in this notebook it supplies the Pinecone API key
# and the index name.
from UDCUtils import UDCUtils
utils = UDCUtils()

# Fetch the key through the helper rather than hardcoding it in the notebook.
pinecone_api_key = utils.get_pinecone_api_key()

In [None]:
# Confirm the key was loaded without echoing it: cell outputs are stored in the
# notebook file and would leak the secret when the notebook is shared.
"Pinecone API key loaded" if pinecone_api_key else "Pinecone API key is missing"

In [None]:
# Client used for all index management calls (list / delete / create / connect).
# The instance is bound to the name `pinecone` because later cells refer to it.
pinecone = Pinecone(api_key=pinecone_api_key)

In [None]:
# Ask the helper for an index name and drop its last three characters.
# NOTE(review): the reason for trimming three characters is not visible here — confirm.
index_name = utils.create_dlai_index_name("dev-002")[:-3]

# Start from a clean slate: remove an index of the same name if one already exists.
existing_index_names = [entry['name'] for entry in pinecone.list_indexes()]
if index_name in existing_index_names:
    print(f"{index_name} is an existing index. Kindly delete it.")
    pinecone.delete_index(index_name)

In [None]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU and say so.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print("Sorry, cuda is not available. Proceeding ahead with CPU.")

In [None]:
# Serverless index that will hold the dense + sparse (hybrid) vectors.
# dimension=512 has to match the size of the dense embeddings that get upserted
# (the CLIP vector shape is inspected in a later cell).
serverless_spec = ServerlessSpec(region='us-east-1', cloud='aws')

pinecone.create_index(
    spec=serverless_spec,
    dimension=512,
    metric='dotproduct',
    name=index_name,
)

In [None]:
# Handle to the index; the upsert and query cells below go through it.
index = pinecone.Index(index_name)

In [None]:
# Exploratory only: prints the load_dataset docstring; nothing below depends on it.
help(load_dataset)

In [None]:
#pip install ipywidgets==7.7.2

In [None]:
# Load the training split of the small fashion-product image dataset.
fashion = load_dataset("ashraq/fashion-product-images-small", split="train")

# Show the dataset summary as the cell output.
fashion

In [None]:
# Keep the image column on its own; later cells index into it by row position.
images = fashion['image']
images[0]  # preview the first image

In [None]:
# Everything except the image column, i.e. the per-product metadata.
metadata = fashion.remove_columns('image')
metadata[0]  # preview the first record

In [None]:
# Build the DataFrame from `fashion` rather than from `metadata` itself, so that
# re-running this cell does not fail on a value that is already a DataFrame.
metadata = fashion.remove_columns('image').to_pandas()
metadata.head()

### Create sparse vector using BM25Encoder

In [None]:
# Fit the BM25 encoder on the product display names.
bm25encoder = BM25Encoder()
product_names = metadata['productDisplayName']
bm25encoder.fit(product_names)

# Show the first display name for reference.
product_names[0]

In [None]:
# Encode the same product name as a query and as a document. A cell only displays
# its last expression, so both encodings are returned together in one dict.
sample_name = metadata['productDisplayName'][0]
{
    'query': bm25encoder.encode_queries(sample_name),
    'document': bm25encoder.encode_documents(sample_name),
}

### Create dense vector using CLIP

In [None]:
pip install ipywidgets==7.7.2

In [None]:
# CLIP checkpoint served through sentence-transformers, placed on the chosen device.
model = SentenceTransformer('sentence-transformers/clip-ViT-B-32', device=device)

# Sanity check: encode one product name and inspect the embedding size.
sample_text = metadata['productDisplayName'][0]
clip_vector = model.encode(sample_text)
clip_vector.shape

In [None]:
len(fashion)  # number of records in the loaded split

In [None]:
batch_size = 100
fashion_data_num = 1000

# Number of records to index: the requested amount, capped by what the dataset holds.
total_records = min(fashion_data_num, len(fashion))

for i in tqdm(range(0, total_records, batch_size)):
    # Clamp the batch end to total_records (not len(fashion)) so that a
    # fashion_data_num that is not a multiple of batch_size is still respected.
    i_end = min(i + batch_size, total_records)

    # Metadata rows of this batch, kept as dicts to attach to each vector.
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")

    # Concatenate all metadata fields except id and year into a single string
    # per product. Assumes the remaining columns hold strings — TODO confirm.
    text_columns = meta_batch.loc[:, ~meta_batch.columns.isin(['id', 'year'])]
    text_batch = [" ".join(row) for row in text_columns.values.tolist()]

    # Images belonging to the same rows.
    img_batch = images[i:i_end]

    # Sparse vectors from the metadata text (BM25).
    sparse_embeds = bm25encoder.encode_documents(text_batch)

    # Dense vectors from the images (CLIP).
    dense_embeds = model.encode(img_batch).tolist()

    # IDs are the row positions as strings; the query cells convert them back
    # with int(...) to look up the matching image.
    ids = [str(x) for x in range(i, i_end)]

    # One record per product in the shape the index expects for hybrid vectors.
    upserts = [
        {
            'id': _id,
            'sparse_values': sparse,
            'values': dense,
            'metadata': meta,
        }
        for _id, sparse, dense, meta in zip(ids, sparse_embeds, dense_embeds, meta_dict)
    ]

    # Upload the batch to the hybrid index.
    index.upsert(upserts)

# Show the index description after uploading the documents.
index.describe_index_stats()

In [None]:
query = "navy blue shirt for women"

# Encode the query text twice: a dense vector from CLIP and a sparse one from BM25.
dense_vec = model.encode(query).tolist()
sparse_vec = bm25encoder.encode_queries(query)

# Hybrid query with both representations, unscaled.
result = index.query(
    vector=dense_vec,
    sparse_vector=sparse_vec,
    top_k=10,
    include_metadata=True,
)

In [None]:
# Match IDs are stringified row positions, so int(id) indexes straight into `images`.
imgs = [images[int(match["id"])] for match in result["matches"]]

In [None]:
from IPython.core.display import HTML
from io import BytesIO
from base64 import b64encode

# function to display product images
def display_result(image_batch):
    """Render a batch of images as a wrapping row of thumbnails.

    Each image is saved as PNG into an in-memory buffer, base64-encoded and
    embedded in an <img> tag as a data URI.

    Args:
        image_batch: iterable of image objects exposing
            ``save(fileobj, format=...)`` (presumably PIL images — confirm).

    Returns:
        An ``HTML`` object wrapping all figures in a flex container.
    """
    def _as_figure(picture):
        # serialise the image to PNG bytes in memory, then inline it as base64
        png_buffer = BytesIO()
        picture.save(png_buffer, format='png')
        encoded = b64encode(png_buffer.getvalue()).decode('utf-8')
        return f'''
            <figure style="margin: 5px !important;">
              <img src="data:image/png;base64,{encoded}" style="width: 90px; height: 120px" >
            </figure>
        '''

    gallery = ''.join(_as_figure(picture) for picture in image_batch)
    return HTML(data=f'''
        <div style="display: flex; flex-flow: row wrap; text-align: center;">
        {gallery}
        </div>
    ''')

In [None]:
# Render the images of the matches as an HTML gallery.
display_result(imgs)

In [None]:
# List the display name stored in the metadata of every match.
for match in result["matches"]:
    product = match["metadata"]
    print(product["productDisplayName"])

### Scaling the hybrid search

In [None]:
def hybrid_scale(dense, sparse, alpha: float):
    """Weight a dense/sparse vector pair using a convex combination.

    alpha * dense + (1 - alpha) * sparse

    Args:
        dense: iterable of floats, the dense vector
        sparse: a dict of `indices` and `values`
        alpha: float between 0 and 1 where 0 == sparse only
               and 1 == dense only

    Returns:
        A tuple ``(hdense, hsparse)``: the dense values multiplied by alpha as
        a list, and a new dict with the same `indices` and the sparse `values`
        multiplied by ``1 - alpha``.

    Raises:
        ValueError: if alpha is below 0 or above 1.
    """
    if alpha < 0 or alpha > 1:
        raise ValueError("Alpha must be between 0 and 1")

    # the sparse side gets whatever weight the dense side does not take
    sparse_weight = 1 - alpha
    scaled_sparse = dict(
        indices=sparse['indices'],
        values=[value * sparse_weight for value in sparse['values']],
    )

    scaled_dense = []
    for component in dense:
        scaled_dense.append(component * alpha)

    return scaled_dense, scaled_sparse

In [None]:
search_query = "navy blue shirt for women"

# Dense (CLIP) and sparse (BM25) encodings of the same query text.
sparse = bm25encoder.encode_queries(search_query)
dense = model.encode(search_query).tolist()

# alpha=1 is the extreme setting: the dense vector keeps full weight and the
# sparse values are scaled to zero.
hdense, hsparse = hybrid_scale(dense, sparse, alpha=1)

response = index.query(
    vector=hdense,
    sparse_vector=hsparse,
    top_k=3,
    include_metadata=True,
)

In [None]:
# Look up the image of every match; IDs are stringified row positions.
imgs = [images[int(match["id"])] for match in response["matches"]]

In [None]:
len(imgs)  # number of images collected from the matches

In [None]:
# Render the images of the matches as an HTML gallery.
display_result(imgs)

In [None]:
# List the display name stored in the metadata of every match.
for match in response['matches']:
    product = match["metadata"]
    print(product["productDisplayName"])

### Sparse-weighted hybrid search

In [None]:
# alpha=0 scales the dense vector to zero, leaving a sparse-only (keyword) search.
hdense, hsparse = hybrid_scale(dense, sparse, alpha=0)

# Query with the *scaled* vectors; sending the unscaled `dense`/`sparse` would
# make the choice of alpha irrelevant.
response = index.query(
    top_k=3,
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)

In [None]:
# Look up the image of every match; IDs are stringified row positions.
imgs = [images[int(match["id"])] for match in response["matches"]]

In [None]:
# Render the images of the matches as an HTML gallery.
display_result(imgs)

In [None]:
# List the display name stored in the metadata of every match.
for match in response["matches"]:
    product = match["metadata"]
    print(product["productDisplayName"])

### Experiment with the value of alpha

In [None]:
# alpha=0.5 gives the dense and sparse parts equal weight.
hdense, hsparse = hybrid_scale(dense, sparse, alpha=0.5)

# Query with the *scaled* vectors; sending the unscaled `dense`/`sparse` would
# make the choice of alpha irrelevant.
response = index.query(
    top_k=10,
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)

In [None]:
# Collect the image of every match (IDs are stringified row positions) and render them.
imgs = [images[int(match["id"])] for match in response["matches"]]
display_result(imgs)

In [None]:
# Print the score next to each product name. Single quotes are used inside the
# f-string because reusing the outer double quotes is a SyntaxError before Python 3.12.
for r in response["matches"]:
    print(f"{r['score']}: {r['metadata']['productDisplayName']}")