In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from typing import TypedDict, Literal, List, Union
import os
import json
import re


from langchain_core.tools import tool
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage,ToolMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from dotenv import load_dotenv
from tqdm import tqdm
from langchain_core.documents import Document
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv("./.env")

True

### Langchain Engines

In [3]:
class LangchainJSONEngine:
    def __init__(self, sampleBaseModel: BaseModel, systemPromptText: str=None, humanPromptText: str=None):
        self.llm = llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
        self.structured_llm = llm.with_structured_output(sampleBaseModel)
        
        if systemPromptText is None:
            self.systemPromptText = """
            You are an AI assistant. You are helping a user with a task. The user is asking you questions and you are answering them.
            """
        else:
            self.systemPromptText = systemPromptText

        if humanPromptText is None:
            self.HumanPromptText = """
            Human: {query}
            """
        else:
            self.humanPromptText = humanPromptText

        self.prompt = ChatPromptTemplate.from_messages(
            [("system", self.systemPromptText), ("human", "Query:\n\n {query}")])
        
        self.micro_agent = self.prompt | self.structured_llm

    def run(self, query: str):
        result = self.micro_agent.invoke({
            "query": query
        }) 
        return result
    

class LangchainSimpleEngine:
    def __init__(self, tools:List[tool]=[], systemPromptText: str=None, humanPromptText: str=None):
        self.llm = llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
        self.tools = tools
        
        if len(tools) == 0:
            self.llm_with_tools = llm
        else:
            self.llm_with_tools = llm.bind_tools(tools)
            
        if systemPromptText is None:
            self.systemPromptText = """
            You are an AI assistant. You are helping a user with a task. The user is asking you questions and you are answering them.
            """
        else:
            self.systemPromptText = systemPromptText

        if humanPromptText is not None: 
            print("Skipping human prompt text ...")

    def run(self, query: str):
        messages = [
            SystemMessage(self.systemPromptText),
            HumanMessage(content=query)
        ]
        level1_result = self.llm_with_tools.invoke(messages)
        if len(level1_result.tool_calls) == 0:
            print("No tools to run ...")
            return level1_result
        else:
            print("Running tools ...")
            for tool_call in level1_result.tool_calls:
                tool_output = tool_call.invoke()
                messages.append(ToolMessage(tool_output, tool_call_id=tool_call["id"]))
            level2_result = self.llm_with_tools.invoke(messages)
            return level2_result

### Get Stored Result

In [4]:
def get_stored_result(path:str, type:Literal["json","csv"]):
    # Check if the path is valid
    if not os.path.exists(path):
        return None 
    if type == "json":
        return pd.read_json(path).to_dict()
    elif type == "csv":
        with open(path, 'r') as f:
            data = json.load(f)
        return data
    else:
        return None 
    
def save_result(result, path:str, type:Literal["json","csv"]):
    if type == "json":
        with open(path, 'w') as f:
            json.dump(result, f, indent=4)
    elif type == "csv":
        #  check if the result is a dataframe
        if isinstance(result, pd.DataFrame):
            result.to_csv(path, index=False)
        else:
            print("The result is not a dataframe")
            raise ValueError("The result is not a dataframe")
    else:
        print("Invalid type")
        raise ValueError("Invalid type")

### Vector database

In [5]:
def connect_to_qdrant(url:str, collection_name:str, vector_name:str):
    qdrant_client = QdrantClient(url)
    vector_store = QdrantVectorStore(qdrant_client, collection_name, vector_name)
    return vector_store

### Reading Flipkart Data

In [6]:
def generate_flipkart_dataframe(path:str):
        
    flip_df_raw = pd.read_csv('./data/flipkart_com-ecommerce_sample.csv')
    # Drop rows with missing values
    flip_df = flip_df_raw.dropna()
    # Reset index
    flip_df = flip_df.reset_index(drop=True)

    # Drop columns that are not needed
    flip_df['product_category_tree'] = flip_df['product_category_tree'].apply(lambda x: list(map(lambda y: y.lower().replace('["','').replace('"]','').strip() , x.split('>>'))))

    column_names = list(flip_df.columns)
    
    first_order_categories = flip_df['product_category_tree'].apply(lambda x: x[0]).tolist()

    # print("Total categories: ", len((first_order_categories)))
    # print("Unique categories: ", len(set(first_order_categories)))
    # Category wise distribution in one line
    cat_distr = pd.Series(first_order_categories).value_counts()
    # Drop those categories which have less than 10 products
    first_order_categories_distr = cat_distr[cat_distr>10]
    # print(first_order_categories_distr)
    first_order_categories_trimmed = list(first_order_categories_distr.keys())
    # Remove the rows from the dataframe which are not in the first order categories
    flip_df = flip_df[flip_df['product_category_tree'].apply(lambda x: x[0] in first_order_categories_trimmed)]

    # Reset index
    flip_df = flip_df.reset_index(drop=True)

    return flip_df

In [7]:
flip_df = generate_flipkart_dataframe('./data/flipkart_com-ecommerce_sample.csv')

In [8]:
# save_result(flip_df, './data/flipkart_com-ecommerce_sample_cleaned.csv', "csv")

### Dividing the each first order categories into their respective requirement chart  (LLM)

In [9]:
class RequirementBaseModel(BaseModel):
    requiremets : List[str] = Field(title="Requirements", description="List of requirements", example=["price", "rating", "brand", "discount","age"])

In [10]:
def get_category_wise_data(df, stored_result_path:str, use_stored_result:bool = True):

    if use_stored_result is not None and stored_result_path is not None:
        stored_result = get_stored_result(stored_result_path, "json")
        if stored_result is not None:
            print("Using stored result ...")
            return stored_result

    unique_categories = list(df['product_category_tree'].apply(lambda x: x[0]).unique())

    
    requirement_engine = LangchainJSONEngine(
            sampleBaseModel=RequirementBaseModel,
            systemPromptText=
            """You are an AI assistant. 
            Your task is to determine all possible requirements for a product category. 
            For example the requirements for clothing could be [ age group ( for kids, adults etc. ) , size ( small, medium, large etc. ), color (white, fancy etc) ... ]
            Generate a list of DETAILED requirements for a product category. Atleast 10 requirements should be generated for each category.
            Don't use any indexing or numbering in the requirements.
            """
        )
    
    category_wise_requirements = {}

    for category in unique_categories:
        print(f"Running for category: {category} ...")
        requirements = requirement_engine.run(category)
        category_wise_requirements[category] = requirements.dict()['requiremets']

    if stored_result_path is not None:
        save_result(category_wise_requirements, stored_result_path, "json")

    return category_wise_requirements

In [11]:
category_wise_requirements = get_category_wise_data(flip_df, stored_result_path="./stored-result/category_wise_requirements.json")

Using stored result ...


In [12]:
category_wise_requirements['bags, wallets & belts']

{0: 'Material (leather, fabric, synthetic)',
 1: 'Type (tote, crossbody, clutch, wallet, belt)',
 2: 'Color (black, brown, tan, colorful)',
 3: 'Size (small, medium, large)',
 4: 'Closure (zipper, magnetic, buckle)',
 5: 'Brand (designer, luxury, casual)',
 6: 'Compartments (single, multiple)',
 7: 'Strap type (adjustable, non-adjustable)',
 8: 'Pattern (solid, printed, textured)',
 9: 'Features (RFID protection, detachable strap, card slots)'}

In [13]:
category_wise_requirements.keys()

dict_keys(['clothing', 'furniture', 'footwear', 'pet supplies', 'pens & stationery', 'bags, wallets & belts', 'home decor & festive needs', 'automotive', 'tools & hardware', 'sports & fitness', 'home furnishing', 'baby care', 'mobiles & accessories', 'toys & school supplies', 'jewellery', 'kitchen & dining', 'beauty and personal care', 'home & kitchen', 'computers', 'watches', 'cameras & accessories', 'health & personal care appliances', 'gaming', 'home improvement', 'sunglasses', 'home entertainment'])

## Group Proucts based on their categories

In [14]:
def group_by_category(flip_df, category_wise_requirements, stored_result_path:str, use_stored_result:bool = True):
    """
    - Clothing 
        - Age group : [ PID1, PID2, PID3 ... ]
        - Size: [ PID11, PID12, PID13 ... ]
        - Color: 
        - Material : 
        ...
    - Footwear
    - Watches
    """

    if use_stored_result is not None and stored_result_path is not None:
        stored_result = get_stored_result(stored_result_path, "json")
        if stored_result is not None:
            print("Using stored result ...")
            return stored_result

    category_wise_data = {}

    for category in category_wise_requirements.keys():
        print(f"Running for category: {category} ...")
        category_data = {}
        for requirement in category_wise_requirements[category]:
            print(f"Running for requirement: {requirement} ...")
            requirement_engine = LangchainSimpleEngine(
                tools=[OpenAIEmbeddings()],
                systemPromptText=
                f"""
                You are an AI assistant. 
                Your task is to group the products in the category {category} based on the requirement {requirement}.
                """
            )

            result = requirement_engine.run(requirement)
            category_data[requirement] = result.dict()['output']

        category_wise_data[category] = category_data

    if stored_result_path is not None:
        save_result(category_wise_data, stored_result_path, "json")

    return category_wise_data

## Generating Descriptions

In [15]:
def generate_compact_description(df):
    # df['first_order_category'] = df['product_category_tree'].apply(lambda x: x[0])
    df['first_order_category'] = df['product_category_tree'].apply(lambda x: re.sub(r'[^a-zA-Z]+', '_', x[0]))
    df['compact_description'] = "Product Name: " + df['product_name'] 
    df['compact_description'] += "\nBrand: " + df['brand']
    df['compact_description'] += "\nCategory: " + df['product_category_tree'].apply(lambda x: '->'.join(x)) 
    df['compact_description'] += "\nDescription: " + df['description']
    df['compact_description'] += "\nSpecification: " + df['product_specifications'].apply(lambda x: x.replace('"key"=>','').replace('"value"=>','').replace('=>',':').replace('"product_specification":',''))
    return df

In [16]:
flip_df = generate_compact_description(flip_df)

In [17]:
print(flip_df['first_order_category'])

0            clothing
1           furniture
2            footwear
3            clothing
4        pet_supplies
             ...     
13714       baby_care
13715       baby_care
13716       baby_care
13717       baby_care
13718       baby_care
Name: first_order_category, Length: 13719, dtype: object


### Converting CSV data into JSON

In [18]:
def conver_df_to_dict(df):
    data = {}
    for index, row in df.iterrows():
        row['image'] = json.loads(row['image'])
        data[row['uniq_id']] = row.to_dict()
    return data

In [19]:
dict_data = conver_df_to_dict(flip_df)

In [20]:
# save_result(dict_data, './data/flipkart_com-ecommerce_sample_cleaned_dict2.json', "json")

### Catgory wise description generation

In [21]:
def generate_category_wise_product_metadata(flip_df):
    # umique first order categories
    unique_categories = list(flip_df['first_order_category'].unique())
    category_wise_metadata = {}
    stored_category_wise_documents = {}
    for category in unique_categories:
        # those rows which belong to the category
        category_df = flip_df[flip_df['first_order_category'] == category]
        # create a document for each product with metadata {"retail_price": , "discounted_price", "uniq_id" } and page_content is the compact_description
        documents = []
        compact_descriptions = []
        uuids = []

        for index, row in category_df.iterrows():
            metadata = {"retail_price": row['retail_price'], "discounted_price": row['discounted_price'], "uniq_id": row['uniq_id']}
            documents.append(Document(page_content=row['compact_description'], metadata=metadata))
            uuids.append(row['uniq_id'])
            compact_descriptions.append([row['compact_description'],metadata])
        
        category_wise_metadata[category] = {"documents": documents, "uuids": uuids}
        stored_category_wise_documents[category] = compact_descriptions

    save_result(stored_category_wise_documents, "./stored-result/category_wise_documents.json", "json")

    return category_wise_metadata

In [22]:
category_wise_metadata = generate_category_wise_product_metadata(flip_df)

In [19]:
def qdrant_db_handler(category_wise_metadata,BATCH_SIZE=64, clip_data_count=None, reset_collections=False):
    dense_embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    # sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
    
    client = QdrantClient(url=os.getenv("QDRANT_URL"),api_key=os.getenv("QDRANT_API_KEY"))

    if reset_collections:
        for category in category_wise_metadata.keys():
            print(f"Deleting collection for category: {category} ...")
            flag = client.delete_collection(collection_name=category)
            print(f"Collection deleted: {flag}")
            print(f"Creating collection for category: {category} ...")
            try:
                client.create_collection(
                    collection_name=category,
                    vectors_config=VectorParams(size=3072, distance=Distance.COSINE),
                )
            except Exception as e:
                print(e)
                print("Collection already exists ...")
    
    for category in category_wise_metadata.keys():
        print(f"Adding documents for category: {category} ...")
        docs = category_wise_metadata[category]["documents"][0:clip_data_count]
        uuids = category_wise_metadata[category]["uuids"][0:clip_data_count]

        # upsert batch by batch

        qvector_store = QdrantVectorStore(
                client=client,
                collection_name=category,
                embedding=dense_embeddings,
            )

    
        for i in tqdm(range(0, len(docs), BATCH_SIZE)):
            try:
                qvector_store.add_documents(documents=docs[i:i+BATCH_SIZE], ids=uuids[i:i+BATCH_SIZE])
            except Exception as e:
                print(e)
                print("Retrying after 3 seconds ...")
                # Add a delay of 3 seconds
                time.sleep(3)
                i-=BATCH_SIZE
    
    return client

In [20]:
category_wise_metadata.keys()

dict_keys(['clothing', 'furniture', 'footwear', 'pet_supplies', 'pens_stationery', 'bags_wallets_belts', 'home_decor_festive_needs', 'automotive', 'tools_hardware', 'sports_fitness', 'home_furnishing', 'baby_care', 'mobiles_accessories', 'toys_school_supplies', 'jewellery', 'kitchen_dining', 'beauty_and_personal_care', 'home_kitchen', 'computers', 'watches', 'cameras_accessories', 'health_personal_care_appliances', 'gaming', 'home_improvement', 'sunglasses', 'home_entertainment'])

In [21]:
# qclient = qdrant_db_handler(category_wise_metadata, BATCH_SIZE=16, clip_data_count=None)

Adding documents for category: clothing ...


 53%|█████▎    | 104/196 [03:56<03:33,  2.32s/it]

The write operation timed out
Retrying after 3 seconds ...


 72%|███████▏  | 141/196 [05:32<02:08,  2.33s/it]

The write operation timed out
Retrying after 3 seconds ...


 76%|███████▌  | 149/196 [06:03<02:25,  3.10s/it]

The write operation timed out
Retrying after 3 seconds ...


 77%|███████▋  | 150/196 [06:13<03:54,  5.09s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 196/196 [08:17<00:00,  2.54s/it]


Adding documents for category: furniture ...


100%|██████████| 12/12 [00:30<00:00,  2.58s/it]


Adding documents for category: footwear ...


 46%|████▌     | 11/24 [00:37<00:53,  4.11s/it]

The write operation timed out
Retrying after 3 seconds ...


 50%|█████     | 12/24 [00:47<01:09,  5.81s/it]

The write operation timed out
Retrying after 3 seconds ...


 71%|███████   | 17/24 [01:19<00:40,  5.84s/it]

The write operation timed out
Retrying after 3 seconds ...


 75%|███████▌  | 18/24 [01:29<00:42,  7.02s/it]

The write operation timed out
Retrying after 3 seconds ...


 88%|████████▊ | 21/24 [01:52<00:21,  7.17s/it]

The write operation timed out
Retrying after 3 seconds ...


 92%|█████████▏| 22/24 [02:02<00:15,  7.94s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 24/24 [02:15<00:00,  5.66s/it]


Adding documents for category: pet_supplies ...


100%|██████████| 2/2 [00:04<00:00,  2.09s/it]


Adding documents for category: pens_stationery ...


100%|██████████| 11/11 [00:22<00:00,  2.09s/it]


Adding documents for category: bags_wallets_belts ...


100%|██████████| 10/10 [00:24<00:00,  2.46s/it]


Adding documents for category: home_decor_festive_needs ...


 43%|████▎     | 23/54 [00:49<01:05,  2.10s/it]

The write operation timed out
Retrying after 3 seconds ...


 50%|█████     | 27/54 [01:14<02:09,  4.81s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 54/54 [02:23<00:00,  2.66s/it]


Adding documents for category: automotive ...


 11%|█         | 7/64 [00:17<02:15,  2.38s/it]

The write operation timed out
Retrying after 3 seconds ...


 12%|█▎        | 8/64 [00:27<04:22,  4.68s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 64/64 [02:43<00:00,  2.56s/it]


Adding documents for category: tools_hardware ...


 28%|██▊       | 7/25 [00:15<00:40,  2.27s/it]

The write operation timed out
Retrying after 3 seconds ...


 64%|██████▍   | 16/25 [00:44<00:21,  2.41s/it]

The write operation timed out
Retrying after 3 seconds ...


 72%|███████▏  | 18/25 [01:02<00:40,  5.76s/it]

The write operation timed out
Retrying after 3 seconds ...


 76%|███████▌  | 19/25 [01:12<00:41,  6.93s/it]

The write operation timed out
Retrying after 3 seconds ...


 80%|████████  | 20/25 [01:22<00:39,  7.90s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 25/25 [01:42<00:00,  4.11s/it]


Adding documents for category: sports_fitness ...


100%|██████████| 7/7 [00:15<00:00,  2.23s/it]


Adding documents for category: home_furnishing ...


100%|██████████| 44/44 [01:35<00:00,  2.17s/it]


Adding documents for category: baby_care ...


100%|██████████| 29/29 [01:04<00:00,  2.23s/it]


Adding documents for category: mobiles_accessories ...


100%|██████████| 69/69 [02:30<00:00,  2.19s/it]


Adding documents for category: toys_school_supplies ...


100%|██████████| 7/7 [00:14<00:00,  2.10s/it]


Adding documents for category: jewellery ...


  4%|▍         | 9/221 [00:30<17:01,  4.82s/it]

The write operation timed out
Retrying after 3 seconds ...


 54%|█████▍    | 119/221 [04:54<06:50,  4.02s/it]

The write operation timed out
Retrying after 3 seconds ...


 54%|█████▍    | 120/221 [05:03<09:40,  5.75s/it]

The write operation timed out
Retrying after 3 seconds ...


 64%|██████▍   | 142/221 [06:21<02:51,  2.17s/it]

The write operation timed out
Retrying after 3 seconds ...


 65%|██████▍   | 143/221 [06:35<07:33,  5.81s/it]

The write operation timed out
Retrying after 3 seconds ...


 66%|██████▌   | 145/221 [06:52<08:58,  7.09s/it]

The write operation timed out
Retrying after 3 seconds ...


 75%|███████▌  | 166/221 [07:49<02:30,  2.73s/it]

The write operation timed out
Retrying after 3 seconds ...


 81%|████████  | 178/221 [08:37<02:36,  3.65s/it]

The write operation timed out
Retrying after 3 seconds ...


 81%|████████  | 179/221 [08:46<03:49,  5.45s/it]

The write operation timed out
Retrying after 3 seconds ...


 82%|████████▏ | 182/221 [09:03<03:07,  4.82s/it]

The write operation timed out
Retrying after 3 seconds ...


 83%|████████▎ | 184/221 [09:25<04:40,  7.59s/it]

The write operation timed out
Retrying after 3 seconds ...


 87%|████████▋ | 192/221 [10:21<03:38,  7.53s/it]

The write operation timed out
Retrying after 3 seconds ...


 97%|█████████▋| 215/221 [11:21<00:13,  2.25s/it]

The write operation timed out
Retrying after 3 seconds ...


 99%|█████████▊| 218/221 [11:37<00:11,  3.77s/it]

The write operation timed out
Retrying after 3 seconds ...


 99%|█████████▉| 219/221 [11:47<00:11,  5.63s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 221/221 [12:00<00:00,  3.26s/it]


Adding documents for category: kitchen_dining ...


  0%|          | 0/23 [00:00<?, ?it/s]

The write operation timed out
Retrying after 3 seconds ...


  4%|▍         | 1/23 [00:09<03:33,  9.70s/it]

The write operation timed out
Retrying after 3 seconds ...


  9%|▊         | 2/23 [00:20<03:33, 10.18s/it]

The write operation timed out
Retrying after 3 seconds ...


 13%|█▎        | 3/23 [00:30<03:24, 10.25s/it]

The write operation timed out
Retrying after 3 seconds ...


 17%|█▋        | 4/23 [00:40<03:14, 10.22s/it]

The write operation timed out
Retrying after 3 seconds ...


 22%|██▏       | 5/23 [00:51<03:04, 10.24s/it]

The write operation timed out
Retrying after 3 seconds ...


 30%|███       | 7/23 [01:05<02:11,  8.19s/it]

The write operation timed out
Retrying after 3 seconds ...


 35%|███▍      | 8/23 [01:14<02:10,  8.71s/it]

The write operation timed out
Retrying after 3 seconds ...


 61%|██████    | 14/23 [01:49<00:49,  5.49s/it]

The write operation timed out
Retrying after 3 seconds ...


 65%|██████▌   | 15/23 [01:59<00:54,  6.76s/it]

The write operation timed out
Retrying after 3 seconds ...


 74%|███████▍  | 17/23 [02:13<00:40,  6.68s/it]

The write operation timed out
Retrying after 3 seconds ...


 78%|███████▊  | 18/23 [02:23<00:38,  7.61s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 23/23 [02:54<00:00,  7.57s/it]


Adding documents for category: beauty_and_personal_care ...


  0%|          | 0/10 [00:00<?, ?it/s]

The write operation timed out
Retrying after 3 seconds ...


 30%|███       | 3/10 [00:21<00:47,  6.83s/it]

The write operation timed out
Retrying after 3 seconds ...


 40%|████      | 4/10 [00:31<00:47,  7.95s/it]

The write operation timed out
Retrying after 3 seconds ...


 80%|████████  | 8/10 [01:01<00:14,  7.42s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 10/10 [01:15<00:00,  7.52s/it]


Adding documents for category: home_kitchen ...


100%|██████████| 2/2 [00:04<00:00,  2.26s/it]


Adding documents for category: computers ...


 67%|██████▋   | 24/36 [00:57<00:33,  2.77s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 36/36 [01:33<00:00,  2.61s/it]


Adding documents for category: watches ...


  0%|          | 0/3 [00:00<?, ?it/s]

The write operation timed out
Retrying after 3 seconds ...


 33%|███▎      | 1/3 [00:09<00:19,  9.75s/it]

The write operation timed out
Retrying after 3 seconds ...


 67%|██████▋   | 2/3 [00:20<00:10, 10.08s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 3/3 [00:30<00:00, 10.17s/it]


Adding documents for category: cameras_accessories ...


  0%|          | 0/5 [00:00<?, ?it/s]

The write operation timed out
Retrying after 3 seconds ...


 40%|████      | 2/5 [00:14<00:21,  7.07s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 5/5 [00:30<00:00,  6.12s/it]


Adding documents for category: health_personal_care_appliances ...


 67%|██████▋   | 2/3 [00:07<00:03,  3.62s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 3/3 [00:16<00:00,  5.58s/it]


Adding documents for category: gaming ...


  0%|          | 0/3 [00:00<?, ?it/s]

The write operation timed out
Retrying after 3 seconds ...


 33%|███▎      | 1/3 [00:09<00:19,  9.73s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 3/3 [00:22<00:00,  7.51s/it]


Adding documents for category: home_improvement ...


  0%|          | 0/5 [00:00<?, ?it/s]

The write operation timed out
Retrying after 3 seconds ...


 40%|████      | 2/5 [00:18<00:26,  8.87s/it]

The write operation timed out
Retrying after 3 seconds ...


100%|██████████| 5/5 [00:35<00:00,  7.11s/it]


Adding documents for category: sunglasses ...


100%|██████████| 3/3 [00:08<00:00,  2.93s/it]


Adding documents for category: home_entertainment ...


100%|██████████| 2/2 [00:04<00:00,  2.46s/it]


In [23]:
def get_category_wise_vector_store():
    client = QdrantClient(url=os.getenv("QDRANT_URL"),api_key=os.getenv("QDRANT_API_KEY"))
    collections_dict = client.get_collections().dict()
    all_collections = [collection['name'] for collection in collections_dict['collections']]
    category_wise_vector_store = {}
    for category in all_collections:
        category_wise_vector_store[category] = QdrantVectorStore(
            client=client,
            collection_name=category,
            embedding=OpenAIEmbeddings(model="text-embedding-3-large"),
        )
    return category_wise_vector_store

In [24]:
category_wise_vector_store = get_category_wise_vector_store()

In [36]:
def sort_by_price_relevance(product_doc_list, price_lower_bound, price_upper_bound):
    product_doc_list = product_doc_list.copy()
    # Calculate the midpoint of the price range
    price_midpoint = (price_lower_bound + price_upper_bound) / 2
    def relevance(product_doc):
        price = product_doc.metadata['discounted_price']
        return abs(price - price_midpoint)
    sorted_product_doc_list = sorted(product_doc_list, key=relevance)
    # sorted_product_doc_list.reverse()

    return sorted_product_doc_list

In [54]:
docs = category_wise_vector_store["clothing"].similarity_search("I want Puma and adidas shoes. I can also wear it in summer",k=10)

In [55]:
docs

[Document(metadata={'discounted_price': 1209.0, 'retail_price': 2199.0, 'uniq_id': '086e9bc94fc4f43801db2be4a3d0e5ee', '_id': '086e9bc9-4fc4-f438-01db-2be4a3d0e5ee', '_collection_name': 'clothing'}, page_content='Product Name: Puma Men\'s Striped Casual Shirt\nBrand: Slim Fit\nCategory: clothing->men\'s clothing->shirts->casual & party wear shirts->puma casual & party wear shirts\nDescription: Puma Men\'s Striped Casual Shirt - Buy Greenery White Puma Men\'s Striped Casual Shirt For Only Rs. 2199 Online in India. Shop Online For Apparels. Huge Collection of Branded Clothes Only at Flipkart.com\nSpecification: {[{"Pattern", "Striped"}, {"Occasion", "Casual"}, {"Ideal For", "Men\'s"}, {"Pleats", "Pleats Below Yoke at the Back"}, {"Sleeve", "Full Sleeve"}, {"Brand Fit", "Slim Fit"}, {"Fabric", "100% Cotton"}, {"Style", "Epaulettes, Set-in Pocket with Flap"}, {"Placket", "Cut and Sew Placket"}, {"Fit", "Slim"}, {"Hem", "Round Hem"}, {"Cuff", "Square Cuffs"}, {"Design", "Embroidery at Back"

In [56]:
sort_by_price_relevance(docs, 10000, 12132)

[Document(metadata={'discounted_price': 1949.0, 'retail_price': 2999.0, 'uniq_id': 'e5a5956dc69034776fa093f891ff9a44', '_id': 'e5a5956d-c690-3477-6fa0-93f891ff9a44', '_collection_name': 'clothing'}, page_content='Product Name: Adidas Solid Men\'s Track Top\nBrand: Adidas\nCategory: clothing->men\'s clothing->sports wear->track tops->adidas track tops->adidas solid men\'s track top\nDescription: Specifications of Adidas Solid Men\'s Track Top Track Top Details Sleeve Full Sleeve Closure Zipper Fabric Cotton, Polyester Pockets Kangaroo Pockets at Front Design Logo on Chest General Details Pattern Solid Ideal For Men\'s Occasion Sports Fabric Care Machine Wash as per Tag, Do not Dry Clean, Do not Bleach, Iron Steam or Dry as per Tag Additional Details Other Details Hooded Style Code ESS PREMFZ HOOD BLUE\nSpecification: {[{"Sleeve", "Full Sleeve"}, {"Closure", "Zipper"}, {"Fabric", "Cotton, Polyester"}, {"Pockets", "Kangaroo Pockets at Front"}, {"Design", "Logo on Chest"}, {"Pattern", "Sol