## Importing Necessary Libraries

In [1]:
# !pip install chromadb

In [2]:
import numpy as np
import pandas as pd
import ast
import os
# Set options to display all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_colwidth', None)

from sentence_transformers import SentenceTransformer, util, CrossEncoder
from pathlib import Path

from operator import itemgetter

# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

## Mounting the notebook to Google Drive and loading the Myntra Products Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
input_path = '/content/drive/My Drive/FashionRAG'

In [5]:
file_path = os.path.join(input_path, 'Fashion Dataset v2.csv')
print(file_path)

/content/drive/My Drive/FashionRAG/Fashion Dataset v2.csv


In [6]:
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32..."
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ..."
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size..."


In [7]:
df.shape

(14214, 11)

## Creating one single column with all relevant details

In [8]:
# Step 4: Fill NaNs to avoid concatenation issues
df['products'] = df['products'].fillna("unknown")
df['ratingCount'] = df['ratingCount'].fillna(0)
df['avg_rating'] = df['avg_rating'].fillna(0.0)
df['description'] = df['description'].fillna("")

# Step 5: Combine columns into one string per row
def combine_columns(row):
    return f"Category: {row['products']}. " \
           f"Rating Count: {row['ratingCount']}. " \
           f"Average Rating: {row['avg_rating']}. " \
           f"Price: {row['price']}. " \
           f"colour: {row['colour']}. " \
           f"Brand: {row['brand']}. "\
           f"img: {row['img']}. "\
           f"attributes: {row['p_attributes']}. "\
           f"Description: {row['description']}"

df['combined_text'] = df.apply(combine_columns, axis=1)
df.head()

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes,combined_text
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Dupatta. Rating Cou..."
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Floral Print Dupatt..."
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Category: Kurta, Trousers, Dupatta. Rating Cou..."
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Category: Kurta, Trouser, Dupatta. Rating Coun..."
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...",Category: Kurta. Rating Count: 21274.0. Averag...


## Creating the embeddings of the combined text column

In [9]:
# Step 6: Load the Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 7: Generate embeddings
embeddings = model.encode(df['combined_text'].tolist(), show_progress_bar=True, batch_size=64, convert_to_tensor=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/223 [00:00<?, ?it/s]

In [10]:
df['Embeddings'] = embeddings.tolist()

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14214 entries, 0 to 14213
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   p_id           14214 non-null  int64  
 1   name           14214 non-null  object 
 2   products       14214 non-null  object 
 3   price          14214 non-null  float64
 4   colour         14214 non-null  object 
 5   brand          14214 non-null  object 
 6   img            14214 non-null  object 
 7   ratingCount    14214 non-null  float64
 8   avg_rating     14214 non-null  float64
 9   description    14214 non-null  object 
 10  p_attributes   14214 non-null  object 
 11  combined_text  14214 non-null  object 
 12  Embeddings     14214 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 1.4+ MB


In [12]:
df.head()

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes,combined_text,Embeddings
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Dupatta. Rating Cou...","[-0.07335633039474487, 0.0728280320763588, -0...."
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Floral Print Dupatt...","[-0.05457332730293274, 0.04965370520949364, -0..."
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Category: Kurta, Trousers, Dupatta. Rating Cou...","[-0.07413442432880402, 0.05554481968283653, -0..."
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Category: Kurta, Trouser, Dupatta. Rating Coun...","[-0.07570140063762665, 0.0409795418381691, -0...."
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...",Category: Kurta. Rating Count: 21274.0. Averag...,"[-0.045413266867399216, 0.026124484837055206, ..."


## Storing the embeddings file for re use/ ease of use

In [13]:
output_path = '/content/drive/My Drive/FashionRAG/myntra_embeddings.csv'

In [14]:
df['Embeddings'] = df['Embeddings'].apply(lambda x: ','.join(map(str, x)))
df.to_csv(output_path, index=False)

## Loading the final file with embeddings

In [15]:
df_embeddings = pd.read_csv(output_path)
df_embeddings.head()

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes,combined_text,Embeddings
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Dupatta. Rating Cou...","-0.07335633039474487,0.0728280320763588,-0.038..."
1,16524740,InWeave Women Orange Solid Kurta with Palazzos...,"Kurta, Palazzos, Floral Print Dupatta",5899.0,Orange,InWeave,http://assets.myntassets.com/assets/images/165...,1081.0,4.119334,Orange solid Kurta with Palazzos with dupatta<...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Floral Print Dupatt...","-0.05457332730293274,0.04965370520949364,-0.00..."
2,16331376,Anubhutee Women Navy Blue Ethnic Motifs Embroi...,"Kurta, Trousers, Dupatta",4899.0,Navy Blue,Anubhutee,http://assets.myntassets.com/assets/images/163...,1752.0,4.16153,Navy blue embroidered Kurta with Trousers with...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Category: Kurta, Trousers, Dupatta. Rating Cou...","-0.07413442432880402,0.05554481968283653,-0.01..."
3,14709966,Nayo Women Red Floral Printed Kurta With Trous...,"Kurta, Trouser, Dupatta",3699.0,Red,Nayo,http://assets.myntassets.com/assets/images/147...,4113.0,4.088986,Red printed kurta with trouser and dupatta<br>...,"{'Add-Ons': 'NA', 'Body Shape ID': '333,424', ...","Category: Kurta, Trouser, Dupatta. Rating Coun...","-0.07570140063762665,0.0409795418381691,-0.040..."
4,11056154,AHIKA Women Black & Green Printed Straight Kurta,Kurta,1350.0,Black,AHIKA,http://assets.myntassets.com/assets/images/110...,21274.0,3.978377,"Black and green printed straight kurta, has a ...","{'Body Shape ID': '424', 'Body or Garment Size...",Category: Kurta. Rating Count: 21274.0. Averag...,"-0.045413266867399216,0.026124484837055206,-0...."


## Converting the Embeddings column into required format inside a list

In [16]:
df_embeddings['Embeddings'] = df_embeddings['Embeddings'].apply(lambda x: list(ast.literal_eval(x)))
type(df_embeddings['Embeddings'][0])

list

In [17]:
df_embeddings.head(1)

Unnamed: 0,p_id,name,products,price,colour,brand,img,ratingCount,avg_rating,description,p_attributes,combined_text,Embeddings
0,17048614,Khushal K Women Black Ethnic Motifs Printed Ku...,"Kurta, Palazzos, Dupatta",5099.0,Black,Khushal K,http://assets.myntassets.com/assets/images/170...,4522.0,4.418399,Black printed Kurta with Palazzos with dupatta...,"{'Add-Ons': 'NA', 'Body Shape ID': '443,333,32...","Category: Kurta, Palazzos, Dupatta. Rating Cou...","[-0.07335633039474487, 0.0728280320763588, -0...."


## Checking if the embeddings are coming properly

In [18]:
# df_embeddings["Embeddings"][0]

## Initiating ChromaDB Client for vector database

In [19]:
import chromadb
client = chromadb.Client()

## Creating the collection - database

In [20]:

chroma_client = chromadb.PersistentClient(path = input_path)
chroma_client.delete_collection(name="Semantic_myntra_collection_chromadb")
collection = chroma_client.create_collection(name="Semantic_myntra_collection_chromadb")

## Adding the embeddings along with the combined text to the vector database

In [21]:
batch_size = 5000  # You can adjust this based on the max batch size
num_batches = len(df_embeddings) // batch_size + (len(df_embeddings) % batch_size != 0)

for i in range(num_batches):
    start_index = i * batch_size
    end_index = min((i + 1) * batch_size, len(df_embeddings))

    batch_embeddings = df_embeddings["Embeddings"][start_index:end_index].tolist()
    batch_documents = df_embeddings["combined_text"][start_index:end_index].tolist()
    batch_ids = df_embeddings["p_id"][start_index:end_index].astype(str).tolist()

    collection.add(
        embeddings=batch_embeddings,
        documents=batch_documents,
        ids=batch_ids
    )

print("Data added to ChromaDB collection successfully.")

Data added to ChromaDB collection successfully.


## Checking if the output is coming correctly

In [22]:
collection.peek()

{'ids': ['17048614',
  '16524740',
  '16331376',
  '14709966',
  '11056154',
  '18704418',
  '14046594',
  '14951330',
  '13791594',
  '17048604'],
 'embeddings': array([[-0.07335633,  0.07282803, -0.03819431, ..., -0.07931599,
         -0.01389913,  0.00252558],
        [-0.05457333,  0.04965371, -0.00918719, ..., -0.03703182,
          0.05383448, -0.00588278],
        [-0.07413442,  0.05554482, -0.01273862, ..., -0.08569293,
         -0.01008866,  0.01036541],
        ...,
        [-0.0281517 ,  0.04799322,  0.01016945, ..., -0.06962331,
          0.04106742, -0.01351814],
        [-0.04855607,  0.04665806, -0.03817695, ..., -0.08341216,
         -0.01696067, -0.00723822],
        [-0.07996511,  0.05540755, -0.03630012, ..., -0.07968821,
         -0.00023034,  0.00341752]]),
 'documents': ["Category: Kurta, Palazzos, Dupatta. Rating Count: 4522.0. Average Rating: 4.4183989385227775. Price: 5099.0. colour: Black. Brand: Khushal K. img: http://assets.myntassets.com/assets/images/17048

In [23]:
collection.get(
    ids = ['17048614']
)

{'ids': ['17048614'],
 'embeddings': None,
 'documents': ["Category: Kurta, Palazzos, Dupatta. Rating Count: 4522.0. Average Rating: 4.4183989385227775. Price: 5099.0. colour: Black. Brand: Khushal K. img: http://assets.myntassets.com/assets/images/17048614/2022/2/4/b0eb9426-adf2-4802-a6b3-5dbacbc5f2511643971561167KhushalKWomenBlackEthnicMotifsAngrakhaBeadsandStonesKurtawit7.jpg. attributes: {'Add-Ons': 'NA', 'Body Shape ID': '443,333,324,424', 'Body or Garment Size': 'Garment Measurements in', 'Bottom Closure': 'Slip-On', 'Bottom Fabric': 'Viscose Rayon', 'Bottom Pattern': 'Printed', 'Bottom Type': 'Palazzos', 'Character': 'NA', 'Dupatta': 'With Dupatta', 'Dupatta Border': 'Solid', 'Dupatta Fabric': 'Viscose Rayon', 'Dupatta Pattern': 'Printed', 'Main Trend': 'Indie Prints', 'Neck': 'Mandarin Collar', 'Number of Pockets': 'NA', 'Occasion': 'Festive', 'Ornamentation': 'NA', 'Pattern Coverage': 'Placement', 'Print or Pattern Type': 'Ethnic Motifs', 'Sleeve Length': 'Three-Quarter Sleeve

In [24]:
chroma_client.list_collections()

[Collection(name=Semantic_myntra_collection_chromadb)]

## Checking if the semantic search is working well

In [25]:
query = input()

results = collection.query(
    query_texts=query,
    n_results=5,
    include = ['documents', 'distances']
)

print(results)

I want to buy a jeans under 10000


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:00<00:00, 86.6MiB/s]


{'ids': [['15204772', '12379930', '8070243', '16262766', '17398308']], 'embeddings': None, 'documents': [["Category: Jeans. Rating Count: 192.0. Average Rating: 4.359375. Price: 1199.0. colour: Blue. Brand: High Star. img: http://assets.myntassets.com/assets/images/15204772/2021/8/20/b3a7af00-270a-4b91-9a7a-757082c2a6651629431266517HighStarWomenBlueBootcutHigh-RiseJeans1.jpg. attributes: {'Add-Ons': 'NA', 'Body or Garment Size': 'To-Fit Denotes Body Measurements in', 'Brand Fit Name': 'NA', 'Character': 'NA', 'Closure': 'Button and Zip', 'Contact Brand or Retailer for pre-sales product queries': 'mr.rohit mirchandani mobno7208331311', 'Distress': 'Clean Look', 'Effects': 'None', 'Fabric': 'Cotton', 'Fabric 2': 'NA', 'Fabric 3': 'NA', 'Fade': 'No Fade', 'Features': 'NA', 'Fit': 'Bootcut', 'Length': 'Regular', 'Main Trend': 'NA', 'Number of Pockets': '5', 'Occasion': 'Casual', 'Reversible': 'No', 'Shade': 'Medium', 'Stretch': 'Stretchable', 'Sustainable': 'Regular', 'Type of Distress': '

In [26]:
results

{'ids': [['15204772', '12379930', '8070243', '16262766', '17398308']],
 'embeddings': None,
 'documents': [["Category: Jeans. Rating Count: 192.0. Average Rating: 4.359375. Price: 1199.0. colour: Blue. Brand: High Star. img: http://assets.myntassets.com/assets/images/15204772/2021/8/20/b3a7af00-270a-4b91-9a7a-757082c2a6651629431266517HighStarWomenBlueBootcutHigh-RiseJeans1.jpg. attributes: {'Add-Ons': 'NA', 'Body or Garment Size': 'To-Fit Denotes Body Measurements in', 'Brand Fit Name': 'NA', 'Character': 'NA', 'Closure': 'Button and Zip', 'Contact Brand or Retailer for pre-sales product queries': 'mr.rohit mirchandani mobno7208331311', 'Distress': 'Clean Look', 'Effects': 'None', 'Fabric': 'Cotton', 'Fabric 2': 'NA', 'Fabric 3': 'NA', 'Fade': 'No Fade', 'Features': 'NA', 'Fit': 'Bootcut', 'Length': 'Regular', 'Main Trend': 'NA', 'Number of Pockets': '5', 'Occasion': 'Casual', 'Reversible': 'No', 'Shade': 'Medium', 'Stretch': 'Stretchable', 'Sustainable': 'Regular', 'Type of Distress':

In [27]:
import tiktoken
import openai

## Defining the logic for cache to fasten up search for already searched items

In [28]:
import os
from google.colab import userdata

# Load the API key from Colab secrets
os.environ['CHROMA_OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(api_key=os.environ['CHROMA_OPENAI_API_KEY'], model_name=model)


cache_collection = client.get_or_create_collection(name='Myntra_Cache', embedding_function=embedding_function)

cache_collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'included': ['metadatas', 'documents', 'embeddings'],
 'data': None,
 'metadatas': []}

In [29]:
query = input()

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1
)

I want to buy blue jeans under 10000


In [30]:
cache_results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[]],
 'distances': [[]]}

In [31]:
# Implementing Cache in Semantic Search

# Set a threshold for cache search
threshold = 0.2

ids = []
documents = []
distances = []
metadatas = []
results_df = pd.DataFrame()

# If the distance is greater than the threshold, then return the results from the main collection.

# Check if cache_results['distances'][0] is not empty and if the first distance is greater than the threshold
if not cache_results['distances'][0] or cache_results['distances'][0][0] > threshold:
    # Query the collection against the user query and return the top 10 results
    results = collection.query(
        query_texts=query,
        n_results=10
    )

    # Prepare metadata for caching: store lists of ids, documents, and distances
    cache_metadata = {}
    if results.get('ids') and results['ids'][0]:
        cache_metadata['ids'] = ','.join(results['ids'][0]) # Convert list to string
    if results.get('documents') and results['documents'][0]:
        cache_metadata['documents'] = '||'.join(results['documents'][0]) # Convert list to string, using a different delimiter
    if results.get('distances') and results['distances'][0]:
        cache_metadata['distances'] = ','.join(map(str, results['distances'][0])) # Convert list to string

    # Store the query in cache_collection as document and results as metadata
    # Using the query itself as the ID for simplicity, ensure it's a valid ID (e.g., sanitize or hash if needed)
    # For this example, assuming query is a simple string and can be used as ID.
    # A more robust approach might involve hashing the query.
    try:
        cache_collection.add(
            documents=[query],
            ids=[query],
            metadatas=[cache_metadata]
        )
        print("Not found in cache. Found in main collection and added to cache.")
    except Exception as e:
        print(f"Error adding to cache: {e}")
        print("Not found in cache. Found in main collection.")


    result_dict = {'IDs': results.get('ids', [[]])[0],
                   'Documents': results.get('documents', [[]])[0],
                   'Distances': results.get('distances', [[]])[0]}

    # Ensure all lists in result_dict have the same length before creating DataFrame
    min_len = min(len(v) for v in result_dict.values())
    result_dict = {k: v[:min_len] for k, v in result_dict.items()}


    results_df = pd.DataFrame.from_dict(result_dict)


# If the distance is, however, less than the threshold, you can return the results from cache
# Check if cache_results['distances'][0] is not empty and if the first distance is less than or equal to the threshold
elif cache_results['distances'][0] and cache_results['distances'][0][0] <= threshold:
    print("Found in cache!")
    # Retrieve the metadata from the cache result
    cached_metadata = cache_results['metadatas'][0][0]

    # Extract information from cached metadata
    # Convert strings back to lists
    cached_ids = cached_metadata.get('ids', '').split(',')
    cached_documents = cached_metadata.get('documents', '').split('||')
    cached_distances = [float(d) for d in cached_metadata.get('distances', '').split(',') if d]

    # Ensure all lists have the same length before creating DataFrame
    min_len = min(len(cached_ids), len(cached_documents), len(cached_distances))
    result_dict = {
        'IDs': cached_ids[:min_len],
        'Documents': cached_documents[:min_len],
        'Distances': cached_distances[:min_len]
    }
    results_df = pd.DataFrame(result_dict)

display(results_df)

Not found in cache. Found in main collection and added to cache.


Unnamed: 0,IDs,Documents,Distances
0,15941726,Category: Jeans. Rating Count: 14.0. Average R...,0.916637
1,16262928,Category: Jeans. Rating Count: 8.0. Average Ra...,0.922742
2,15941714,Category: Jeans. Rating Count: 19.0. Average R...,0.930772
3,15852286,Category: Jeans. Rating Count: 13.0. Average R...,0.938038
4,17398312,Category: Jeans. Rating Count: 0.0. Average Ra...,0.938337
5,18620400,Category: Jeans. Rating Count: 0.0. Average Ra...,0.942064
6,10666988,Category: Jeans. Rating Count: 699.0. Average ...,0.942179
7,15204772,Category: Jeans. Rating Count: 192.0. Average ...,0.945423
8,13245974,Category: Jeans. Rating Count: 21.0. Average R...,0.947164
9,17398296,Category: Jeans. Rating Count: 0.0. Average Ra...,0.94946


## Final code to get inputs and then recommend

In [32]:
query = input("Enter your search query: ")

cache_results = cache_collection.query(
    query_texts=query,
    n_results=1,
    include=['documents', 'distances', 'metadatas'] # Include metadatas to retrieve cached results
)

# Set a threshold for cache search
threshold = 0.2

results_df = pd.DataFrame()

# Check if cache_results['distances'][0] is not empty and if the first distance is less than or equal to the threshold
if cache_results['distances'][0] and cache_results['distances'][0][0] <= threshold:
    print("Found in cache!")
    # Retrieve the metadata from the cache result
    cached_metadata = cache_results['metadatas'][0][0]

    # Extract information from cached metadata
    # Convert strings back to lists
    cached_ids = cached_metadata.get('ids', '').split(',')
    cached_documents = cached_metadata.get('documents', '').split('||')
    cached_distances = [float(d) for d in cached_metadata.get('distances', '').split(',') if d]

    # Ensure all lists have the same length before creating DataFrame
    min_len = min(len(cached_ids), len(cached_documents), len(cached_distances))
    result_dict = {
        'IDs': cached_ids[:min_len],
        'Documents': cached_documents[:min_len],
        'Distances': cached_distances[:min_len]
    }
    results_df = pd.DataFrame(result_dict)

else:
    # Query the collection against the user query and return the top 10 results
    results = collection.query(
        query_texts=query,
        n_results=10,
        include=['documents', 'distances', 'metadatas', 'ids']
    )

    # Prepare metadata for caching: store lists of ids, documents, and distances
    cache_metadata = {}
    if results.get('ids') and results['ids'][0]:
        cache_metadata['ids'] = ','.join(results['ids'][0]) # Convert list to string
    if results.get('documents') and results['documents'][0]:
        cache_metadata['documents'] = '||'.join(results['documents'][0]) # Convert list to string, using a different delimiter
    if results.get('distances') and results['distances'][0]:
        cache_metadata['distances'] = ','.join(map(str, results['distances'][0])) # Convert list to string

    # Store the query in cache_collection as document and results as metadata
    try:
        # Use a simple hash of the query as the ID for caching
        import hashlib
        cache_id = hashlib.md5(query.encode()).hexdigest()

        cache_collection.add(
            documents=[query],
            ids=[cache_id],
            metadatas=[cache_metadata]
        )
        print("Not found in cache. Found in main collection and added to cache.")
    except Exception as e:
        print(f"Error adding to cache: {e}")
        print("Not found in cache. Found in main collection.")

    result_dict = {'IDs': results.get('ids', [[]])[0],
                   'Documents': results.get('documents', [[]])[0],
                   'Distances': results.get('distances', [[]])[0]}

    # Ensure all lists in result_dict have the same length before creating DataFrame
    min_len = min(len(v) for v in result_dict.values())
    result_dict = {k: v[:min_len] for k, v in result_dict.items()}

    results_df = pd.DataFrame.from_dict(result_dict)

display(results_df)

Enter your search query: I want to buy a blue jeans under 10000
Found in cache!


Unnamed: 0,IDs,Documents,Distances
0,15941726,Category: Jeans. Rating Count: 14.0. Average R...,0.916637
1,16262928,Category: Jeans. Rating Count: 8.0. Average Ra...,0.922742
2,15941714,Category: Jeans. Rating Count: 19.0. Average R...,0.930772
3,15852286,Category: Jeans. Rating Count: 13.0. Average R...,0.938038
4,17398312,Category: Jeans. Rating Count: 0.0. Average Ra...,0.938337
5,18620400,Category: Jeans. Rating Count: 0.0. Average Ra...,0.942064
6,10666988,Category: Jeans. Rating Count: 699.0. Average ...,0.942179
7,15204772,Category: Jeans. Rating Count: 192.0. Average ...,0.945423
8,13245974,Category: Jeans. Rating Count: 21.0. Average R...,0.947164
9,17398296,Category: Jeans. Rating Count: 0.0. Average Ra...,0.94946


## Reranking using cross encoders

In [33]:
# Initialise the cross encoder model

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [34]:
# Input (query, response) pairs for each of the top 20 responses received from the semantic search to the cross encoder
# Generate the cross_encoder scores for these pairs

cross_inputs = [[query, response] for response in results_df['Documents']]
cross_rerank_scores = cross_encoder.predict(cross_inputs)

In [35]:
cross_rerank_scores

array([-2.0809975, -1.8981421, -1.8191924, -1.9890771, -2.4498684,
       -3.1642933, -1.9083858, -2.85545  , -1.5656662, -2.5267758],
      dtype=float32)

In [36]:
results_df['Reranked_scores'] = cross_rerank_scores

In [37]:
results_df.sort_values('Reranked_scores', ascending = False)

Unnamed: 0,IDs,Documents,Distances,Reranked_scores
8,13245974,Category: Jeans. Rating Count: 21.0. Average R...,0.947164,-1.565666
2,15941714,Category: Jeans. Rating Count: 19.0. Average R...,0.930772,-1.819192
1,16262928,Category: Jeans. Rating Count: 8.0. Average Ra...,0.922742,-1.898142
6,10666988,Category: Jeans. Rating Count: 699.0. Average ...,0.942179,-1.908386
3,15852286,Category: Jeans. Rating Count: 13.0. Average R...,0.938038,-1.989077
0,15941726,Category: Jeans. Rating Count: 14.0. Average R...,0.916637,-2.080997
4,17398312,Category: Jeans. Rating Count: 0.0. Average Ra...,0.938337,-2.449868
9,17398296,Category: Jeans. Rating Count: 0.0. Average Ra...,0.94946,-2.526776
7,15204772,Category: Jeans. Rating Count: 192.0. Average ...,0.945423,-2.85545
5,18620400,Category: Jeans. Rating Count: 0.0. Average Ra...,0.942064,-3.164293


In [38]:
# Return the top 3 results from semantic search

top_3_semantic = results_df.sort_values(by='Distances')
top_3_semantic[:3]

Unnamed: 0,IDs,Documents,Distances,Reranked_scores
0,15941726,Category: Jeans. Rating Count: 14.0. Average R...,0.916637,-2.080997
1,16262928,Category: Jeans. Rating Count: 8.0. Average Ra...,0.922742,-1.898142
2,15941714,Category: Jeans. Rating Count: 19.0. Average R...,0.930772,-1.819192


In [39]:
# Return the top 3 results after reranking

top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank[:3]

Unnamed: 0,IDs,Documents,Distances,Reranked_scores
8,13245974,Category: Jeans. Rating Count: 21.0. Average R...,0.947164,-1.565666
2,15941714,Category: Jeans. Rating Count: 19.0. Average R...,0.930772,-1.819192
1,16262928,Category: Jeans. Rating Count: 8.0. Average Ra...,0.922742,-1.898142


In [40]:
top_3_RAG = top_3_rerank[["Documents", "IDs"]][:3]
top_3_RAG

Unnamed: 0,Documents,IDs
8,Category: Jeans. Rating Count: 21.0. Average R...,13245974
2,Category: Jeans. Rating Count: 19.0. Average R...,15941714
1,Category: Jeans. Rating Count: 8.0. Average Ra...,16262928


## Generating the Prompt for using the LLM to fine tune the recommendations

In [41]:
def generate_response(query, results_df):
    """
    Generate a fashion product recommendation using GPT-4o based on the user query and top 3 relevant Myntra products.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an expert fashion assistant helping users discover relevant fashion products on Myntra based on their style preferences, intent, or shopping queries."
        },
        {
            "role": "user",
            "content": f"""
You are an expert fashion assistant helping users discover the most relevant and stylish fashion products from the Myntra catalog based on their queries.

The user has asked the question: '{query}'.

You also have access to a set of top 3 relevant product search results retrieved from a product catalog in a dataframe called 'results_df'. Each row in this dataframe represents a product and contains the following columns:
- 'product_name': name of the product
- 'category': fine-grained product category
- 'rating_count': number of ratings
- 'average_rating': average customer rating
- 'description': short marketing or product-specific description
- 'metadata': can contain product ID or relevant metadata for citation

Your job is to:
- Recommend the most suitable product(s) based on the user's query and the top 3 results.
- Use the product description and other details (like rating, category) to justify the recommendation.
- Highlight what makes each product a good fit, if recommending more than one.
- Provide the image link as well for the recommended products
- Mention the price as well
- Only use information from the top 3 retrieved results in 'results_df'.
- If applicable, present tabular information (like ratings, features) clearly for comparison.
- End the answer with a citation indicating the product name(s) and corresponding metadata (e.g., product ID or source).


Guidelines:
1. Be concise, relevant, and helpful.
2. Don't use all 3 products if only one is truly relevant.
3. Avoid hallucinating details not present in the dataframe.
4. Avoid technical or internal terms — you are a customer-facing shopping assistant.
5. If none of the products are relevant, respond that no matching items were found in the top results and suggest refining the search.
6. Do not expose internal variable names or the dataframe — only respond with natural language.

Generate a complete and well-formatted user-facing response with clear recommendations followed by the citations.
"""
        }
    ]

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')



In [42]:
import os
from google.colab import userdata

# Load the OpenAI API key from Colab secrets
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [43]:
# Generate the response
print('query: ', query)
response = generate_response(query, top_3_RAG)

query:  I want to buy a blue jeans under 10000


In [44]:
print("\n".join(response))

Based on your request for blue jeans under ₹10,000, here are some top recommendations from the search results I have access to:

### Recommended Product

**1. Levi's Blue Skinny Fit Jeans**
- **Category:** Skinny Fit Jeans
- **Description:** Discover a perfect blend of style and comfort with Levi's iconic blue skinny fit jeans. Crafted for a flattering fit, these jeans offer a contemporary look that's versatile for any occasion.
- **Average Rating:** 4.5 (Based on 1500 ratings)
- **Price:** ₹8,999
- **Image Link:** [Levi's Blue Skinny Fit Jeans](https://example-link.com/bluejeans1)

This pair from Levi's stands out with its modern skinny fit, making it a trendy choice that still retains comfort for everyday wear. It's highly rated by users, ensuring both style and satisfaction.

### Comparison Table

| Product Name                | Category         | Average Rating | Price  |
|-----------------------------|------------------|----------------|--------|
| Levi's Blue Skinny Fit Jeans| Sk

## Defining the same into a function to run a loop for multiple recommendations

In [52]:
def recommend_products():
    """
    Given a user query, this function:
    1. Checks cache for previous semantic search results.
    2. If not found, performs semantic search on the main collection and caches the result.
    3. Reranks results using a cross-encoder.
    4. Extracts top 3 results for grounding.
    5. Uses an LLM to generate a final response using top 3 documents.

    Returns:
        - Generated response from the LLM (as a list of strings).
        - DataFrame of top 3 results.
    """
    import hashlib
    import pandas as pd

    query = input()
    # Step 1: Check Cache
    cache_results = cache_collection.query(
        query_texts=query,
        n_results=1,
        include=['documents', 'distances', 'metadatas']
    )

    threshold = 0.2
    results_df = pd.DataFrame()

    if cache_results['distances'][0] and cache_results['distances'][0][0] <= threshold:
        print("✅ Found in cache.")
        cached_metadata = cache_results['metadatas'][0][0]
        cached_ids = cached_metadata.get('ids', '').split(',')
        cached_documents = cached_metadata.get('documents', '').split('||')
        cached_distances = [float(d) for d in cached_metadata.get('distances', '').split(',') if d]
        min_len = min(len(cached_ids), len(cached_documents), len(cached_distances))
        result_dict = {
            'IDs': cached_ids[:min_len],
            'Documents': cached_documents[:min_len],
            'Distances': cached_distances[:min_len]
        }
        results_df = pd.DataFrame(result_dict)

    else:
        print("⚠️ Not found in cache. Querying main collection.")
        results = collection.query(
            query_texts=query,
            n_results=10,
            include=['documents', 'distances', 'metadatas']
        )

        # Cache the results
        cache_metadata = {}
        if results.get('ids') and results['ids'][0]:
            cache_metadata['ids'] = ','.join(results['ids'][0])
        if results.get('documents') and results['documents'][0]:
            cache_metadata['documents'] = '||'.join(results['documents'][0])
        if results.get('distances') and results['distances'][0]:
            cache_metadata['distances'] = ','.join(map(str, results['distances'][0]))

        try:
            cache_id = hashlib.md5(query.encode()).hexdigest()
            cache_collection.add(
                documents=[query],
                ids=[cache_id],
                metadatas=[cache_metadata]
            )
            print("✅ Cached result added.")
        except Exception as e:
            print(f"Error adding to cache: {e}")

        result_dict = {
            'IDs': results.get('ids', [[]])[0],
            'Documents': results.get('documents', [[]])[0],
            'Distances': results.get('distances', [[]])[0]
        }
        min_len = min(len(v) for v in result_dict.values())
        result_dict = {k: v[:min_len] for k, v in result_dict.items()}
        results_df = pd.DataFrame(result_dict)

    # Step 2: Rerank using Cross Encoder
    cross_inputs = [[query, doc] for doc in results_df['Documents']]
    cross_rerank_scores = cross_encoder.predict(cross_inputs)
    results_df['Reranked_scores'] = cross_rerank_scores

    # Step 3: Top 3 for RAG
    top_3_rerank = results_df.sort_values(by='Reranked_scores', ascending=False)
    top_3_RAG = top_3_rerank[["Documents", "IDs"]][:3]

    # Step 4: Generate response
    response = generate_response(query, top_3_RAG)

    return response, top_3_RAG

In [50]:
response, top_3_RAG = recommend_products()
print("\n".join(response))

I want to buy blue jeans under 10000. Give me few recommendations
✅ Found in cache.
Based on your request for blue jeans under ₹10,000, here are a couple of excellent options from the top results:

1. **Levi's Men's Slim Fit Jeans**
   - **Description**: These jeans offer a stylish slim fit that is perfect for both casual and semi-formal occasions. Featuring a classic blue wash, they're designed to provide comfort and durability.
   - **Category**: Men's Slim Fit Jeans
   - **Average Rating**: 4.5 (based on 215 ratings)
   - **Price**: ₹6,499
   - **Image**: ![Levi's Slim Fit Jeans](http://example.com/levi-slim-jeans-image)
   - **What makes it a great fit**: Known for its high-quality denim, Levi's is a trusted brand, and this pair has a strong customer approval with its high ratings.

2. **Wrangler Regular Fit Blue Jeans**
   - **Description**: These jeans feature a classic blue color with a regular fit, ideal for those looking for a more relaxed look without compromising on style.
 

In [53]:
response2, top_3_RAG2 = recommend_products()
print("\n".join(response2))

I want to buy a pair of shoes under 30000
⚠️ Not found in cache. Querying main collection.
✅ Cached result added.
Based on your query for shoes under ₹30,000, I've evaluated the top three results to find the most suitable option for you. Here's the recommended product:

### Recommended Shoe:
**Product Name:** Aldo Men's Formal Leather Shoes  
**Category:** Formal Shoes  
**Average Rating:** 4.5 out of 5 (200 ratings)  
**Price:** ₹25,999  
**Description:** These elegant leather shoes from Aldo offer a sophisticated design perfect for formal occasions. Crafted with premium materials, they ensure comfort and style befitting a professional setting.

**Why This Is A Good Fit:**
- **Stylish and Formal:** Perfect for formal events, these shoes bring a classic yet modern touch.
- **High Customer Satisfaction:** With a robust rating from numerous customers, these shoes are well-regarded for their quality.
- **Under Budget:** Priced comfortably within your specified budget.

**Image Link:** [Vi

In [54]:
response3, top_3_RAG3 = recommend_products()
print("\n".join(response3))

I want to buy a backpack with adventure capability. Give me 3 best recommendations
⚠️ Not found in cache. Querying main collection.
✅ Cached result added.
Based on your interest in a backpack with adventure capability, I have analyzed the top three products retrieved from the catalog. Here are the best recommendations that suit your needs:

1. **Product Name: Wildcraft Rucksack**
   - **Category**: Rucksack
   - **Description**: This rucksack is designed for adventure seekers. It features multiple compartments for organized storage, sturdy shoulder straps for comfort during long treks, and is crafted with durable material that can withstand rough conditions.
   - **Average Rating**: 4.5 out of 5
   - **Rating Count**: 800 ratings
   - **Price**: ₹2,499
   - **Image Link**: [Wildcraft Rucksack](http://example.com/wildcraft-rucksack) (hypothetical link)
   
   **Why It's a Good Fit**: This rucksack is specifically built for adventurous activities, offering both durability and functionali

## End