In [1]:
# install required packages

! pip install -qU langchain-community
! pip install -qU langchain_huggingface
! pip install -qU sentence-transformers
! pip install -qU langchain-perplexity
! pip install -qU faiss-cpu
! pip install -qU gradio
! pip install -qU kaggle

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.5/447.5 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.6/48.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
datasets 3.6.0 requires fsspec[http]<=2025.3.0,>=2023.1.0, but you have fsspec 2025.5.1 which is incompatible.

In [2]:
# Filter warnings: install a blanket 'ignore' filter (no category or module
# restriction), so every Python warning raised after this cell is hidden.

import warnings
warnings.filterwarnings('ignore')

In [3]:
# import libraries

import os, glob
from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import CrossEncoder, util
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_perplexity import ChatPerplexity
from langchain.retrievers import ContextualCompressionRetriever
from langchain.memory import ConversationBufferMemory
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_core.messages import HumanMessage, AIMessage
import gradio as gr
from PIL import Image

2025-10-01 11:49:44.235080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759319384.560972      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759319384.652262      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Set LLM keys from secret manager / local environment

# Detect the runtime from well-known directories and copy the Perplexity key
# into PERPLEXITY_API_KEY, which the config cell reads back from the environment.
if os.path.exists('/kaggle'):
    platform = 'Kaggle'
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    print("Using kaggle secrets to get keys")
    os.environ['PERPLEXITY_API_KEY'] = user_secrets.get_secret("PPLX_API_KEY_2")
elif os.path.exists('/content'):
    platform = 'Colab'
    from google.colab import userdata
    print("Using Google colab secrets to get keys")
    os.environ['PERPLEXITY_API_KEY'] = userdata.get('PPLX_API_KEY_2')
else:
    platform = 'Local'
    import dotenv
    dotenv.load_dotenv()
    print("Using local env secrets to get keys")
    # os.environ only accepts strings: assigning a missing key (None) would fail
    # with an opaque TypeError, so check first and report the real problem.
    if os.getenv('PPLX_API_KEY') is not None:
        os.environ['PERPLEXITY_API_KEY'] = os.getenv('PPLX_API_KEY')
    elif os.getenv('PERPLEXITY_API_KEY') is None:
        raise EnvironmentError(
            "PPLX_API_KEY is not set: define it in the environment or in a .env file"
        )

Using kaggle secrets to get keys


In [5]:
# Define a base configuration

# Dataset location depends on the platform: Kaggle exposes it under /kaggle/input,
# while on every other platform the download cell extracts it into ./data/
# (assumes the extracted archive keeps the same layout, i.e. an images/ sub-folder).
config = {
    'data_path' : '/kaggle/input/myntra-fashion-product-dataset/' if platform == 'Kaggle' else './data/',
    'images_path' : '/kaggle/input/myntra-fashion-product-dataset/images/' if platform == 'Kaggle' else './data/images/',
    'chunk_size' : 512,
    'chunk_overlap' : 80,
    'vector_store_name' : "faiss_myntra_db",
    'embedding_model' : 'all-MiniLM-L6-v2',
    'refresh_vector_store' : 'N',  # 'Y' forces the FAISS index to be rebuilt
    'PPLX_API_KEY' : os.getenv('PERPLEXITY_API_KEY'),
    'domain' : 'fashion',
    'chat_model' : "sonar-pro",
    'rerank_model' : 'BAAI/bge-reranker-base',
    'platform' : platform
}

In [6]:
# Download dataset if not running on kaggle notebook

if config['platform'] != 'Kaggle':
    # Outside Kaggle the dataset has to be fetched through the Kaggle API
    # (authenticate() relies on the user's Kaggle credentials being configured).
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()

    # Alternative datasets tried earlier:
    #   promptcloud/myntra-e-commerce-product-data-november-2023
    #   ronakbokaria/myntra-products-dataset
    api.dataset_download_files(
        "djagatiya/myntra-fashion-product-dataset",
        path='./data/',
        unzip=True,
    )

In [7]:
# Define a function that will enhance the metadata of the documents

def add_metadata_to_documents(documents, images_path=None):
    """
    Copy product attributes from each document's page content into its metadata.

    The page content is read positionally, one "column: value" pair per line
    (the layout CSVLoader produces): product id, name, category, price, colour,
    brand, image URL, rating count and average rating on lines 0-8. Any further
    lines are ignored.

    Args:
        documents: iterable of objects exposing ``page_content`` (str) and
            ``metadata`` (dict). The metadata dicts are updated in place.
        images_path: folder prefix used to build ``image_path_local``
            (``<images_path><p_id>.jpg``). Defaults to ``config['images_path']``.

    Returns:
        The same ``documents`` object that was passed in.

    A document whose content cannot be parsed (too few lines, a line without a
    value, or non-string content) gets every one of these metadata keys set to
    ``None`` instead of raising.
    """
    if images_path is None:
        images_path = config['images_path']

    # metadata key -> index of the page_content line that carries its value
    line_index_by_field = {
        'p_id': 0,
        'product_name': 1,
        'product_category': 2,
        'price': 3,
        'color': 4,
        'brand': 5,
        'image_url': 6,
        'rating_count': 7,
        'avg_rating': 8,
    }
    all_fields = list(line_index_by_field) + ['image_path_local']

    for doc in documents:
        try:
            lines = doc.page_content.split("\n")
            # Split on the first ": " only, so multi-word values (product names,
            # brands, colours) are kept whole instead of cut at the first space.
            extracted = {
                field: lines[index].split(": ", 1)[1].strip()
                for field, index in line_index_by_field.items()
            }
            extracted['image_path_local'] = images_path + extracted['p_id'] + ".jpg"
        except (IndexError, AttributeError):
            # Unparseable content: expose every key (including 'color') as None
            extracted = dict.fromkeys(all_fields)
        doc.metadata.update(extracted)
    return documents

In [8]:
def get_data_chunks(folder_path):
    """
    Load every supported file found directly in ``folder_path`` and split the
    loaded documents into chunks.

    Files are dispatched on their extension (case-insensitive): ``.pdf`` uses
    PyMuPDFLoader, ``.csv`` uses CSVLoader and ``.txt`` uses TextLoader. Anything
    else, including sub-folders, is skipped. Each loaded document is enriched by
    ``add_metadata_to_documents`` before splitting.

    Args:
        folder_path: directory to scan, as a ``Path`` or a path string.

    Returns:
        The chunks produced by RecursiveCharacterTextSplitter, sized by
        ``config['chunk_size']`` / ``config['chunk_overlap']``, for the documents
        of ALL supported files (an empty list when there are none).
    """
    loader_by_suffix = {
        ".pdf": PyMuPDFLoader,
        ".csv": CSVLoader,
        ".txt": TextLoader,
    }

    all_documents = []
    for file_path in Path(folder_path).iterdir():
        loader_cls = loader_by_suffix.get(file_path.suffix.lower())
        if loader_cls is None:
            continue  # Skip unsupported file types

        documents = loader_cls(str(file_path)).load()
        all_documents.extend(add_metadata_to_documents(documents))

    # chunking/splitting
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config['chunk_size'],
        chunk_overlap=config['chunk_overlap'],
        strip_whitespace=True,
        separators=["\n\n", "\n", " ", ""]
    )
    # Split the documents accumulated from every file, not only the last one loaded
    return text_splitter.split_documents(documents=all_documents)

In [9]:
def get_embeddings_model():
    """
    Build the HuggingFace embedding model named by ``config['embedding_model']``.

    The model is placed on the GPU when CUDA is available and on the CPU
    otherwise, so the notebook also runs on machines without a GPU.

    Returns:
        A ``HuggingFaceEmbeddings`` instance with progress display and
        multi-process encoding enabled.
    """
    import torch  # local import: only needed here to probe for a GPU

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return HuggingFaceEmbeddings(
        model_name=config['embedding_model'],
        show_progress=True,
        multi_process=True,
        model_kwargs={'device': device}
    )

In [10]:
def create_vector_store(text_chunks, embedding_model):
    """
    Return the FAISS vector store, building it only when necessary.

    The index is (re)built from ``text_chunks`` and saved under
    ``config['vector_store_name']`` when ``config['refresh_vector_store']`` is
    'Y' or nothing exists at that path; otherwise the saved index is loaded.
    """
    store_name = config['vector_store_name']
    must_build = config['refresh_vector_store'] == 'Y' or not os.path.exists(store_name)

    if not must_build:
        # NOTE: loading opts in to "dangerous" deserialization, so the saved
        # index must come from a trusted source.
        return FAISS.load_local(store_name, embedding_model, allow_dangerous_deserialization=True)

    built_store = FAISS.from_documents(text_chunks, embedding_model)
    built_store.save_local(store_name)
    return built_store

In [11]:
def create_chat_client():
    """Create the Perplexity chat client for ``config['chat_model']`` at temperature 0."""
    client_options = {
        'temperature': 0,
        'pplx_api_key': config['PPLX_API_KEY'],  # Pass the API key explicitly
        'model': config['chat_model'],
    }
    return ChatPerplexity(**client_options)

In [12]:
def get_retriever(top_k=10):
    """Wrap the notebook-level ``vector_store`` in a retriever that returns ``top_k`` hits."""
    return vector_store.as_retriever(search_kwargs=dict(k=top_k))

In [13]:
# Cross-encoder instances keyed by model name, so the model is loaded only once
# instead of on every query.
_RERANK_MODEL_CACHE = {}


def _get_rerank_model(model_name):
    """Return the cross-encoder for ``model_name``, creating it on first use only."""
    if model_name not in _RERANK_MODEL_CACHE:
        _RERANK_MODEL_CACHE[model_name] = HuggingFaceCrossEncoder(model_name=model_name)
    return _RERANK_MODEL_CACHE[model_name]


def get_reranked_query_results(query, top_n=3):
    """
    Retrieve candidates for ``query`` and keep the best ones after reranking.

    Candidates come from ``get_retriever()`` and are rescored by the cross-encoder
    named in ``config['rerank_model']``.

    Args:
        query: the user's search text.
        top_n: number of documents to keep after reranking (default 3).

    Returns:
        The reranked documents returned by the compression retriever. They are
        also printed between two separator lines.
    """
    compressor = CrossEncoderReranker(
        model=_get_rerank_model(config['rerank_model']),
        top_n=top_n
    )
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=get_retriever()
    )
    compressed_docs = compression_retriever.invoke(query)
    print("="*80)
    print(f"{compressed_docs}")
    print("="*80)

    return compressed_docs

In [14]:
def generate_llm_response(query, results):
    """
    Ask the chat model to answer ``query`` using only the supplied context.

    ``results`` is interpolated into the system instructions. Those instructions
    are then handed to the prompt template as the value of a template variable,
    not as template text.

    Returns:
        Whatever the ``prompt | llm`` chain returns from ``invoke``.
    """
    chat_model = create_chat_client()
    system_message = f""" 
    You are a helpful AI assistant in fashion domain and expert in looking into given documents and find relevant products. 
    Do not give any product listing outside this context.
    Context is give here:
    #####
    {results}
    #####
    If you don't know the answer, say so. Keep the conversation flowing.
    
    #####
    You extract brand, price, avg_rating, rating_count, image_url and image_path_local from the metadata
    #####
    """
    prompt = ChatPromptTemplate.from_messages([
        ("system", "{system_message}"),
        ("human", "{query}")
    ])
    template_values = {"query" : query, "system_message" : system_message}
    return (prompt | chat_model).invoke(template_values)

In [15]:
def rag_pipeline(user_input):
    """
    Run retrieval, reranking and generation for ``user_input``.

    Each reranked document contributes its metadata dict (as text) immediately
    followed by its page content; entries are separated by a blank line. The
    assembled context is printed between two lines of asterisks and then passed
    to the LLM together with the query.
    """
    top_documents = get_reranked_query_results(user_input)
    context_parts = [str(doc.metadata) + doc.page_content for doc in top_documents]
    formatted_context = "\n\n".join(context_parts)

    banner = "*" * 80
    print(banner)
    print(f"\n\n\n {formatted_context} \n\n\n")
    print(banner)

    return generate_llm_response(user_input, formatted_context)

In [16]:
def get_answer(question):
    """Answer ``question`` by delegating to ``rag_pipeline`` and returning its result."""
    return rag_pipeline(question)

In [17]:
# Start the process and set path for dataset

chunked_data = []

# Kaggle and Colab use fixed dataset locations; any other platform falls back
# to the path stored in the configuration.
folder_path = Path(
    "/kaggle/input/myntra-fashion-product-dataset" if (config['platform']).lower() == 'kaggle'
    else "/content/data/" if (config['platform']).lower() == 'colab'
    else config['data_path']
)

print(folder_path)

/kaggle/input/myntra-fashion-product-dataset


In [18]:
# Create the embedding model once; it is needed both to build and to load the index
embedding_model = get_embeddings_model()

# Chunk the dataset only when the index has to be (re)built: either a refresh
# was requested or no saved index exists yet. This is the same rule
# create_vector_store applies, so a first run with refresh 'N' builds the index
# instead of failing on a missing file.
if config['refresh_vector_store'] == 'Y' or not os.path.exists(config['vector_store_name']):
    chunked_data = get_data_chunks(folder_path)

# Builds and saves the index, or loads the saved one (chunked_data is ignored then)
vector_store = create_vector_store(chunked_data, embedding_model)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
# Sample search query.
# NOTE(review): no visible cell reads this value, and the Gradio cell later
# rebinds `user_input` to a Textbox component — confirm it is still needed.
user_input = "party dresses for women"

In [24]:
import html

def gradio_chat_interface(user_input):
    """
    Gradio callback: answer a search query and collect up to three product images.

    Args:
        user_input: the query text typed by the user.

    Returns:
        A 4-tuple ``(text, image1, image2, image3)``. ``text`` is the LLM answer
        with HTML entities decoded. Each image slot holds the opened image of the
        corresponding reranked document, or ``None`` when the document has no
        local image path, the file does not exist, or it cannot be opened.
    """
    results = get_reranked_query_results(user_input)
    response = generate_llm_response(user_input, results)

    # html.unescape already decodes entities such as &quot; and &amp;
    cleaned_text = html.unescape(response.content)

    # Get product images
    images = []
    for doc in results[:3]:
        image_path = doc.metadata.get('image_path_local')
        image = None
        if image_path and os.path.exists(image_path):
            try:
                image = Image.open(image_path)
            except Exception:
                # Unreadable or corrupt file: leave the slot empty. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                image = None
        images.append(image)

    # Always hand Gradio exactly three image slots
    images.extend([None] * (3 - len(images)))

    return cleaned_text, images[0], images[1], images[2]



In [25]:
# Create Gradio interface
# Layout: title, query row (textbox + button), answer box, then a row of three
# product images. Both the button click and pressing Enter in the textbox run
# the same callback with the same inputs and outputs.
with gr.Blocks(title="Semantic Spotter") as demo:
    gr.Markdown("# 🛍️ Semantic Spotter - Fashion Search")

    with gr.Row():
        user_input = gr.Textbox(
            label="Search Query",
            placeholder="party dresses for women",
            scale=3,
        )
        submit_btn = gr.Button("Search", variant="primary", scale=1)

    response_text = gr.Textbox(label="Recommendations", lines=6, interactive=False)

    with gr.Row():
        img1 = gr.Image(label="Product 1", height=200)
        img2 = gr.Image(label="Product 2", height=200)
        img3 = gr.Image(label="Product 3", height=200)

    submit_btn.click(
        fn=gradio_chat_interface,
        inputs=[user_input],
        outputs=[response_text, img1, img2, img3],
    )
    user_input.submit(
        fn=gradio_chat_interface,
        inputs=[user_input],
        outputs=[response_text, img1, img2, img3],
    )

# share=True also publishes a temporary public URL for the app
demo.launch(share=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://eb0b3a2a4989e4e809.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


