In [4]:
import os
from pathlib import Path

# Configuration - Use environment variables for sensitive data
config = {
    'data_path': Path.cwd() / 'Data',
    'chunk_size': 1000,
    'chunk_overlap': 200,
    'vector_store_name': 'faiss_index',
    'embedding_model': 'all-MiniLM-L6-v2',
    'refresh_vector_store': False,
    'cross_encoder_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
    'domain': 'fashion',
    'chat_model': 'sonar-pro',
    'top_k': 10
}

In [11]:
# load the key from .env file 

import dotenv
dotenv.load_dotenv()
os.environ['PPLX_API_KEY'] = dotenv.get_key('.env', 'PPLX_API_KEY')
PPLX_API_KEY = os.getenv('PPLX_API_KEY')
if not PPLX_API_KEY:
    print("Warning: PPLX_API_KEY not found in environment variables")
else:
    print("key found in env")

key found in env


In [None]:
# Install required packages
! pip install -qU langchain-community pymupdf langchain_huggingface sentence-transformers langchain-perplexity

In [12]:
# Import libraries
from langchain_community.document_loaders import PyMuPDFLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import CrossEncoder
from langchain_core.prompts import ChatPromptTemplate
from langchain_perplexity import ChatPerplexity
import json
from typing import List, Dict, Optional

In [14]:
class SemanticSpotter:
    def __init__(self, config: Dict):
        self.config = config
        self._embedding_model = None
        self._vector_store = None
        self._cross_encoder = None
        self._chat_client = None
    
    @property
    def embedding_model(self):
        if self._embedding_model is None:
            self._embedding_model = HuggingFaceEmbeddings(
                model_name=self.config['embedding_model'],
                show_progress=True
            )
        return self._embedding_model
    
    @property
    def cross_encoder(self):
        if self._cross_encoder is None:
            self._cross_encoder = CrossEncoder(self.config['cross_encoder_model'])
        return self._cross_encoder
    
    @property
    def chat_client(self):
        if self._chat_client is None and PPLX_API_KEY:
            self._chat_client = ChatPerplexity(
                temperature=0,
                pplx_api_key=PPLX_API_KEY,
                model=self.config['chat_model']
            )
        return self._chat_client
    
    def load_documents(self, folder_path: Path) -> List:
        """Load documents from folder with error handling"""
        all_documents = []
        supported_extensions = {'.pdf': PyMuPDFLoader, '.csv': CSVLoader, '.txt': TextLoader}
        
        for file_path in folder_path.iterdir():
            if file_path.suffix.lower() in supported_extensions:
                try:
                    loader_class = supported_extensions[file_path.suffix.lower()]
                    loader = loader_class(str(file_path))
                    documents = loader.load()
                    all_documents.extend(documents)
                    print(f"Loaded {len(documents)} documents from {file_path.name}")
                except Exception as e:
                    print(f"Error loading {file_path.name}: {e}")
        
        return all_documents
    
    def create_chunks(self, documents: List) -> List:
        """Split documents into chunks"""
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.config['chunk_size'],
            chunk_overlap=self.config['chunk_overlap']
        )
        return text_splitter.split_documents(documents)
    
    def setup_vector_store(self) -> Optional[FAISS]:
        """Setup or load vector store"""
        vector_store_path = self.config['vector_store_name']
        
        # Try to load existing vector store first
        if not self.config['refresh_vector_store'] and os.path.exists(vector_store_path):
            try:
                self._vector_store = FAISS.load_local(
                    vector_store_path, 
                    self.embedding_model, 
                    allow_dangerous_deserialization=True
                )
                print("Loaded existing vector store")
                return self._vector_store
            except Exception as e:
                print(f"Error loading vector store: {e}. Creating new one...")
        
        # Create new vector store
        if not self.config['data_path'].exists():
            print(f"Data path {self.config['data_path']} does not exist")
            return None
        
        documents = self.load_documents(self.config['data_path'])
        if not documents:
            print("No documents found")
            return None
        
        chunks = self.create_chunks(documents)
        print(f"Created {len(chunks)} chunks")
        
        self._vector_store = FAISS.from_documents(chunks, self.embedding_model)
        self._vector_store.save_local(vector_store_path)
        print("Vector store created and saved")
        
        return self._vector_store
    
    def rerank_results(self, query: str, results: List) -> List:
        """Rerank results using cross-encoder"""
        if not results:
            return results
        
        pairs = [[query, doc.page_content] for doc in results]
        scores = self.cross_encoder.predict(pairs)
        
        # Sort by score (descending)
        scored_results = list(zip(results, scores))
        scored_results.sort(key=lambda x: x[1], reverse=True)
        
        return [doc for doc, score in scored_results]
    
    def search(self, query: str, use_reranking: bool = True) -> List:
        """Search for relevant documents"""
        if not self._vector_store:
            print("Vector store not initialized")
            return []
        
        results = self._vector_store.similarity_search(query, k=self.config['top_k'])
        
        if use_reranking and results:
            results = self.rerank_results(query, results)
        
        return results
    
    def generate_response(self, query: str, results: List) -> Optional[str]:
        """Generate LLM response"""
        if not self.chat_client:
            print("Chat client not available. Check PPLX_API_KEY")
            return None
        
        if not results:
            return json.dumps({"message": "No relevant products found"})
        
        context = "\n\n".join([doc.page_content for doc in results])
        
        system_prompt = """
        You are a helpful assistant in {domain} domain.
        You help users find products from the given context.
        Return a JSON array of products with these fields: name, gender, material, fit, features, sizes, price, notes, source, images.
        Leave fields blank if no information is available.

        Context: {context}
        """
        
        prompt = ChatPromptTemplate.from_messages([
            ("system", system_prompt),
            ("human", "{query}")
        ])
        
        try:
            chain = prompt | self.chat_client
            response = chain.invoke({
                "context": context,
                "domain": self.config['domain'],
                "query": query
            })
            return response.content
        except Exception as e:
            print(f"Error generating response: {e}")
            return None
    
    def query(self, user_query: str) -> Optional[str]:
        """Main query method"""
        results = self.search(user_query)
        return self.generate_response(user_query, results)

In [None]:
# Initialize the semantic spotter and set up vector store

spotter = SemanticSpotter(config)
vector_store = spotter.setup_vector_store()

if vector_store:
    print("Vector store created. Now ready for queries!")
else:
    print("Failed to initialize vector store")

Loaded existing vector store
Vector store created. Now ready for queries!


In [21]:
query = "van heusen formal shirts"

In [23]:
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 1})
res = retriever.invoke(query)

Batches: 100%|██████████| 1/1 [00:00<00:00,  5.99it/s]


{'source': '/Users/RajivGaba/aiml_projects/Semantic Spotter/Data/Myntra-Ecommerce__20231101_20231130_sample.csv', 'row': 6}


In [17]:
# Example query
if vector_store:
    query = "van heusen formal shirts"
    response = spotter.query(query)
    
    if response:
        try:
            # Pretty print JSON response
            parsed_response = json.loads(response)
            print(json.dumps(parsed_response, indent=2))
        except json.JSONDecodeError:
            print(response)
    else:
        print("No response generated")

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s]


```json
[
  {
    "name": "Van Heusen Men's Poplin Dress Shirt, Classic & Athletic Fit",
    "gender": "Men",
    "material": "",
    "fit": "Classic & Athletic Fit",
    "features": "Wrinkle free, poplin fabric",
    "sizes": "",
    "price": "24.99 (50% off, original $50.00)",
    "notes": "Limited-Time Special. Bonus offer with purchase.",
    "source": "Macy's",
    "images": ""
  },
  {
    "name": "Van Heusen Men's Classic/Regular Fit Stretch Wrinkle Free Sateen Dress Shirt",
    "gender": "Men",
    "material": "Sateen (stretch, wrinkle free)",
    "fit": "Classic/Regular Fit",
    "features": "Stretch, wrinkle free",
    "sizes": "",
    "price": "24.99 (50% off, original $50.00)",
    "notes": "Limited-Time Special. Bonus offer with purchase.",
    "source": "Macy's",
    "images": ""
  },
  {
    "name": "Van Heusen Men's Regular-Fit Ultraflex Dress Shirt",
    "gender": "Men",
    "material": "",
    "fit": "Regular Fit",
    "features": "Ultraflex",
    "sizes": "",
    "pr

In [19]:
# Interactive query function
def interactive_search():
    """Interactive search interface"""
    if not vector_store:
        print("Vector store not available")
        return
    
    while True:
        query = input("\nEnter your search query (or 'quit' to exit): ").strip()
        
        if query.lower() in ['quit', 'exit', 'q']:
            break
        
        if not query:
            continue
        
        print(f"\nSearching for: {query}")
        response = spotter.query(query)
        
        if response:
            try:
                parsed_response = json.loads(response)
                print(json.dumps(parsed_response, indent=2))
            except json.JSONDecodeError:
                print(response)
        else:
            print("No response generated")

In [20]:
# Uncomment to run interactive search
interactive_search()


Searching for: wrist band


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.39it/s]


```json
[
  {
    "name": "",
    "gender": "Men",
    "material": "",
    "fit": "",
    "features": "Regular length, Long sleeves, Regular sleeves, Machine wash",
    "sizes": "",
    "price": "",
    "notes": "Product appears to be a men's shirt from Arrow, not a wrist band.",
    "source": "",
    "images": [
      "http://assets.myntassets.com/assets/images/productimage/2021/2/9/36e0ba42-e45c-4fb4-8ab8-193e1bb444021612857623859-1.jpg",
      "http://assets.myntassets.com/assets/images/productimage/2021/2/9/f4e66868-0951-4661-aec1-d32228895d491612857623883-2.jpg",
      "http://assets.myntassets.com/assets/images/productimage/2021/2/9/72449bea-4951-4973-a82b-4e2093bb173a1612857623904-3.jpg"
    ]
  },
  {
    "name": "",
    "gender": "Men",
    "material": "",
    "fit": "",
    "features": "Regular length, Short sleeves, Regular sleeves, Machine wash, Solid pattern",
    "sizes": "",
    "price": "",
    "notes": "Product appears to be a men's shirt from U.S. Polo Assn., not a wr

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.89it/s]


```json
[
  {
    "name": "U.S. Polo Assn. Men Navy Blue Solid Tailored Fit Pure Cotton Shirt",
    "gender": "Men",
    "material": "Pure Cotton",
    "fit": "Tailored Fit",
    "features": "Solid pattern, Regular length, Short sleeves, Regular sleeves, Machine wash",
    "sizes": "",
    "price": "",
    "notes": "Not a trouser, but a formal shirt option from U.S. Polo Assn. for pairing with formal trousers.",
    "source": "U.S. Polo Assn.",
    "images": [
      "http://assets.myntassets.com/assets/images/19167196/2022/12/7/f89fa144-6320-4154-94eb-0c84c1151efa1670394020027-U-S-Polo-Assn-Men-Navy-Blue-Solid-Tailored-Fit-Pure-Cotton-C-1.jpg",
      "http://assets.myntassets.com/assets/images/19167196/2022/12/7/b2e92ee6-489d-410c-a1f4-733329add4d71670394020002-U-S-Polo-Assn-Men-Navy-Blue-Solid-Tailored-Fit-Pure-Cotton-C-2.jpg"
    ]
  },
  {
    "name": "Arrow Men Regular Fit Long Sleeve Shirt",
    "gender": "Men",
    "material": "",
    "fit": "Regular Fit",
    "features": "Regula