In [2]:
import os
import warnings

# SECURITY: never hardcode API keys in a notebook. The Perplexity key is read
# from the PPLX_API_KEY environment variable instead. Any key that was
# previously committed in this cell must be treated as leaked and rotated.
pplx_api_key = os.environ.get("PPLX_API_KEY", "")
if not pplx_api_key:
    # Warn instead of raising: the retrieval cells do not need the key, only
    # the chat client does.
    warnings.warn(
        "PPLX_API_KEY is not set; calls to the Perplexity chat model will fail "
        "until it is exported in the kernel's environment."
    )

In [3]:
# configurations
import os

# Central settings read by the helper functions and cells of this notebook.
config = {
    # Folder scanned for .pdf/.csv/.txt source files. Set the
    # SEMANTIC_SPOTTER_DATA_PATH environment variable to use a different
    # location; the default keeps the original local path.
    'data_path' : os.environ.get(
        'SEMANTIC_SPOTTER_DATA_PATH',
        '/Users/RajivGaba/aiml_projects/Semantic Spotter/Data/'
    ),
    # Passed to RecursiveCharacterTextSplitter.
    'chunk_size' : 1000,
    'chunk_overlap' : 200,
    # Local path the FAISS index is saved to / loaded from.
    'vector_store_name' : "faiss_index",
    # Placeholder value; not read by any code visible in this notebook.
    'hf_token' : "*****",
    # Model name handed to HuggingFaceEmbeddings.
    'embedding_model' : 'all-MiniLM-L6-v2',
    # 'Y' re-chunks the data and rebuilds the vector store.
    'refresh_vector_store' : 'Y',
    # Model name handed to sentence_transformers.CrossEncoder.
    'cross_encoder_model': 'cross-encoder/ms-marco-MiniLM-L-6-v2',
    'PPLX_API_KEY' : pplx_api_key,
    # Domain name interpolated into the system prompt.
    'domain' : 'fashion',
    # Model name handed to ChatPerplexity.
    'chat_model' : "sonar-pro"
}

In [4]:
# install required packages

! pip install -qU langchain-community pymupdf
! pip install -qU langchain_huggingface
! pip install sentence-transformers
! pip install -qU "langchain-perplexity"



In [5]:
# Disabled one-off dataset download: the code below is wrapped in a string
# literal so it does not execute; the cell merely evaluates (and echoes) the string.
# NOTE(review): it would download into './data/', whereas config['data_path']
# points at a different folder — confirm which location is intended before re-enabling.
"""
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
api.authenticate()
api.dataset_download_files('promptcloud/myntra-e-commerce-product-data-november-2023', path='./data/', unzip=True)
"""

"\nimport kaggle\nfrom kaggle.api.kaggle_api_extended import KaggleApi\napi = KaggleApi()\napi.authenticate()\napi.dataset_download_files('promptcloud/myntra-e-commerce-product-data-november-2023', path='./data/', unzip=True)\n"

In [6]:
# import libraries

import os, glob
import importlib

# Best-effort reload of the importlib module itself.
# NOTE: this does NOT restart the kernel; after installing or upgrading
# packages, restart the kernel manually so the new versions are imported.
try:
    importlib.reload(importlib)
except Exception:
    # Deliberately non-fatal. Catching Exception (not a bare except) lets
    # KeyboardInterrupt / SystemExit propagate.
    pass

from pathlib import Path
from langchain_community.document_loaders import PyMuPDFLoader, CSVLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import CrossEncoder, util
from langchain_core.prompts import ChatPromptTemplate
from langchain_perplexity import ChatPerplexity

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Define reusable functions

def get_data_chunks(folder_path, chunk_size=None, chunk_overlap=None):
    """Load every supported file in a folder and split it into text chunks.

    Args:
        folder_path: Directory to scan (a ``pathlib.Path`` or a path string).
            Only direct children are considered; ``.pdf``, ``.csv`` and
            ``.txt`` files are loaded, everything else is skipped.
        chunk_size: Optional override for ``config['chunk_size']``.
        chunk_overlap: Optional override for ``config['chunk_overlap']``.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Suffix -> loader class; suffixes are compared case-insensitively.
    loader_by_suffix = {
        ".pdf": PyMuPDFLoader,
        ".csv": CSVLoader,
        ".txt": TextLoader,
    }

    all_documents = []

    # sorted() makes the load order (and therefore the chunk order)
    # reproducible; iterdir() yields entries in arbitrary order.
    for file_path in sorted(Path(folder_path).iterdir()):
        loader_cls = loader_by_suffix.get(file_path.suffix.lower())
        if loader_cls is None or not file_path.is_file():
            continue  # Skip unsupported file types and directories

        all_documents.extend(loader_cls(str(file_path)).load())

    # chunking/splitting
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config['chunk_size'] if chunk_size is None else chunk_size,
        chunk_overlap=config['chunk_overlap'] if chunk_overlap is None else chunk_overlap
    )
    return text_splitter.split_documents(documents=all_documents)

def get_embeddings_model():
    """Build the HuggingFace embeddings object named by config['embedding_model'].

    Progress display is switched on via ``show_progress=True``.
    """
    return HuggingFaceEmbeddings(
        show_progress=True,
        model_name=config['embedding_model'],
    )

def create_vector_store(text_chunks, embedding_model):
    """Build (and persist) or load the FAISS vector store.

    The index is rebuilt from ``text_chunks`` when
    ``config['refresh_vector_store']`` is ``'Y'`` or when no saved index exists
    at ``config['vector_store_name']``; otherwise the saved index is loaded.

    Args:
        text_chunks: Documents to embed when the index is (re)built.
        embedding_model: Embeddings object used for building and loading.

    Returns:
        The FAISS vector store.

    Raises:
        ValueError: If the index has to be built but ``text_chunks`` is empty.
    """
    store_path = config['vector_store_name']

    # Build on an explicit refresh OR on a first run where nothing has been
    # saved yet. (Requiring the path to exist before building made a first run
    # try to load an index that was never created.)
    must_build = config['refresh_vector_store'] == 'Y' or not os.path.exists(store_path)

    if must_build:
        if not text_chunks:
            raise ValueError(
                "Cannot build the vector store: no text chunks were provided."
            )
        vector_store = FAISS.from_documents(text_chunks, embedding_model)
        vector_store.save_local(store_path)
    else:
        # NOTE: allow_dangerous_deserialization enables pickle loading; only
        # load index folders that were created locally by this notebook.
        vector_store = FAISS.load_local(store_path, embedding_model, allow_dangerous_deserialization=True)

    return vector_store

def get_cross_encoder_score(query, results):
    """Score each retrieved document against the query with a cross-encoder.

    Args:
        query: The user query string.
        results: Iterable of documents exposing a ``page_content`` attribute.

    Returns:
        A list with one score per document, in the order of ``results``
        (empty when ``results`` is empty). Each score is also printed.
    """
    results = list(results)
    if not results:
        return []

    cross_encoder = CrossEncoder(config['cross_encoder_model'])

    # Score all (query, document) pairs in one batched call instead of running
    # one forward pass per document.
    pairs = [[query, res.page_content] for res in results]
    scores = list(cross_encoder.predict(pairs))

    for ce_score in scores:
        print(ce_score)
    return scores

def create_chat_client():
    """Instantiate the ChatPerplexity client from the notebook config.

    Uses temperature 0, the API key in config['PPLX_API_KEY'] and the model
    named by config['chat_model'].
    """
    client_kwargs = {
        "temperature": 0,
        "pplx_api_key": config['PPLX_API_KEY'],
        "model": config['chat_model'],
    }
    return ChatPerplexity(**client_kwargs)

def get_llm_response(query, results, domain):
    """Ask the chat model to pick products from the retrieved context.

    Args:
        query: The user's request, sent as the human message.
        results: Retrieved documents, interpolated into the system prompt as
            the context.
        domain: Domain name interpolated into the system prompt. Falls back to
            ``config['domain']`` when empty/None.

    Returns:
        The ``content`` of the chat model's response.
    """
    # Field layout the model is asked to reproduce in its JSON answer.
    output_json = {
        "name": "<<value>>",
        "gender": "<<value>>",
        "material": "<<value>>",
        "fit": "<<value>>",
        "features": "<<value>>",
        "sizes": "<<value>>",
        "price": "<<value>>",
        "notes": "<<value>>",
        "source": "<<value>>",
        "images" : "<<value>>"
    }

    system = """You are a helpful assistant in {domain} domain. 
    You are here to help users find products for their requirements from a given list of products.
    #####
    Here is the context: {context}
    #####
    You return the output in JSON format that has all elements as present in {output_json}. If there is no value in any of the fields, leave it blank.
    """

    human = "{query}"

    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

    chat = create_chat_client()

    chain = prompt | chat
    response = chain.invoke(
        {
            "context" : results,
            # Honor the caller-supplied domain; it used to be ignored in
            # favor of config['domain'].
            "domain" : domain or config['domain'],
            "query": query,
            "output_json" : output_json
        }
    )
    return response.content

In [8]:
if __name__ == "__main__":
    chunked_data = []
    folder_path = Path(config['data_path'])

    # Step 1: Generate chunks from the dataset
    # (only needed when the vector store is going to be rebuilt)
    if config['refresh_vector_store'] == 'Y':
        chunked_data = get_data_chunks(folder_path)

    # Step 2: Create embeddings and put them into a vector store
    embedding_model = get_embeddings_model()
    vector_store = create_vector_store(chunked_data, embedding_model)
    if vector_store is None:
        # create_vector_store only persisted the index without returning it,
        # so read it back from disk. Otherwise the returned store is reused
        # and the redundant reload is skipped.
        vector_store = FAISS.load_local(config['vector_store_name'], embedding_model, allow_dangerous_deserialization=True)

    # Step 3: Initiate LLM for conversation with user


    # get_cross_encoder_score(query, results)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.02it/s]


In [16]:
%%time 

query = "van heusen formal shirts"
results = vector_store.similarity_search(query, k=20)
# print(get_llm_response(query, results, config['domain']))

Batches: 100%|██████████| 1/1 [00:00<00:00, 11.35it/s]

CPU times: user 6.56 ms, sys: 52.5 ms, total: 59.1 ms
Wall time: 91.4 ms





In [10]:
# Preview only the first few hits; echoing every retrieved document floods the cell output.
results[:3]

[Document(id='fb171b5f-60f1-414f-ba7d-809e9f60da55', metadata={'source': '/Users/RajivGaba/aiml_projects/Semantic Spotter/Data/Myntra-Ecommerce__20231101_20231130_sample.csv', 'row': 6}, page_content='Uniq Id: 06bd6038b28cee422d1aaeda647e6fe8\nCrawl Timestamp: 2023-11-02 08:24:48 +0000\nPageurl: https://www.myntra.com/13619958\nPdp Url: https://www.myntra.com/13619958\nProduct Id: 13619958\nProduct Name: Arrow Men White & Black Checked Formal Shirt\nList Price: 2399\nSale Price: 2399\nDiscount Percentage: 0.0\nBrand: Arrow\nProduct Rank: NA\nSize: 38\nAvailability: No\nProduct Description: White checked formal shirt, has a spread collar, button placket, and curved hem\nSeller: \nCategory Url: https://www.myntra.com/shirts\nInput Seed Url: https://www.myntra.com/13619958\nColor: White\nAverage Rating: 0.0\nTotal Number Of Ratings: 0\nType: \nSleeve: \nFit: Regular Fit\nFabric Or Material: Cotton\nNeck: \nPattern: Checked\nFabric Care: \nClosure: \nSuitable For: \nCollar: \nIdeal For: \n

In [17]:
%%time 

retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 10})
results = retriever.invoke(query)

Batches: 100%|██████████| 1/1 [00:00<00:00,  7.45it/s]

CPU times: user 8.77 ms, sys: 58.4 ms, total: 67.1 ms
Wall time: 142 ms



