In [1]:
from langchain_community.document_loaders import DirectoryLoader,CSVLoader
class Dataloader:
    """Load every ``*.csv`` file in a folder into LangChain documents.

    Attributes:
        loader: DirectoryLoader configured with ``glob="*.csv"`` and CSVLoader.
        documents: The list returned by ``loader.load()``, populated eagerly
            in ``__init__``.
    """

    def __init__(self, folder_path: str = "data"):
        """Create the directory loader for ``folder_path`` and load it immediately."""
        self.loader = DirectoryLoader(folder_path, glob="*.csv", loader_cls=CSVLoader)
        self.documents = self.loader.load()

    def info(self) -> None:
        """Print the document count and a preview of the first and last document.

        Each preview is the first 200 characters of ``page_content`` followed
        by ``...``. The method only prints and returns None, so call it
        directly instead of wrapping it in ``print()``.
        """
        print(f"Loaded {len(self.documents)} documents")
        # Nothing to preview: indexing [0] / [-1] on an empty list would raise IndexError.
        if not self.documents:
            return

        print("\nFirst document preview:")
        print(self.documents[0].page_content[:200] + "...")

        print("\nLast document preview:")
        print(self.documents[-1].page_content[:200] + "...")

dataloader = Dataloader("./data")

# info() prints its own summary and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the previews.
dataloader.info()


  from .autonotebook import tqdm as notebook_tqdm


Loaded 450 documents

First document preview:
product_id: ACCFZGAQJGYCYDCM
product_title: BoAt Rockerz 235v2 with ASAP charging Version 5.0 Bluetooth Headset
rating: 5
summary: Terrific purchase
review: 1-more flexible2-bass is very high3-sound c...

Last document preview:
product_id: ACCEVQZABYWJHRHF
product_title: BoAt BassHeads 100 Wired Headset
rating: 4
summary: Wonderful
review: a very good earphone in budget sound is awesome very pure sound but the bass is ok ok....
None


In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

class Splitter:
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Attributes:
        text_splitter: The splitter built from ``chunk_size`` and ``chunk_overlap``.
        chunks: The result of ``text_splitter.split_documents(documents)``,
            computed eagerly in ``__init__``.
    """

    def __init__(self, documents, chunk_size: int = 500, chunk_overlap: int = 50):
        """Configure the splitter and split ``documents`` immediately."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        self.chunks = self.text_splitter.split_documents(documents)

    def info(self) -> None:
        """Print the chunk count and a preview of the first and last chunk.

        Each preview is the first 200 characters of ``page_content`` followed
        by ``...``. The method only prints and returns None.
        """
        print(f"Created {len(self.chunks)} chunks")
        # Nothing to preview: indexing [0] / [-1] on an empty list would raise IndexError.
        if not self.chunks:
            return

        print("\nFirst chunk preview:")
        print(self.chunks[0].page_content[:200] + "...")

        # Label the second preview so it is distinguishable from the first.
        print("\nLast chunk preview:")
        print(self.chunks[-1].page_content[:200] + "...")


In [3]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
import os
from dotenv import load_dotenv

# Run python-dotenv before reading the environment.
load_dotenv()
# os.getenv() reads from os.environ, so any token found here is already
# exported; assigning it back to os.environ was a no-op and has been dropped.
hf_key = os.getenv("HUGGINGFACEHUB_API_TOKEN")
    

class Embed:
    """Hold a HuggingFaceBgeEmbeddings instance that runs on the CPU.

    Attributes:
        embed: The embeddings object built in ``__init__``.
    """

    def __init__(self, model_name="BAAI/bge-small-en-v1.5"):
        """Instantiate the embeddings for ``model_name`` with ``device`` set to ``cpu``."""
        # NOTE(review): the wrapper class is BGE-specific by name — confirm it
        # is the intended one when a non-BGE model_name is passed.
        settings = {
            "model_name": model_name,
            "model_kwargs": {"device": "cpu"},
        }
        self.embed = HuggingFaceBgeEmbeddings(**settings)

    def test_embedding(self, text="hello world"):
        """Embed ``text`` as a query and print the result; returns None."""
        vector = self.embed.embed_query(text)
        print(vector)

In [4]:
from langchain_community.vectorstores import Chroma

class VectorStore:
    """Build a Chroma collection named ``rag_collection`` from document chunks.

    Attributes:
        vectorstore: The object returned by ``Chroma.from_documents``.
    """

    def __init__(self, chunks, embedding, persist_directory="./db"):
        """Create the store from ``chunks`` using ``embedding`` and ``persist_directory``."""
        # NOTE(review): re-running against an existing persist_directory may
        # add the same chunks again — confirm against the Chroma docs.
        store_options = {
            "documents": chunks,
            "embedding": embedding,
            "persist_directory": persist_directory,
            "collection_name": "rag_collection",
        }
        self.vectorstore = Chroma.from_documents(**store_options)

    def info(self):
        """Print how many vectors the underlying collection reports; returns None."""
        # Reads the count via the store's private `_collection` attribute.
        vector_count = self.vectorstore._collection.count()
        print(f"Vector store created with {vector_count} vectors")


In [None]:
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.chat_models import init_chat_model
from dotenv import load_dotenv
import os
from langchain_classic.chains import create_retrieval_chain, create_history_aware_retriever

from langchain_classic.chains.combine_documents import create_stuff_documents_chain

# Run python-dotenv before reading the environment.
load_dotenv()
# os.getenv() reads from os.environ, so any key found here is already
# exported; assigning it back to os.environ was a no-op and has been dropped.
groq_key = os.getenv("GROQ_API_KEY")


class Chain:
    """Chat model plus a conversational retrieval chain with per-session history.

    Attributes:
        llm: The chat model returned by ``init_chat_model``.
        history_store: Dict mapping a session id to its ChatMessageHistory;
            it lives on the instance, so histories last as long as it does.
    """

    def __init__(self, model_name: str = "groq:llama-3.1-8b-instant"):
        """Initialise the chat model.

        Args:
            model_name: Model identifier passed to ``init_chat_model``. The
                string used to sit in the annotation slot, which left the
                parameter without a default; it is now a real default value.
        """
        self.llm = init_chat_model(model=model_name)
        self.history_store = {}

    def llmtest(self, prompt="Hello how are you?"):
        """Invoke the model with ``prompt`` and print the reply's content."""
        print(self.llm.invoke(prompt).content)

    def __ragChain__(self, retriever):
        """Build the history-aware retrieval chain around ``retriever``.

        Steps:
          1. A history-aware retriever is created from the model, ``retriever``
             and a prompt asking for a standalone reformulation of the question.
          2. A stuff-documents chain is created from the model and the QA
             prompt, which receives the retrieved text as ``{context}``.
          3. Both are combined with ``create_retrieval_chain`` and wrapped in
             RunnableWithMessageHistory using ``self._getHistory``.

        Returns:
            The RunnableWithMessageHistory wrapper. It reads the question from
            the ``input`` key, injects history as ``chat_history`` and records
            the ``answer`` key as the output message.

        NOTE(review): double-underscore names on both sides are conventionally
        reserved for Python itself; the name is kept for existing callers.
        """
        contextualize_q_system_prompt = """Given a chat history and the latest user question 
        which might reference context in the chat history, formulate a standalone question 
        which can be understood without the chat history. Do NOT answer the question, 
        just reformulate it if needed and otherwise return it as is."""

        contextualize_q_prompt = ChatPromptTemplate.from_messages([
            ("system", contextualize_q_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
        ])

        history_aware_retriever = create_history_aware_retriever(
            self.llm, retriever, contextualize_q_prompt
        )

        qa_system_prompt = """You're an e-commerce bot answering product-related queries using reviews and titles.
        Use the following pieces of retrieved context to answer the question in short. Only write short and concise answers 
        If you don't know the answer, just say that you don't know. 

        Context: {context}"""

        qa_prompt = ChatPromptTemplate.from_messages([
            ("system", qa_system_prompt),
            MessagesPlaceholder(variable_name="chat_history"),
            ("human", "{input}"),
        ])
        question_answer_chain = create_stuff_documents_chain(self.llm, qa_prompt)

        conversational_rag_chain = create_retrieval_chain(
            history_aware_retriever,
            question_answer_chain
        )

        return RunnableWithMessageHistory(
            conversational_rag_chain,
            self._getHistory,
            input_messages_key="input",
            history_messages_key="chat_history",
            output_messages_key="answer"
        )

    def _getHistory(self, session_id: str) -> "BaseChatMessageHistory":
        """Return the history for ``session_id``, creating an empty one on first use."""
        if session_id not in self.history_store:
            self.history_store[session_id] = ChatMessageHistory()
        return self.history_store[session_id]
    


In [14]:
splitter = Splitter(dataloader.documents)

print(f"Created {len(splitter.chunks)} chunks from {len(dataloader.documents)} documents")
print("\nChunk example:")
print(f"Content: {splitter.chunks[0].page_content[:150]}...")
print(f"Metadata: {splitter.chunks[0].metadata}")

# NOTE(review): Embed wraps HuggingFaceBgeEmbeddings, but this model name is
# not a BGE model — confirm the wrapper is appropriate for it.
Embedder = Embed("sentence-transformers/all-mpnet-base-v2")
# test_embedding() prints the vector itself and returns None, so it is called
# directly; interpolating it into another print() only produced "Embed: None".
Embedder.test_embedding("Hello world")

vectorstore = VectorStore(splitter.chunks, Embedder.embed, "./db")
# The keyword is `search_kwargs` (plural). The misspelled `search_kwarg` is not
# the retriever's search configuration, so the requested k=3 was not applied.
retriever = vectorstore.vectorstore.as_retriever(search_kwargs={"k": 3})


Created 591 chunks from 450 documents

Chunk example:
Content: product_id: ACCFZGAQJGYCYDCM
product_title: BoAt Rockerz 235v2 with ASAP charging Version 5.0 Bluetooth Headset
rating: 5
summary: Terrific purchase...
Metadata: {'source': 'data/flipkart_product_review.csv', 'row': 0}
[0.06296711415052414, 0.009567681699991226, -0.03545382246375084, 0.014043353497982025, -0.018237454816699028, -0.007566687185317278, 0.023433687165379524, 0.03624479100108147, 0.030070124194025993, -0.038829974830150604, 0.021599069237709045, -0.0234239362180233, -0.012908977456390858, -0.010738784447312355, 0.04577728733420372, -0.0926717147231102, 0.05749331787228584, 0.009452620521187782, -0.011804714798927307, 0.022536545991897583, 0.01559105422347784, 0.024114087224006653, 0.03057534620165825, 0.028718991205096245, 0.021169887855648994, 0.0023792246356606483, -0.03334937244653702, 0.006367149762809277, 0.04073214903473854, -0.03829704597592354, -0.03688373416662216, -0.0033579429145902395, 0.02402672730

In [21]:
# Build the conversational chain once; the following cells reuse `rag_chain`.
llm_chain = Chain(model_name="groq:llama-3.1-8b-instant")
rag_chain = llm_chain.__ragChain__(retriever)

user_input = "What is the best product for gaming under 20k?"

# The session id selects which stored chat history the chain reads and extends.
session_config = {"configurable": {"session_id": "user-session"}}
result = rag_chain.invoke({"input": user_input}, config=session_config)
reponse = result["answer"]
reponse

"I don't know."

In [20]:
user_input = "What is the best price for BoAt Rockerz 235v2?"

# Same session id as the other question cells, so they share one chat history.
session_config = {"configurable": {"session_id": "user-session"}}
result = rag_chain.invoke({"input": user_input}, config=session_config)
reponse = result["answer"]
reponse

'I don\'t know the current price, but users have mentioned it is a "super" value for money in the given reviews.'

In [None]:
user_input = "What is BoAt Rockerz ?"

# Same session id as the other question cells, so they share one chat history.
session_config = {"configurable": {"session_id": "user-session"}}
result = rag_chain.invoke({"input": user_input}, config=session_config)
reponse = result["answer"]
reponse

'BoAt Rockerz is a Bluetooth Headset.'

In [23]:
# Follow-up question: "it" can only be resolved through the stored chat history.
user_input = "Tell me the features of it."

session_config = {"configurable": {"session_id": "user-session"}}
result = rag_chain.invoke({"input": user_input}, config=session_config)
reponse = result["answer"]
reponse

"I don't have information about BoAt Rockerz. The context provided earlier is related to realme Buds Wireless Bluetooth Headset."

In [24]:
# Asks about the conversation itself, using the same session's chat history.
user_input = "What question did I ask earlier"

session_config = {"configurable": {"session_id": "user-session"}}
result = rag_chain.invoke({"input": user_input}, config=session_config)
reponse = result["answer"]
reponse

'You asked about the features of the BoAt Rockerz. \n\nFrom the context, I can tell that it has: \n\n- ASAP charging\n- Bluetooth 5.0\n- Vibration motor for calls\n- Enables simultaneous device connection'