In [1]:
import pandas as pd
from pathlib import Path
from typing import List
import chromadb
from chromadb.config import Settings
from langchain_core.documents import Document
from langchain_community.document_loaders import CSVLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Quick look at which products the review dataset covers.
# The path is relative to the notebook (same data directory the ingestion cell
# scans) instead of an absolute per-user Windows path, so the cell also runs
# on other machines. The file name must match the CSV in that directory.
df = pd.read_csv(Path("../data/text_files") / "amazon_vfl_reviews.csv")
df["name"].unique()


array(['Mamaearth-Onion-Growth-Control-Redensyl',
       'Mamaearth-Percent-Natural-Berry-Toothpaste',
       'Mamaearth-Natural-Turmeric-Saffron-brightning',
       'Mamaearth-Illuminate-Vitamin-Radiant-Turmeric',
       'Mamaearth-Blemishes-Pigmentation-Blemish-Mulberry',
       'Mamaearth-Face-Wash-100-ml',
       'Mamaearth-Moisturizing-Baby-Bathing-Oatmeal',
       'Godrej-Protekt-Master-Blaster-Handwash',
       'Godrej-No-1-Bathing-Soap-Lime',
       'Godrej-No-1-Bathing-Soap-Turmeric',
       'Godrej-Direct-Cool-Refrigerator-1905-PTDI',
       'Godrej-aer-Pocket-Bathroom-Fragrance',
       'Godrej-Security-Solutions-SEEC9060-Electronic',
       'Godrej-Matic-Spray-Violet-Valley',
       'Godrej-Security-Solutions-Goldilocks-Personal',
       'Godrej-ViroShield-Disinfecting-VIROSHIELD-30UV',
       'Titan-Analog-Black-Dial-Watch-1805NM01',
       'Titan-Analog-White-Dial-Watch-NK1639SL03',
       'Titan-Karishma-Analog-Blue-Watch-1774SM01',
       'Titan-Karishma-Analog-Black-Wa

In [3]:
def process_all_csvs(csv_directory: str):
    """Load every CSV file under ``csv_directory`` (recursively) into documents.

    Each file is read with ``CSVLoader`` using the first encoding from a fixed
    candidate list that loads without raising. The documents returned by the
    loader for every file are concatenated into a single list.

    Args:
        csv_directory: Directory that is searched recursively for ``*.csv``.

    Returns:
        A list with the loaded documents of all files, in sorted file order.
        Files that fail with every candidate encoding are skipped and reported
        on stdout together with the last error that was raised.
    """
    documents = []

    # Sorted so the document order (and any position-based ids derived from
    # it) does not depend on the filesystem's directory listing order.
    csv_files = sorted(Path(csv_directory).glob("**/*.csv"))

    print(f"Found {len(csv_files)} CSV files")

    # latin-1 maps every possible byte, so it never raises a decode error and
    # must come last; anything listed after it would be unreachable.
    encodings_to_try = ["utf-8", "utf-16", "cp1252", "latin-1"]

    for csv_file in csv_files:
        loaded = False
        last_error = None

        for enc in encodings_to_try:
            try:
                loader = CSVLoader(
                    file_path=str(csv_file),
                    encoding=enc,
                    csv_args={
                        "delimiter": ",",
                        "quotechar": '"',
                        "skipinitialspace": True
                    }
                )
                docs = loader.load()
            except Exception as exc:
                # Best effort: remember why this attempt failed and try the
                # next encoding instead of aborting the whole run.
                last_error = exc
                continue

            documents.extend(docs)
            print(f"Loaded {len(docs)} rows from {csv_file.name} ({enc})")
            loaded = True
            break

        if not loaded:
            print(f"⚠️ Skipped file (encoding issue): {csv_file.name}")
            # Surface the underlying failure so non-encoding problems
            # (malformed CSV, permissions, ...) are not hidden.
            print(f"   Last error: {last_error!r}")

    print("Total documents loaded:", len(documents))
    return documents


In [4]:
# Directory (relative to the notebook) that holds the raw CSV files.
csv_directory = "../data/text_files"

# The loaded documents are used as chunks directly, without further
# splitting, so both names refer to the same list.
chunks = documents = process_all_csvs(csv_directory)

print("Total chunks:", len(chunks))


Found 1 CSV files
Loaded 2782 rows from amazon_vfl_reviews.csv (utf-8)
Total documents loaded: 2782
Total chunks: 2782


In [5]:
from sentence_transformers import SentenceTransformer

class EmbeddingManager:
    """Thin wrapper around a ``SentenceTransformer`` model.

    The model is loaded once at construction time and reused for every call
    to :meth:`generate_embeddings`.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        """Load the sentence-transformers model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def generate_embeddings(self, texts: List[str]):
        """Encode ``texts`` and return the result converted via ``.tolist()``.

        An empty (falsy) ``texts`` yields ``[]`` without calling the model.
        """
        if texts:
            vectors = self.model.encode(texts, show_progress_bar=False)
            return vectors.tolist()
        return []


In [6]:
# Use an on-disk client so the index survives kernel restarts.
# NOTE(review): in Chroma >= 0.4, `chromadb.Client(Settings(persist_directory=...))`
# creates an in-memory client unless persistence is explicitly enabled, so
# nothing was written to ./chroma_db; `PersistentClient` is the supported way
# to persist. Confirm against the installed chromadb version.
client = chromadb.PersistentClient(
    path="./chroma_db",
    settings=Settings(anonymized_telemetry=False),
)

# Reuse the collection when it already exists so re-running the cell is safe.
collection = client.get_or_create_collection(
    name="csv_rag_collection"
)


In [7]:
# Embed all chunks and write them to the Chroma collection in batches.
embedding_manager = EmbeddingManager()

# Number of chunks embedded and written per round trip.
BATCH_SIZE = 1000

for i in range(0, len(chunks), BATCH_SIZE):
    batch_docs = chunks[i:i + BATCH_SIZE]

    texts = [doc.page_content for doc in batch_docs]
    metadatas = [doc.metadata for doc in batch_docs]
    # Ids are derived from the global position of each chunk, so they are
    # stable for an unchanged `chunks` list.
    ids = [f"doc_{i+j}" for j in range(len(batch_docs))]

    embeddings = embedding_manager.generate_embeddings(texts)

    if not embeddings:
        print(f"Skipped empty batch at index {i}")
        continue

    # Every text needs exactly one vector and one id before writing.
    assert len(texts) == len(embeddings) == len(ids)

    # upsert instead of add: the ids are positional, so re-running this cell
    # overwrites the existing records instead of colliding with them.
    collection.upsert(
        documents=texts,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids
    )

    print(f"Inserted {i + len(batch_docs)} / {len(chunks)}")

print("Vector database stored successfully")



Inserted 1000 / 2782
Inserted 2000 / 2782
Inserted 2782 / 2782
Vector database stored successfully


In [8]:
def retrieve_documents(query: str, k: int = 4):
    """Return the ``k`` stored chunks most similar to ``query``.

    The query is embedded with the notebook's ``embedding_manager`` — the
    same object that produced the vectors written to ``collection`` — so the
    query and the documents are guaranteed to live in the same vector space,
    rather than relying on Chroma's default embedding function.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of results to request from the collection.

    Returns:
        A list of ``Document`` objects built from the matching texts and
        their metadata, in the order returned by the collection.
    """
    query_embedding = embedding_manager.generate_embeddings([query])

    results = collection.query(
        query_embeddings=query_embedding,
        n_results=k
    )

    # Results are nested per query; only one query is sent, so take index 0.
    # A missing metadata entry is replaced by an empty dict.
    return [
        Document(page_content=text, metadata=meta or {})
        for text, meta in zip(results["documents"][0], results["metadatas"][0])
    ]


In [None]:
import os

from langchain_groq import ChatGroq

# Read the key from the environment so no credential is stored in the
# notebook (and none can leak through version control or sharing).
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this cell."
    )

# temperature=0 keeps the answers as deterministic as the model allows.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    api_key=groq_api_key
)


In [10]:
from langchain_core.prompts import PromptTemplate

# Prompt text with two placeholders: the retrieved context and the question.
# The trailing headings ask the model to fill in a rating and a quality
# verdict after answering.
RAG_PROMPT_TEMPLATE = """
Use the following context to answer the question.
If the answer is not present, say "I don't know".

Context:
{context}

Question:
{question}
Overall Ratings(out of 10):

Overall product quality:

"""

prompt = PromptTemplate(
    template=RAG_PROMPT_TEMPLATE,
    input_variables=["context", "question"],
)


In [11]:
def rag_answer(query: str):
    """Answer ``query`` from retrieved context.

    Retrieves documents for the query, joins their text with blank lines to
    form the context, fills the prompt template, and returns the ``content``
    of the LLM response.
    """
    retrieved = retrieve_documents(query)
    passages = [item.page_content for item in retrieved]

    filled_prompt = prompt.format(
        context="\n\n".join(passages),
        question=query,
    )

    return llm.invoke(filled_prompt).content


In [12]:
# End-to-end check of the pipeline: ask one product-specific question,
# let rag_answer retrieve context and query the LLM, then show the reply.
query = "What is the product quality of Mysore-Whitening-Rejuvenating-Face-Pack  ?"
answer = rag_answer(query)
print(answer)


Based on the given reviews, the overall product quality of Mysore-Whitening-Rejuvenating-Face-Pack is very poor. 

There are two reviews with a rating of 1 out of 5, both of which mention negative experiences with the product. One review mentions that the product is burning on the face and damaged the existing skin tone, while the other review states that the product is "worthless money" and advises not to buy it.

Since there are no positive reviews or ratings, the overall product quality is 0 out of 10.
