In [None]:
# ============================================================
# PROJECT OVERVIEW — Fictional Retail Co. RAG Demo
# ============================================================
# This project builds a simple Retrieval-Augmented Generation (RAG) system
# using only free and open-source tools.
#
# RAG means we first "retrieve" the most relevant text from documents,
# and then "generate" an answer using a language model.
#
# Example:
# Fictional Retail Co. has internal documents.
# We want to ask questions like:
#   - What is the return policy?
#   - How can customers reach support?
# and get accurate answers from those documents.
#
# ============================================================
# SIMPLE EXPLANATION OF EACH STEP
# ============================================================
#
# Step 1 — Install the Tools
# We start by installing all the computer “tools” (Python libraries) we’ll need.
# These include:
# - Transformers and Sentence-Transformers → for understanding and generating text.
# - FAISS → a super-fast search engine for finding similar text.
# - PyPDF2 → to read PDF files.
# - Gradio → to build a simple chatbot interface.
#
# Think of this step as putting all your ingredients on the kitchen counter before cooking.
#
# Step 2 — Import the Tools
# Now that everything is installed, we tell Python we want to use them in our code using "import".
# We also set up where our data (PDFs) is stored — in a folder called /content/fictional_retail_docs.
#
# Step 3 — Read & Split the Documents
# We open each PDF and extract the text.
# Then we split the big text into smaller chunks (like cutting a large paragraph into smaller paragraphs).
#
# Why? Because the model can’t read super long text at once — it’s like giving it a few sentences at a time to focus better.
# We use:
#   - chunk_size = 1000 characters (each piece)
#   - overlap = 200 characters (neighbouring chunks share this much text, so a sentence cut at a chunk boundary still appears whole in one of them)
#
# Step 4 — Turn Text into Numbers (Embeddings) & Build a Search Index
# Computers don’t understand words directly — they understand numbers.
# So, we use a model called all-MiniLM-L12-v2 to convert each text chunk into a set of numbers (called embeddings).
# Then we store all these embeddings in FAISS, a kind of searchable “brain” that can quickly find which text chunks
# are most similar to a question.
#
# Step 5 — Load the Answer Generator (FLAN-T5)
# Now we load a text generation model — flan-t5-base — from Google.
# This model can read a question and some context, and then write a short, clear answer.
#
# Step 6 — Ask Questions Safely (RAG Query Function)
# Here we connect everything together.
# When we ask a question:
#   1. The system searches FAISS to find the most relevant chunks (top-k).
#   2. It limits how much text is passed to the model (to avoid errors).
#   3. It gives this text to FLAN-T5 and asks it to generate an answer.
#
# Step 7 — Test the System
# We test it with a few sample questions — one from each file.
# This helps confirm that the model is reading the right chunks and giving sensible answers.
# It also prints which PDF files the retrieved chunks came from, so you can check the sources behind each answer.
#
# Step 8 — Make It Interactive with Gradio
# Finally, we build a simple chat interface where anyone can type a question and get an answer.
# This makes your project more user-friendly — like a mini AI chatbot for Fictional Retail Co.
#
# ============================================================
# SYSTEM FLOW
# ============================================================
# 1. Load and split documents
# 2. Turn text into embeddings
# 3. Store embeddings in FAISS
# 4. When user asks a question:
#       → Find most similar text pieces
#       → Give them to the model
#       → Generate and return the answer
# 5. Optional: Ask more questions via the Gradio interface
#
# ============================================================
# TOOLS USED (All Free)
# ============================================================
# - SentenceTransformer: all-MiniLM-L12-v2 (for embeddings)
# - Hugging Face Model: google/flan-t5-base (for generating answers)
# - FAISS: for searching similar text
# - PyPDF2: for reading PDFs
# - Gradio: for the chatbot interface
#
# Everything in this project is open-source and 100% free.
# No paid API keys or cloud accounts are needed.
# ============================================================


# =========================
# Step 1: Install Libraries
# =========================
!pip install --quiet sentence-transformers transformers faiss-cpu PyPDF2 gradio

# =========================
# Step 2: Import Libraries
# =========================
import os
import faiss
import numpy as np
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import gradio as gr

# Folder containing your documents.
# Step 3 lists this directory and reads every file whose name ends in ".pdf".
# The folder must exist before Step 3 runs; otherwise the run stops with a
# FileNotFoundError.
folder_path = "/content/fictional_retail_docs"

# =========================
# Step 3: Read and Split PDFs
# =========================
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into fixed-size character chunks that overlap.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of ``text`` in document order. Every chunk except
        possibly the last has exactly ``chunk_size`` characters. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A step of zero (or less) would never advance through the text.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    for start in range(0, text_len, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_len:
            # This chunk already reaches the end of the text; any further
            # chunk would only repeat part of its tail.
            break
    return chunks

# Parallel lists: all_chunks[i] is a piece of text and file_names[i] is the
# PDF it was read from.
all_chunks = []
file_names = []

# Fail early with an actionable message instead of a bare listdir error.
if not os.path.isdir(folder_path):
    raise FileNotFoundError(
        f"Document folder not found: '{folder_path}'. "
        "Create it and add the PDF files before running this step."
    )

# os.listdir returns entries in arbitrary order; sorting keeps chunk ids
# reproducible between runs. Only PDFs are kept so the summary count is right.
pdf_files = sorted(
    name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
)

for filename in pdf_files:
    pdf = PdfReader(os.path.join(folder_path, filename))
    # `or ""` covers pages for which extract_text() yields nothing; each
    # page's stripped text is followed by a single space separator.
    text = "".join(
        (page.extract_text() or "").strip() + " " for page in pdf.pages
    )
    chunks = chunk_text(text)
    all_chunks.extend(chunks)
    file_names.extend([filename] * len(chunks))

# Report the number of PDFs actually read, not every entry in the folder.
print(f"Loaded {len(all_chunks)} text chunks from {len(pdf_files)} PDFs.")

# =========================
# Step 4: Create Embeddings and Build FAISS Index
# =========================
embed_model = SentenceTransformer("all-MiniLM-L12-v2")

# With no chunks there is no embedding dimension to read, so stop here with a
# clear message instead of failing on embeddings.shape[1].
if not all_chunks:
    raise ValueError(
        "No text chunks to index. Check that the document folder contains "
        "PDF files with extractable text."
    )

embeddings = embed_model.encode(all_chunks, convert_to_numpy=True, show_progress_bar=True)
# FAISS works on contiguous float32 matrices; make that explicit rather than
# relying on whatever dtype/layout the encoder happens to return.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# One row per chunk, one column per embedding dimension.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index created and populated with document chunks.")

# =========================
# Step 5: Load the Answer Generation Model
# =========================
# Tokenizer and model are loaded from the same checkpoint id, then wrapped in
# a text2text-generation pipeline used by the query function.
flan_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(flan_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(flan_checkpoint)
gen_pipeline = pipeline(
    task="text2text-generation",
    tokenizer=tokenizer,
    model=model,
)

# =========================
# Step 6: Define the RAG Query Function
# =========================
def rag_query(query, k=2, max_new_tokens=200):
    """Retrieve the top-k most similar chunks and generate an answer from them.

    Args:
        query: The user's question as a string.
        k: Number of chunks to retrieve. Clamped to the range
            ``1 .. index.ntotal`` so the search never asks for more
            neighbours than the index holds.
        max_new_tokens: Upper bound on generated tokens, passed to the
            generation pipeline.

    Returns:
        A dict with two keys: ``"answer"`` (the generated text) and
        ``"sources"`` (the file name of each retrieved chunk, in ranking
        order). If nothing could be retrieved, ``"sources"`` is empty and
        ``"answer"`` says so.
    """
    k = max(1, min(k, index.ntotal))
    query_emb = embed_model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_emb, k)

    # FAISS pads missing neighbours with -1; used as a Python list index that
    # would silently select the *last* chunk, so drop those entries.
    hits = [int(i) for i in indices[0] if i >= 0]
    if not hits:
        return {"answer": "No relevant documents were found.", "sources": []}

    retrieved_chunks = [all_chunks[i] for i in hits]
    sources = [file_names[i] for i in hits]

    # Limit total context size to avoid long input issues.
    context = " ".join(retrieved_chunks)[:3500]

    # Build the prompt for FLAN-T5.
    prompt = f"Answer the question based only on the context below:\n\nContext:\n{context}\n\nQuestion: {query}"
    # do_sample=False requests deterministic (non-sampled) decoding.
    response = gen_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]['generated_text']

    return {"answer": response, "sources": sources}

# =========================
# Step 7: Test the System
# =========================
# One smoke-test question per topic; keys are only used as labels in the output.
sample_questions = dict([
    ("Returns", "What is the return policy?"),
    ("Warranty", "How long is the warranty period?"),
    ("Customer Support", "How can customers contact support?"),
    ("Loyalty", "How does the loyalty program work?"),
])

print("\nTesting RAG system...\n")
for topic in sample_questions:
    question = sample_questions[topic]
    result = rag_query(question)
    # Emit the four report lines (question, answer, sources, separator) in one
    # write; the printed text is the same as printing them one by one.
    report_lines = [
        f"Question ({topic}): {question}",
        f"Answer: {result['answer']}",
        f"Source Documents: {result['sources']}",
        "-" * 80,
    ]
    print("\n".join(report_lines))

# =========================
# Step 8: Gradio Interface
# =========================
def ask_rag(query):
    """Answer a question for the Gradio UI and list the source documents.

    Args:
        query: The text typed by the user.

    Returns:
        A single display string containing the generated answer followed by
        the distinct source file names, or a short prompt to enter a question
        when ``query`` is empty or whitespace.
    """
    # Skip retrieval and generation entirely for a blank submission.
    if not query or not query.strip():
        return "Please enter a question."

    result = rag_query(query)
    answer = result["answer"]
    # dict.fromkeys removes duplicates while keeping retrieval order; a set
    # would list the files in an arbitrary order that can differ between runs.
    source = ", ".join(dict.fromkeys(result["sources"]))
    return f"Answer: {answer}\n\nSource Documents: {source}"

# Wrap ask_rag in a one-textbox-in, one-textbox-out web form.
demo = gr.Interface(
    fn=ask_rag,
    inputs="text",
    outputs="text",
    title="Fictional Retail Co. RAG Assistant",
)
# share=False is passed explicitly when starting the app.
demo.launch(share=False)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h

FileNotFoundError: [Errno 2] No such file or directory: '/content/fictional_retail_docs'