<a href="https://colab.research.google.com/github/rajaranjith/HCL-GenAI-Training/blob/main/Capstone%20project%202%20-%20SilverBadge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Install Python packages
!pip -q install -U langchain langchain-community chromadb sentence-transformers pypdf langchain_text_splitters

# Show versions (optional)
import sys, pkgutil
import importlib.util


def is_installed(package: str) -> bool:
    """Return True when the import system can locate `package`.

    Uses `importlib.util.find_spec` instead of the deprecated
    `pkgutil.get_loader`. Lookup errors are treated as "not installed".
    """
    try:
        return importlib.util.find_spec(package) is not None
    except (ImportError, ValueError):
        return False


print("Python:", sys.version)
for p in ["langchain","langchain_community","chromadb","sentence_transformers","pypdf", "langchain_text_splitters"]:
    print(f"{p}: {'OK' if is_installed(p) else 'MISSING'}")

Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
langchain: OK
langchain_community: OK
chromadb: OK
sentence_transformers: OK
pypdf: OK
langchain_text_splitters: OK


  m = pkgutil.get_loader(p)


In [4]:
%%bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh

# Start the Ollama server in the background
nohup ollama serve > /content/ollama.log 2>&1 &

# Wait until the server answers instead of relying on a fixed sleep:
# poll the API once per second for up to 30 seconds.
ready=0
for attempt in $(seq 1 30); do
  if curl -sf http://localhost:11434/api/tags > /dev/null; then
    ready=1
    break
  fi
  sleep 1
done
if [ "$ready" -ne 1 ]; then
  echo "Ollama server did not become ready; see /content/ollama.log" >&2
  exit 1
fi

# Pull a smaller Llama 3 model to fit Colab constraints.
# If you hit memory/disk issues, use 'llama3.2:1b' instead of '3b'.
ollama pull 'llama3.2:3b'

# (Optional) Verify server is up
curl -s http://localhost:11434/api/tags

{"models":[{"name":"llama3.2:3b","model":"llama3.2:3b","modified_at":"2025-12-10T09:15:58.248855357Z","size":2019393189,"digest":"a80c4f17acd55265feec403c7aef86be0c25983ab279d83f3bcd3abbcb5b8b72","details":{"parent_model":"","format":"gguf","family":"llama","families":["llama"],"parameter_size":"3.2B","quantization_level":"Q4_K_M"}}]}

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
#=#=#                                                                         ##O#-#                                                                        ##O=#  #                                                                      #=#=-#  #                                                                                                                                                0.0%                                                                           0.1%                                                                           0.1%                                                                           0.3%                                                                           0.4%                                                                           0.6%                                                                           0.8%                                                    

In [17]:

import os
from typing import List

from pypdf import PdfReader
from langchain_core.documents import Document # Changed import path
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_classic.chains import RetrievalQA # Changed import path

from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.llms import Ollama

# Configuration
PERSIST_DIR = "/content/chroma_db"          # directory where the Chroma index is stored
HANG_TIME = 10 # Seconds to wait for Ollama to start
OLLAMA_BASE_URL = "http://localhost:11434"  # local Ollama API
# Must name a model that has already been pulled into Ollama. The setup cell
# pulls 'llama3.2:3b'; to use 'llama3.2:1b' instead, pull that model first.
DEFAULT_MODEL = "llama3.2:3b"


def load_document(doc_path: str) -> List[Document]:
    """Load a PDF or text document into LangChain Document objects.

    PDFs yield one Document per page that has non-blank extracted text, with
    1-based ``page`` metadata. ``.txt``/``.md``/``.rtf`` files are read as
    UTF-8 text (undecodable bytes ignored) into a single Document.

    Args:
        doc_path: Path to the file to load.

    Returns:
        A non-empty list of Documents whose metadata carries ``source``
        (and ``page`` for PDFs).

    Raises:
        FileNotFoundError: If ``doc_path`` does not exist.
        ValueError: If the extension is unsupported, or no non-blank text
            could be obtained from the file.
    """
    if not os.path.exists(doc_path):
        raise FileNotFoundError(f"Document not found: {doc_path}")

    ext = os.path.splitext(doc_path)[1].lower()
    docs: List[Document] = []

    if ext == ".pdf":
        reader = PdfReader(doc_path)
        for i, page in enumerate(reader.pages):
            try:
                text = page.extract_text() or ""
            except Exception as e:
                # Best effort: skip the page, but report it rather than hiding the failure.
                print(f"[WARN] Couldn't extract text from page {i + 1} of {doc_path}: {e}")
                text = ""
            if text.strip():
                docs.append(
                    Document(
                        page_content=text,
                        metadata={"source": doc_path, "page": i + 1},
                    )
                )
    elif ext in [".txt", ".md", ".rtf"]:
        with open(doc_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        # Same rule as for PDF pages: blank content is not a usable document.
        if text.strip():
            docs.append(Document(page_content=text, metadata={"source": doc_path}))
    else:
        raise ValueError(f"Unsupported file extension: {ext}")

    if not docs:
        raise ValueError("No extractable text found in the document (is it scanned?).")
    return docs


def chunk_documents(docs: List[Document], chunk_size=1000, chunk_overlap=150) -> List[Document]:
    """Break documents into overlapping chunks for retrieval.

    Args:
        docs: Documents to split.
        chunk_size: Size limit handed to the splitter.
        chunk_overlap: Overlap between neighbouring chunks handed to the splitter.

    Returns:
        The list produced by the splitter's ``split_documents``.
    """
    # Separators given to the splitter: paragraph break, line break, space, empty string.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
    }
    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = text_splitter.split_documents(docs)
    return pieces


def build_or_load_chroma(chunks: List[Document], persist_dir: str = PERSIST_DIR) -> Chroma:
    """Build a new Chroma index or load an existing one from disk.

    If ``persist_dir`` already contains any entries, the index stored there is
    opened and ``chunks`` is NOT used; clear the directory first to re-index.
    Otherwise a new index is built from ``chunks`` in ``persist_dir``.

    Args:
        chunks: Document chunks to embed when a new index is built.
        persist_dir: Directory holding (or to hold) the Chroma index.

    Returns:
        The Chroma vector store.

    Raises:
        ValueError: If a new index must be built but ``chunks`` is empty.
    """
    os.makedirs(persist_dir, exist_ok=True)
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    existing = len(os.listdir(persist_dir)) > 0
    if existing:
        print(f"[INFO] Loading existing Chroma DB from: {persist_dir}")
        vectordb = Chroma(persist_directory=persist_dir, embedding_function=embedder)
    else:
        if not chunks:
            # An empty index would be silently re-loaded on every later run.
            raise ValueError("Cannot build a Chroma index from an empty list of chunks.")
        print(f"[INFO] Building new Chroma DB at: {persist_dir}")
        # No explicit persist() call: with Chroma >= 0.4 a store created with
        # persist_directory is written to disk automatically, and the manual
        # persist() method is deprecated.
        vectordb = Chroma.from_documents(chunks, embedder, persist_directory=persist_dir)
    return vectordb


def get_ollama_llm(model: str = DEFAULT_MODEL, temperature: float = 0.1) -> Ollama:
    """Create the Ollama LLM client pointed at ``OLLAMA_BASE_URL``.

    Args:
        model: Name of the Ollama model to use.
        temperature: Sampling temperature passed to the client.

    Returns:
        The configured ``Ollama`` instance.
    """
    print(f"[INFO] Using Ollama model '{model}' at {OLLAMA_BASE_URL}")
    client = Ollama(
        base_url=OLLAMA_BASE_URL,
        model=model,
        temperature=temperature,
    )
    return client


# --- The RAG chain builder (clean & documented) ---
from langchain_classic.chains import RetrievalQA # Changed import path
from langchain_core.prompts import PromptTemplate

def build_rag_chain(llm, vectordb, k: int = 4) -> RetrievalQA:
    """
    Assemble a RetrievalQA chain that answers questions from retrieved chunks.

    The chain combines:
    - the given LLM,
    - a retriever over ``vectordb`` returning the top ``k`` chunks,
    - a prompt instructing the model to answer only from the supplied context.

    Args:
        llm: The language model instance handed to the chain.
        vectordb: The vector store to retrieve from (must offer ``as_retriever``).
        k (int): Number of chunks to retrieve per query.

    Returns:
        RetrievalQA: The chain, configured to also return its source documents.
    """
    grounded_template = """
You are a helpful assistant. Use ONLY the following context to answer the question.
If the answer is not in the context, say: "I don't see that in the provided document."
Keep answers concise and cite source page(s) when possible.

Context:
{context}

Question:
{question}

Answer:
"""
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=grounded_template,
    )
    top_k_retriever = vectordb.as_retriever(search_kwargs={"k": k})

    # "stuff" places all retrieved documents into a single prompt.
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=top_k_retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": qa_prompt},
    )


def clear_dir(path: str) -> None:
    """Delete everything inside ``path`` while keeping ``path`` itself.

    Used to wipe the index directory before re-indexing. Real subdirectories
    are removed recursively. Files and symbolic links are unlinked, so the
    target of a symlinked directory is never descended into or modified.
    Failures are reported per entry and do not stop the remaining entries
    from being processed.

    Args:
        path: Directory whose contents should be removed. A path that does
            not exist is ignored.
    """
    import shutil  # local import: only this helper needs it

    if not os.path.exists(path):
        return
    for entry in os.listdir(path):
        full = os.path.join(path, entry)
        try:
            if os.path.isdir(full) and not os.path.islink(full):
                shutil.rmtree(full)
            else:
                # Regular files and symlinks (including links to directories).
                os.remove(full)
        except Exception as e:
            print(f"[WARN] Couldn't remove {full}: {e}")

In [18]:

from google.colab import files

print("Please upload a PDF or TXT file...")
uploaded = files.upload()  # UI prompt

if not uploaded:
    raise RuntimeError("No file uploaded.")

# Save first uploaded file (any additional uploads are ignored).
# Write to the exact path that is read back below, so the cell does not
# depend on the current working directory being /content.
fname = next(iter(uploaded))
doc_path = f"/content/{fname}"
with open(doc_path, "wb") as f:
    f.write(uploaded[fname])
print("Saved to:", doc_path)

# Load and chunk
docs = load_document(doc_path)
chunks = chunk_documents(docs)
print(f"[INFO] Loaded {len(docs)} doc(s), created {len(chunks)} chunks.")

Please upload a PDF or TXT file...


KeyboardInterrupt: 

In [None]:
# If you need a fresh index (e.g., after re-uploading/revised doc), uncomment:
# clear_dir(PERSIST_DIR)

# Vector store -> LLM -> retrieval chain, each built from the previous step.
vectordb = build_or_load_chroma(chunks, persist_dir=PERSIST_DIR)
llm = get_ollama_llm(model=DEFAULT_MODEL, temperature=0.1)
rag_chain = build_rag_chain(llm=llm, vectordb=vectordb, k=4)

print("✅ RAG chain ready.")

In [None]:
def ask(question: str):
    """Run `question` through the global `rag_chain` and print the outcome.

    Prints the question, the chain's ``result``, and one line per returned
    source document (its ``source`` metadata, plus the page when present).
    """
    response = rag_chain.invoke({"query": question})

    print("\n--- Question ---")
    print(question)

    print("\n--- Answer ---")
    print(response["result"])

    print("\n--- Sources ---")
    for source_doc in response.get("source_documents", []):
        meta = source_doc.metadata
        src, page = meta.get("source"), meta.get("page")
        # The page is shown only when the metadata has a truthy "page" value.
        print(f"{src} | page {page}" if page else f"{src}")

# Try a few queries:
sample_questions = (
    "Summarize the key points in section 3 and list action items.",
    "What is the project deadline mentioned? Cite the page.",
    "Extract all email addresses found in the document.",
)
for sample_question in sample_questions:
    ask(sample_question)