<a href="https://colab.research.google.com/github/ray-islam/generativeAI/blob/main/Portfolio_Project_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# =========================
# 0) Minimal installs
# =========================
!pip -q uninstall -y transformers accelerate peft 2>/dev/null
!pip -q install -U pypdf faiss-cpu sentence-transformers

# =========================
# 1) Imports
# =========================
import os, re, textwrap
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple, Optional

from pypdf import PdfReader
import faiss
from sentence_transformers import SentenceTransformer

# =========================
# 2) Helpers
# =========================
@dataclass
class Chunk:
    page: int
    text: str

def clean_text(t: str) -> str:
    t = t.replace("\x00", " ")
    t = re.sub(r"\s+", " ", t).strip()
    return t

def list_pdfs(folder="/content") -> List[str]:
    return [os.path.join(folder, f) for f in os.listdir(folder) if f.lower().endswith(".pdf")]

def extract_pdf_text(pdf_path: str, max_pages: Optional[int] = None) -> List[Tuple[int, str]]:
    reader = PdfReader(pdf_path)
    n = len(reader.pages) if max_pages is None else min(len(reader.pages), max_pages)
    pages = []
    for i in range(n):
        text = reader.pages[i].extract_text() or ""
        text = clean_text(text)
        if text:
            pages.append((i + 1, text))  # 1-indexed page numbers
    return pages

def chunk_pages(pages: List[Tuple[int, str]], chunk_size: int = 900, overlap: int = 150) -> List[Chunk]:
    chunks: List[Chunk] = []
    for page_num, text in pages:
        start = 0
        while start < len(text):
            end = min(len(text), start + chunk_size)
            chunk = clean_text(text[start:end])
            if len(chunk) >= 200:  # drop tiny fragments
                chunks.append(Chunk(page=page_num, text=chunk))
            if end >= len(text):
                break
            start = max(0, end - overlap)
    return chunks

def build_index(chunks: List[Chunk], embed_model_name="sentence-transformers/all-MiniLM-L6-v2"):
    emb_model = SentenceTransformer(embed_model_name)
    texts = [c.text for c in chunks]
    embeddings = emb_model.encode(texts, normalize_embeddings=True, show_progress_bar=True)
    embeddings = np.asarray(embeddings, dtype=np.float32)

    index = faiss.IndexFlatIP(embeddings.shape[1])  # cosine sim via normalized vectors
    index.add(embeddings)
    return emb_model, embeddings, index

def retrieve(question: str, emb_model, index, chunks: List[Chunk], top_k: int = 5):
    q = emb_model.encode([question], normalize_embeddings=True)
    q = np.asarray(q, dtype=np.float32)
    scores, idx = index.search(q, top_k)

    results = []
    for s, i in zip(scores[0], idx[0]):
        if i == -1:
            continue
        results.append((float(s), chunks[int(i)]))
    return results

def pretty_print_results(question: str, results, width: int = 110, preview_chars: int = 900):
    print("="*width)
    print("QUESTION:", question)
    print("="*width)
    print("\nTOP EVIDENCE PASSAGES (with page citations):\n")
    for score, ch in results:
        print(f"--- Page {ch.page} | score={score:.3f} ---")
        snippet = ch.text[:preview_chars]
        print(textwrap.fill(snippet, width=width))
        print()
    pages = sorted({ch.page for _, ch in results})
    print("Evidence pages:", pages)
    print("="*width)

# =========================
# 3) Choose your PDF automatically (or set it manually)
# =========================
pdfs = list_pdfs("/content")
if not pdfs:
    raise FileNotFoundError("No PDFs found in /content. Upload a PDF to Colab, then re-run this cell.")

print("PDFs found in /content:")
for i, p in enumerate(pdfs):
    print(f"[{i}] {os.path.basename(p)}")

PDF_INDEX = 0  # <-- change this number to pick a different PDF
PDF_FILE = pdfs[PDF_INDEX]
print("\nUsing PDF:", PDF_FILE)

# Optional: speed up demos by limiting pages (e.g., 40)
MAX_PAGES = None  # set to 40 for faster demo

# =========================
# 4) Build the RAG store
# =========================
print("\nExtracting text...")
pages = extract_pdf_text(PDF_FILE, max_pages=MAX_PAGES)
print(f"Extracted {len(pages)} text pages")

print("Chunking...")
chunks = chunk_pages(pages, chunk_size=900, overlap=150)
print(f"Created {len(chunks)} chunks")

print("Embedding + indexing (FAISS)...")
emb_model, embeddings, index = build_index(chunks)
print("✅ RAG index ready.")
print(f"Embedding dim: {embeddings.shape[1]}")

# =========================
# 5) Ask questions interactively
# =========================
def ask(question: str, top_k: int = 6):
    results = retrieve(question, emb_model, index, chunks, top_k=top_k)
    pretty_print_results(question, results)
    return results




PDFs found in /content:
[0] supplement16.pdf
[1] SNAPChurning.pdf
[2] gininarrativeexamples5-2-14.pdf
[3] supplement09.pdf
[4] Pew-Philadelphia-Business-Dashboard-Methodology.pdf
[5] supplement21.pdf
[6] hcieducationaattainmentnarrative3558-8-17-ada.pdf
[7] 9789264091504-en.pdf
[8] supplement24.pdf

Using PDF: /content/supplement16.pdf

Extracting text...
Extracted 452 text pages
Chunking...
Created 1904 chunks
Embedding + indexing (FAISS)...


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

✅ RAG index ready.
Embedding dim: 384


In [5]:
# Example questions (edit these):
ask("Find the section that describes the purpose of the Annual Statistical Supplement 2016.", top_k=10)

QUESTION: Find the section that describes the purpose of the Annual Statistical Supplement 2016.

TOP EVIDENCE PASSAGES (with page citations):

--- Page 3 | score=0.545 ---
d statistics can be found in the SSI Annual Statistical Report and the Annual Statistical Report on the Social
Security Disability Insurance Program . The Supplement has been published annually since 1940. Decisions
affecting the future of Social Security are facilitated by the availability of relevant data over a long
period. The data provide a base for research, policy analy- sis, and proposals for changing the programs. In
addition to meeting the Social Security Administration’s information needs, the Supplement strengthens the
agency’s ability to respond to requests for program data from congressional committees, government agencies at
all levels, and the research community. The Supplement is prepared by Social Security Administration staff
from various components throughout the agency and by individuals from ot

[(0.5453441143035889,
  Chunk(page=3, text='d statistics can be found in the SSI Annual Statistical Report and the Annual Statistical Report on the Social Security Disability Insurance Program . The Supplement has been published annually since 1940. Decisions affecting the future of Social Security are facilitated by the availability of relevant data over a long period. The data provide a base for research, policy analy- sis, and proposals for changing the programs. In addition to meeting the Social Security Administration’s information needs, the Supplement strengthens the agency’s ability to respond to requests for program data from congressional committees, government agencies at all levels, and the research community. The Supplement is prepared by Social Security Administration staff from various components throughout the agency and by individuals from other federal agencies. I would like to express my thanks to them for their')),
 (0.5276628136634827,
  Chunk(page=53, text='Annual