<a href="https://colab.research.google.com/github/samantaaritra/AICodePro/blob/main/PDF_AI_AGENT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install every third-party dependency of this notebook once, up front.
# %pip (rather than !pip) installs into the environment of the running kernel.
# faiss-cpu and PyMuPDF are pinned to the versions this notebook was last run with.
%pip install -q google-genai faiss-cpu==1.12.0 sentence-transformers PyMuPDF==1.26.4 tqdm


Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF, faiss-cpu
Successfully installed PyMuPDF-1.26.4 faiss-cpu-1.12.0


In [None]:
import os
from getpass import getpass

# Never store an API key in a notebook cell: the cell source is published with
# the file. Use the key from the environment when it is already set, otherwise
# prompt for it without echoing the input.
# NOTE(review): the key that used to be hardcoded here has been exposed and
# should be revoked.
if not os.environ.get("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass("Gemini API key: ")


In [None]:
from google.colab import files
# Ask the user to upload the PDF manual that will be indexed.
# The result is kept in `uploaded`; the following cell takes its first key as
# the name of the file to rename to "manual.pdf".
uploaded = files.upload("")   # select your 500-page manual

Saving draft-oasis-e1-manual-04-28-2024.pdf to draft-oasis-e1-manual-04-28-2024.pdf


In [None]:
import shutil

# Rename the uploaded PDF to the fixed path that the ingest step reads.
# A cancelled upload leaves `uploaded` empty; fail with a clear message instead
# of an opaque IndexError.
if not uploaded:
    raise ValueError("No file was uploaded; re-run the upload cell and choose a PDF.")
uploaded_name = next(iter(uploaded))  # first uploaded file, same as before
shutil.move(uploaded_name, "manual.pdf")


'manual.pdf'

In [None]:
# ingest.py (inline in Colab)

import os, re, json, numpy as np
from tqdm import tqdm
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import faiss

PDF_PATH = "manual.pdf"
INDEX_DIR = "index_data"
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
# Chunk sizes are counted in whitespace-separated words, because that is what
# the chunker slices on. all-MiniLM-L6-v2 truncates its input at 256 word
# pieces, so a chunk must stay below that limit or its tail is silently left
# out of the embedding (the previous value of 800 words lost most of each chunk).
CHUNK_SIZE_TOKENS = 160
CHUNK_OVERLAP_TOKENS = 32  # 20% overlap between consecutive chunks
BATCH_SIZE = 64

os.makedirs(INDEX_DIR, exist_ok=True)

def approx_tokens(text):
    """Estimate a token count as one token per four characters, never less than 1."""
    estimate = len(text) // 4
    return estimate if estimate >= 1 else 1

def chunk_text_with_overlap(text, page, heading=None, chunk_size=None, overlap=None):
    """Split `text` into overlapping windows of whitespace-separated words.

    Args:
        text: The text to split.
        page: Page number stored on every chunk.
        heading: Heading stored on every chunk (may be None).
        chunk_size: Maximum number of words per chunk. Defaults to the
            module-level CHUNK_SIZE_TOKENS.
        overlap: Number of words shared by consecutive chunks. Defaults to the
            module-level CHUNK_OVERLAP_TOKENS.

    Returns:
        A list of dicts with keys "text", "page" and "heading"; empty when
        `text` contains no words.

    Raises:
        ValueError: If `chunk_size` is not positive, or `overlap` is negative
            or not smaller than `chunk_size` (the window would never advance).
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE_TOKENS
    if overlap is None:
        overlap = CHUNK_OVERLAP_TOKENS
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append({"text": " ".join(words[start:end]), "page": page, "heading": heading})
        # Stop once a window reaches the end, so no trailing chunk is emitted
        # that consists only of already-covered overlap words.
        if end >= len(words):
            break
    return chunks

def extract_pdf_with_headings(path):
    """Read every non-empty text span of a PDF together with its font size.

    Args:
        path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list with one dict per page: {"page": <1-based page number>,
        "content": [{"text": <stripped span text>, "size": <span font size>}, ...]}.
    """
    pages = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(path) as doc:
        for num, page in enumerate(doc, 1):
            content = []
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no text spans; skip them.
                if "lines" not in block:
                    continue
                for line in block["lines"]:
                    for span in line["spans"]:
                        text = span["text"].strip()
                        if text:
                            content.append({"text": text, "size": span["size"]})
            pages.append({"page": num, "content": content})
    return pages

def segment_headings(pages):
    """Group each page's spans into sections introduced by heading-like spans.

    A span counts as a heading when its font size exceeds 1.2 times the page's
    median span size and it has fewer than 15 words. Text collected so far is
    emitted as a section whenever a new heading starts and at the end of the
    page. Pages without spans are skipped, and the current heading starts as
    None on every page.

    Returns:
        A list of dicts with keys "heading", "page" and "text".
    """
    sections = []
    for page in pages:
        spans = page["content"]
        if len(spans) == 0:
            continue
        threshold = np.median([span["size"] for span in spans]) * 1.2
        heading = None
        body = []
        for span in spans:
            looks_like_heading = span["size"] > threshold and len(span["text"].split()) < 15
            if not looks_like_heading:
                body.append(span["text"])
                continue
            # A new heading closes the section that was being collected.
            if body:
                sections.append({"heading": heading, "page": page["page"], "text": " ".join(body)})
                body = []
            heading = span["text"]
        if body:
            sections.append({"heading": heading, "page": page["page"], "text": " ".join(body)})
    return sections

def build_chunks(secs):
    """Chunk every section and give each chunk an id that is unique in the document.

    Ids have the form "p<page>_c<n>", where n counts chunks per page across all
    of that page's sections. Restarting the counter for every section would
    give two sections on the same page the same id, and any lookup keyed by id
    would then return the wrong chunk.

    Args:
        secs: Iterable of dicts with keys "text", "page" and "heading".

    Returns:
        A flat list of chunk dicts, each extended with an "id" key.
    """
    all_chunks = []
    chunks_on_page = {}  # page number -> number of chunks already emitted for it
    for sec in secs:
        page = sec["page"]
        for chunk in chunk_text_with_overlap(sec["text"], page, sec["heading"]):
            number = chunks_on_page.get(page, 0) + 1
            chunks_on_page[page] = number
            chunk["id"] = f"p{page}_c{number}"
            all_chunks.append(chunk)
    return all_chunks

# Extract text spans, split them into sections, then into overlapping chunks.
pages = extract_pdf_with_headings(PDF_PATH)
sections = segment_headings(pages)
chunks = build_chunks(sections)

# Persist chunk text and metadata, one JSON object per line.
with open(f"{INDEX_DIR}/chunks.jsonl", "w", encoding="utf8") as f:
    for c in chunks:
        f.write(json.dumps(c, ensure_ascii=False) + "\n")

# Embed the chunks batch by batch into a preallocated float32 matrix.
model = SentenceTransformer(EMBED_MODEL_NAME)
dim = model.get_sentence_embedding_dimension()
vecs = np.zeros((len(chunks), dim), dtype="float32")
ids = []
for i in tqdm(range(0, len(chunks), BATCH_SIZE)):
    batch = chunks[i:i + BATCH_SIZE]
    embs = model.encode(
        [x["text"] for x in batch],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    vecs[i:i + len(batch)] = embs
    ids.extend(x["id"] for x in batch)

# The embeddings are normalised, so the inner-product index ranks by cosine similarity.
index = faiss.IndexFlatIP(dim)
index.add(vecs)
faiss.write_index(index, f"{INDEX_DIR}/faiss.index")

# ids[i] is the chunk id of row i in the index; close the file deterministically.
with open(f"{INDEX_DIR}/ids.json", "w", encoding="utf8") as f:
    json.dump(ids, f)
print("✅ Ingest complete")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 6/6 [00:02<00:00,  2.77it/s]

✅ Ingest complete





In [None]:
import os
import json
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from google import genai
from typing import List

# -------------------------
# Config
# -------------------------
INDEX_DIR = "index_data"
EMBED_MODEL = "all-MiniLM-L6-v2"
TOP_K = 5

# -------------------------
# Load FAISS + metadata
# -------------------------
import warnings

index = faiss.read_index(f"{INDEX_DIR}/faiss.index")
with open(f"{INDEX_DIR}/ids.json", "r", encoding="utf8") as f:
    ids = json.load(f)

# Map chunk id -> chunk record, parsing every JSONL line exactly once.
chunks = {}
with open(f"{INDEX_DIR}/chunks.jsonl", "r", encoding="utf8") as f:
    for line in f:
        if not line.strip():
            continue
        record = json.loads(line)
        chunks[record["id"]] = record

# ids[i] names the chunk stored in row i of the index, so both must have the same length.
if index.ntotal != len(ids):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but ids.json lists {len(ids)} ids; re-run the ingest cell."
    )
# Repeated ids collapse into one dict entry, so a search hit could be resolved
# to the text of a different chunk. Surface that instead of hiding it.
if len(chunks) != len(ids):
    warnings.warn(
        f"{len(ids)} ids map to only {len(chunks)} distinct chunks; "
        "chunk ids are not unique and retrieved text may not match the hit."
    )

embedder = SentenceTransformer(EMBED_MODEL)

# -------------------------
# Gemini client
# -------------------------
# An empty value is as unusable as a missing one.
if not os.environ.get("GEMINI_API_KEY"):
    raise ValueError("⚠️ Please set GEMINI_API_KEY first in Colab!")

genai_client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# -------------------------
# Retrieval
# -------------------------
def retrieve(query: str, top_k=TOP_K):
    """Return the indexed chunks most similar to `query`.

    The query is embedded (normalised) with the module-level `embedder` and
    searched against the module-level FAISS `index` for `top_k` neighbours.
    Each hit position is resolved to a chunk id through `ids` and to its record
    through `chunks`; negative positions are skipped.

    Returns:
        A list of dicts with keys "id", "page", "heading", "text" and "score",
        in the order returned by the index search.
    """
    query_vec = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores, positions = index.search(query_vec, top_k)
    hits = []
    for similarity, position in zip(scores[0], positions[0]):
        if position >= 0:
            chunk_id = ids[position]
            record = chunks[chunk_id]
            hits.append(dict(
                id=chunk_id,
                page=record.get("page"),
                heading=record.get("heading"),
                text=record["text"],
                score=float(similarity),
            ))
    return hits

# -------------------------
# Prompt builder
# -------------------------
def build_prompt(query: str, retrieved: List[dict]):
    """Assemble the grounded-answer prompt for the language model.

    Every retrieved chunk becomes a block consisting of a bracketed header
    (id, page, heading) followed by its text; the blocks are separated by a
    "---" rule and placed under CONTEXT, followed by the user's QUESTION.

    Args:
        query: The user's question.
        retrieved: Chunk dicts with keys "id", "page", "heading" and "text".

    Returns:
        The full prompt string.
    """
    blocks = []
    for hit in retrieved:
        header = f"[{hit['id']} | page {hit['page']} | heading: {hit['heading']}]"
        blocks.append(f"{header}\n{hit['text']}")
    context = "\n\n---\n\n".join(blocks)
    return f"""
You are a helpful assistant answering questions about a technical manual.
Only use the CONTEXT below. If the answer is not present, say:
"I don't know based on the manual."

Always cite sources inline using their IDs and page numbers.

CONTEXT:
{context}

QUESTION:
{query}

Answer:
"""

# -------------------------
# Generate with Gemini
# -------------------------
def generate_answer_with_gemini(prompt: str, model: str = "gemini-1.5-flash", temperature: float = 0.2):
    """Send `prompt` to Gemini and return the stripped text of the reply.

    Args:
        prompt: The complete prompt to send.
        model: Gemini model name; override it (for example with
            "gemini-2.0-flash") when the default is not available.
        temperature: Sampling temperature passed in the generation config.

    Returns:
        The reply text without surrounding whitespace, or an empty string when
        the response carries no text (`resp.text` is None or empty), instead of
        failing on `None.strip()`.
    """
    resp = genai_client.models.generate_content(
        model=model,
        contents=prompt,
        config={"temperature": temperature},
    )
    text = resp.text
    return text.strip() if text else ""

# -------------------------
# Full pipeline
# -------------------------
def answer_query(query: str):
    """Run retrieval, prompt construction and generation for one question.

    Returns:
        A dict with "answer" (the generated text) and "retrieved" (the chunks
        the prompt was built from).
    """
    hits = retrieve(query)
    reply = generate_answer_with_gemini(build_prompt(query, hits))
    return dict(answer=reply, retrieved=hits)

# -------------------------
# CLI loop
# -------------------------
if __name__ == "__main__":
    # Interactive question loop: ends on "exit"/"quit", end of input, or Ctrl-C.
    print("Gemini Q&A. Type a question, or 'exit'.")
    while True:
        try:
            q = input("\nQ> ")
        except (EOFError, KeyboardInterrupt):
            # A closed input stream or an interrupt ends the session quietly.
            break
        question = q.strip()
        if question.lower() in ("exit", "quit"):
            break
        if not question:
            # Nothing to ask; do not spend an API call on a blank line.
            continue
        res = answer_query(q)
        print("\n--- ANSWER ---\n")
        print(res["answer"])
        print("\n--- SOURCES ---")
        for r in res["retrieved"]:
            print(f"{r['id']} (page {r['page']}, heading: {r['heading']}) score={r['score']:.3f}")


Gemini Q&A. Type a question, or 'exit'.

Q> what is adequate lighting?

--- ANSWER ---

Adequate lighting is lighting that is sufficient or comfortable for a person with normal vision to see fine detail [p61_c1, p61].

--- SOURCES ---
p61_c1 (page 61, heading: None) score=0.330
p62_c1 (page 62, heading: None) score=0.202
p60_c1 (page 60, heading: None) score=0.196
p59_c1 (page 59, heading: SECTION B: HEARING, SPEECH, AND VISION) score=0.166
p280_c1 (page 280, heading: of home care admission/resumption of care and immunization rates.) score=0.160


In [None]:
# NOTE(review): google-genai is already installed by the first cell, and this
# cell runs after the package has been used; it looks like a leftover.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q google-genai
