<a href="https://colab.research.google.com/github/rajan-sharma-in/GDG/blob/main/college-projectipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1 — Install
# - Installs libraries for PDF parsing, embeddings, FAISS, UI, and Gemini (optional)
%pip install -q --upgrade sentence-transformers faiss-cpu gradio PyMuPDF google-generativeai
%pip install -q "numpy==2.0.2" "pandas==2.2.2"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.7/55.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Cell 2 — Imports
# - Centralizes dependencies
import os, json, re, time, textwrap, hashlib
from pathlib import Path

import numpy as np
import pandas as pd
import faiss
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer


In [4]:
# Cell 3 — PDF Upload + Folder Scan
# - Ensures /content/pdfs exists
# - Lists PDFs present
# - Helper to upload new PDFs into the folder
PDF_DIR = Path("/content/pdfs")
CACHE_DIR = Path("/content/index_cache")
PDF_DIR.mkdir(parents=True, exist_ok=True)
CACHE_DIR.mkdir(parents=True, exist_ok=True)

def list_pdfs():
    pdfs = sorted([p for p in PDF_DIR.glob("*.pdf")])
    if pdfs:
        print(f"Found {len(pdfs)} PDFs in {PDF_DIR}:")
        for p in pdfs:
            print(" -", p.name)
    else:
        print("No PDFs found. Use upload helper below.")
    return pdfs

def upload_pdfs():
    try:
        from google.colab import files
    except ImportError:
        print("Upload helper works only in Colab. Place PDFs manually into /content/pdfs.")
        return
    uploaded = files.upload()
    for name, data in uploaded.items():
        path = PDF_DIR / name
        with open(path, "wb") as f:
            f.write(data)
        print(f"Saved {name} -> {path}")
    list_pdfs()

list_pdfs()
print("Run upload_pdfs() to add more files.")


Found 1 PDFs in /content/pdfs:
 - 16_EBOOK-7th_ed_software_engineering_a_practitioners_approach_by_roger_s._pressman_.pdf
Run upload_pdfs() to add more files.


In [5]:
# Cell 4 — Extraction to DataFrame
# - Reads PDFs with PyMuPDF
# - Keeps per-page text + metadata; flags low-text pages
def extract_pdfs(pdf_paths):
    rows, low_text_pages = [], 0
    for pdf_path in pdf_paths:
        doc = fitz.open(pdf_path)
        for i, page in enumerate(doc, start=1):
            text = page.get_text("text") or ""
            low_text = len(text.strip()) < 30
            if low_text:
                low_text_pages += 1
            rows.append({
                "file_name": pdf_path.name,
                "page_number": i,
                "text": text,
                "low_text": low_text,
            })
    df_pages = pd.DataFrame(rows)
    print(f"Indexed pages: {len(df_pages)} | Low-text pages: {low_text_pages}")
    if low_text_pages > 0:
        print("Warning: Some pages look scanned/low-text. OCR not enabled in v1.")
    return df_pages

pdf_paths = list_pdfs()
pages_df = extract_pdfs(pdf_paths) if pdf_paths else pd.DataFrame(columns=["file_name","page_number","text","low_text"])
pages_df.head()


Found 1 PDFs in /content/pdfs:
 - 16_EBOOK-7th_ed_software_engineering_a_practitioners_approach_by_roger_s._pressman_.pdf
Indexed pages: 930 | Low-text pages: 11


Unnamed: 0,file_name,page_number,text,low_text
0,16_EBOOK-7th_ed_software_engineering_a_practit...,1,Software Engineering\nA Practitioner’s Approac...,False
1,16_EBOOK-7th_ed_software_engineering_a_practit...,2,Software Engineering\nA\nP R A C T I T I O N E...,False
2,16_EBOOK-7th_ed_software_engineering_a_practit...,3,,True
3,16_EBOOK-7th_ed_software_engineering_a_practit...,4,Software Engineering\nA\nP R A C T I T I O N E...,False
4,16_EBOOK-7th_ed_software_engineering_a_practit...,5,SOFTWARE ENGINEERING: A PRACTITIONER’S APPROAC...,False


In [6]:
# Cell 5 — Preprocess + Chunking
# - Cleans whitespace
# - Splits into overlapping character chunks
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 180
MAX_CHUNKS = 15000  # guardrail for very large corpora

def clean_text(t: str) -> str:
    return re.sub(r"\s+", " ", t).strip()

def make_chunks(df_pages, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    chunks = []
    for _, row in df_pages.iterrows():
        text = clean_text(row["text"])
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunk_text = text[start:end]
            chunks.append({
                "chunk_id": f"{row.file_name}__p{row.page_number}__{start}",
                "file_name": row.file_name,
                "page_number": row.page_number,
                "chunk_text": chunk_text,
                "low_text": row.low_text,
            })
            start = end - overlap
    return pd.DataFrame(chunks)

if len(pages_df) == 0:
    chunks_df = pd.DataFrame(columns=["chunk_id","file_name","page_number","chunk_text","low_text"])
else:
    chunks_df = make_chunks(pages_df)
    if len(chunks_df) > MAX_CHUNKS:
        print(f"Too many chunks ({len(chunks_df)}). Consider fewer PDFs or larger chunk_size.")
    print(f"Chunks created: {len(chunks_df)} | Avg chars/chunk: {chunks_df['chunk_text'].str.len().mean():.1f}")

chunks_df.head()


Chunks created: 3502 | Avg chars/chunk: 846.2


Unnamed: 0,chunk_id,file_name,page_number,chunk_text,low_text
0,16_EBOOK-7th_ed_software_engineering_a_practit...,16_EBOOK-7th_ed_software_engineering_a_practit...,1,Software Engineering A Practitioner’s Approach...,False
1,16_EBOOK-7th_ed_software_engineering_a_practit...,16_EBOOK-7th_ed_software_engineering_a_practit...,2,Software Engineering A P R A C T I T I O N E R...,False
2,16_EBOOK-7th_ed_software_engineering_a_practit...,16_EBOOK-7th_ed_software_engineering_a_practit...,4,Software Engineering A P R A C T I T I O N E R...,False
3,16_EBOOK-7th_ed_software_engineering_a_practit...,16_EBOOK-7th_ed_software_engineering_a_practit...,5,SOFTWARE ENGINEERING: A PRACTITIONER’S APPROAC...,False
4,16_EBOOK-7th_ed_software_engineering_a_practit...,16_EBOOK-7th_ed_software_engineering_a_practit...,5,0 DOC/DOC 0 9 ISBN 978–0–07–337597–7 MHID 0–07...,False


In [7]:
# Cell 6 — Embeddings + FAISS Index
# - Loads MiniLM model
# - Builds/loads FAISS cosine index
# - Caches index + metadata to /content/index_cache
MODEL_NAME = "all-MiniLM-L6-v2"
INDEX_PATH = CACHE_DIR / "faiss.index"
META_PATH = CACHE_DIR / "chunks.parquet"
EMB_PATH = CACHE_DIR / "embeddings.npy"
MANIFEST_PATH = CACHE_DIR / "manifest.json"

model = SentenceTransformer(MODEL_NAME)

def current_manifest(pdf_paths):
    return {
        str(p.name): {"mtime": p.stat().st_mtime, "size": p.stat().st_size}
        for p in pdf_paths
    }

def load_cache(pdf_paths):
    if not (INDEX_PATH.exists() and META_PATH.exists() and EMB_PATH.exists() and MANIFEST_PATH.exists()):
        return None
    saved = json.load(open(MANIFEST_PATH))
    if saved != current_manifest(pdf_paths):
        return None
    try:
        df = pd.read_parquet(META_PATH)
        emb = np.load(EMB_PATH)
        idx = faiss.read_index(str(INDEX_PATH))
        print("Loaded cached index.")
        return df, emb, idx
    except Exception as e:
        print("Cache load failed:", e)
        return None

def save_cache(df, emb, idx, pdf_paths):
    df.to_parquet(META_PATH, index=False)
    np.save(EMB_PATH, emb)
    faiss.write_index(idx, str(INDEX_PATH))
    json.dump(current_manifest(pdf_paths), open(MANIFEST_PATH, "w"))
    print("Cache saved to /content/index_cache.")

def build_index(pdf_paths, force=False):
    if len(pdf_paths) == 0:
        print("No PDFs to index.")
        return None, None, None
    if not force:
        cached = load_cache(pdf_paths)
        if cached:
            return cached

    print("Encoding chunks...")
    emb = model.encode(
        chunks_df["chunk_text"].tolist(),
        convert_to_numpy=True,
        show_progress_bar=True
    ).astype("float32")
    faiss.normalize_L2(emb)

    dim = emb.shape[1]
    idx = faiss.IndexFlatIP(dim)
    idx.add(emb)
    save_cache(chunks_df, emb, idx, pdf_paths)
    return chunks_df, emb, idx

chunks_df, emb_matrix, index = build_index(pdf_paths, force=False)
if index:
    print(f"FAISS ready | dim: {emb_matrix.shape[1]} | vectors: {index.ntotal}")
    print(f"Indexed PDFs: {len(pdf_paths)}, Total pages: {len(pages_df)}, Total chunks: {len(chunks_df)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]



special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Encoding chunks...


Batches:   0%|          | 0/110 [00:00<?, ?it/s]

Cache saved to /content/index_cache.
FAISS ready | dim: 384 | vectors: 3502
Indexed PDFs: 1, Total pages: 930, Total chunks: 3502


In [8]:
# Cell 7 — Static Retrieval Demo
# - Simple top-K retrieval to verify index
def confidence_label(score: float) -> str:
    if score >= 0.65: return "High"
    if score >= 0.40: return "Medium"
    return "Low"

def preview(text, n=240):
    return (text[:n] + " ...") if len(text) > n else text

def retrieve_chunks(query: str, k: int = 3, doc_filter: str = "All"):
    if index is None:
        return {"error": "Index not built yet."}
    base_df = chunks_df if doc_filter == "All" else chunks_df[chunks_df.file_name == doc_filter]
    if len(base_df) == 0:
        return {"error": f"No chunks for '{doc_filter}'. Rebuild index or pick All."}

    q = clean_text(query)
    q_emb = model.encode([q], convert_to_numpy=True, show_progress_bar=False).astype("float32")
    faiss.normalize_L2(q_emb)

    # filter-aware search
    if doc_filter == "All":
        k_eff = min(k, len(base_df))
        scores, idxs = index.search(q_emb, k_eff)
        rows = base_df.iloc[idxs[0]]
    else:
        sub = base_df.reset_index(drop=True)
        sub_emb = emb_matrix[sub.index]  # reuse original row positions
        sub_index = faiss.IndexFlatIP(sub_emb.shape[1])
        sub_index.add(sub_emb)
        k_eff = min(k, len(sub))
        scores, idxs = sub_index.search(q_emb, k_eff)
        rows = sub.iloc[idxs[0]]

    items = []
    for r, row in enumerate(rows.itertuples()):
        score = float(scores[0][r])
        items.append({
            "score": score,
            "confidence": confidence_label(score),
            "file_name": row.file_name,
            "page_number": int(row.page_number),
            "chunk_text": row.chunk_text,
            "preview": preview(row.chunk_text),
        })
    return {"error": None, "items": items, "top_score": float(scores[0][0]) if len(items) else 0.0}

# quick smoke test (edit query as needed)
demo = retrieve_chunks("what is this about?", k=3)
demo["items"][:1] if demo["error"] is None else demo


[{'score': 0.3294261395931244,
  'confidence': 'Low',
  'file_name': '16_EBOOK-7th_ed_software_engineering_a_practitioners_approach_by_roger_s._pressman_.pdf',
  'page_number': 29,
  'chunk_text': 'edition of “the book.” Roger S. Pressman xxviii PREFACE Osman Balci, Virginia Tech University Max Fomitchev, Penn State University Jerry (Zeyu) Gao, San Jose State University Guillermo Garcia, Universidad Alfonso X Madrid Pablo Gervas, Universidad Complutense de Madrid SK Jain, National Institute of Technology Hamirpur Saeed Monemi, Cal Poly Pomona Ahmed Salem, California State University Vasudeva Varma, IIIT Hyderabad Appendix 1. Their assistance and comments were invaluable. Special thanks also go to Bruce Maxim of the University of Michigan–Dearborn, who assisted me in developing much of the pedagogical website content that accompanies this book. Finally, I wish to thank the reviewers of the seventh edition: Their in-depth comments and thoughtful criticism have been invaluable.',
  'previ

In [21]:
# Cell 8 — Gemini Setup (Optional)
# - Pulls GOOGLE_API_KEY from env or Colab userdata
# - If missing, RAG falls back to deterministic answers
GEMINI_KEY = os.environ.get("GOOGLE_API_KEY")
if not GEMINI_KEY:
    try:
        from google.colab import userdata
        GEMINI_KEY = userdata.get("GOOGLE_API_KEY")
    except Exception:
        GEMINI_KEY = None

GEMINI_READY = False
gemini_model = None
if GEMINI_KEY:
    import google.generativeai as genai
    genai.configure(api_key=GEMINI_KEY)
    try:
        # Try to use gemini-1.5-flash if available, otherwise list models
        gemini_model = genai.GenerativeModel("gemini-3-flash-preview")
        GEMINI_READY = True
        print("Gemini ready with gemini-3-flash-preview.")
    except Exception as e:
        print(f"gemini-3-flash-preview not available: {e}")
        print("Listing available models...")
        for m in genai.list_models():
            if "generateContent" in m.supported_generation_methods:
                print(m.name)
        print("Please update the model name in Cell 8 to one of the available models if gemini-3-flash-preview is not listed.")
else:
    print("No GOOGLE_API_KEY found. Using deterministic fallback.")

Gemini ready with gemini-3-flash-preview.


In [28]:
# Cell 9 — Static RAG Demo
# - Builds a grounded prompt
# - Uses Gemini if available; otherwise deterministic summary
def grounding_prompt(question, retrieved_items):
    context_lines = []
    for it in retrieved_items:
        cite = f"[{it['file_name']} p.{it['page_number']}]"
        context_lines.append(f"{cite} {it['preview']}")
    context_block = "\n".join(context_lines)
    prompt = f"""
You are an AI study buddy. Answer ONLY using the context below.
If the context is insufficient, say you don't know and ask for a clearer question.
Cite sources like [file.pdf p.12].

Question: {question}

Context:
{context_block}
""".strip()
    return prompt

def rag_answer(question, retrieved_items, use_gemini=True):
    if not retrieved_items:
        return "No context found. Add PDFs or rebuild index.", ""
    prompt = grounding_prompt(question, retrieved_items)
    if use_gemini and GEMINI_READY:
        try:
            resp = gemini_model.generate_content(prompt)
            text = resp.text.strip()
            return text, prompt
        except Exception as e:
            print("Gemini error, using fallback:", e)

    # deterministic fallback: concatenate grounded snippets
    lines = ["(Gemini unavailable) Best matching passages:"]
    for it in retrieved_items:
        cite = f"[{it['file_name']} p.{it['page_number']}]"
        lines.append(f"{cite} {it['preview']}")
    return "\n".join(lines), prompt

# quick RAG test (edit question and k as needed)
sample = retrieve_chunks("give me all chapter names", k=2)
if sample["error"] is None:
    ans, used_prompt = rag_answer("give me a summary", sample["items"], use_gemini=True)
    print(ans)
else:
    print(sample["error"])
    print("\nNote: The above 'Gemini error' occurred because Cell 8 (Gemini Setup) was not yet configured for the correct model at that time. Please re-run Cell 9 now that Cell 8 has been successfully updated and executed.")

The provided text explains that candidate classes can be identified by looking for descriptive nouns within a use-case script, a topic covered in more detail in Chapter 8 [16_EBOOK-7th_ed_software_engineering_a_practitioners_approach_by_roger_s._pressman_.pdf p.168]. Additionally, the text mentions a section titled "FIVE ADVANCED TOPICS" [16_EBOOK-7th_ed_software_engineering_a_practitioners_approach_by_roger_s._pressman_.pdf p.861].


In [23]:
# Cell 10 — Pipeline functions (retrieve + rag + rebuild)
# - Rebuilds index on demand
# - Handles doc filters, top-K, and confidence
def rebuild_index(force=False):
    global pdf_paths, pages_df, chunks_df, emb_matrix, index
    pdf_paths = list_pdfs()
    if len(pdf_paths) == 0:
        return "No PDFs found. Upload files first."
    pages_df = extract_pdfs(pdf_paths)
    chunks_df = make_chunks(pages_df)
    if len(chunks_df) > MAX_CHUNKS:
        return f"Too many chunks ({len(chunks_df)}). Use fewer PDFs."
    chunks_df, emb_matrix, index = build_index(pdf_paths, force=force)
    if index:
        return f"Rebuilt index. PDFs: {len(pdf_paths)}, pages: {len(pages_df)}, chunks: {len(chunks_df)}"
    return "Index rebuild failed."

def pipeline(question, top_k=3, doc_filter="All", use_gemini=False):
    if not question.strip():
        return "Please enter a question.", "", "", ""
    res = retrieve_chunks(question, k=int(top_k), doc_filter=doc_filter)
    if res["error"]:
        return res["error"], "", "", ""
    items = res["items"]
    top = items[0]
    conf = f"{confidence_label(top['score'])} (top score {top['score']:.3f})"
    answer_text, prompt = rag_answer(question, items, use_gemini=use_gemini)
    # transparency block
    lines = []
    for it in items:
        cite = f"[{it['file_name']} p.{it['page_number']}]"
        lines.append(f"Score: {it['score']:.3f} | {it['confidence']} | {cite}\n{it['preview']}")
    transparency = "\n\n".join(lines)
    return answer_text, conf, transparency, prompt

print("Call pipeline(question, top_k, doc_filter, use_gemini) or rebuild_index(force=True).")


Call pipeline(question, top_k, doc_filter, use_gemini) or rebuild_index(force=True).


In [24]:
# Cell 11 — Gradio UI
# - End-to-end interface with rebuild button
import gradio as gr

doc_choices = ["All"] + [p.name for p in list_pdfs()]

def ui_query(question, doc_filter, top_k, use_gemini):
    return pipeline(question, top_k=top_k, doc_filter=doc_filter, use_gemini=use_gemini)

def ui_rebuild():
    msg = rebuild_index(force=True)
    new_choices = ["All"] + [p.name for p in list_pdfs()]
    return gr.Dropdown.update(choices=new_choices, value="All"), msg

with gr.Blocks(title="AI Study Buddy for PDFs") as demo:
    gr.Markdown("### AI Study Buddy for PDFs\nUpload textbooks/papers, rebuild the index, then ask grounded questions.")
    with gr.Row():
        question = gr.Textbox(label="Question", placeholder="Ask about your PDFs...", lines=2)
    with gr.Row():
        doc_filter = gr.Dropdown(label="Document filter", choices=doc_choices, value="All")
        top_k = gr.Slider(1, 8, value=3, step=1, label="Top K passages")
        use_gemini = gr.Checkbox(label="Answer with Gemini (if available)", value=False)
    rebuild_btn = gr.Button("Rebuild Index")
    status_box = gr.Textbox(label="Index status", value="Ready" if index else "Build pending")

    answer = gr.Textbox(label="Final Answer", lines=6)
    confidence = gr.Textbox(label="Confidence", lines=1)
    transparency = gr.Textbox(label="Retrieval Transparency (scores + citations)", lines=12)
    grounding = gr.Textbox(label="Grounding Prompt", lines=10)

    rebuild_btn.click(fn=ui_rebuild, outputs=[doc_filter, status_box])
    question.submit(fn=ui_query, inputs=[question, doc_filter, top_k, use_gemini], outputs=[answer, confidence, transparency, grounding])
    gr.Button("Ask").click(fn=ui_query, inputs=[question, doc_filter, top_k, use_gemini], outputs=[answer, confidence, transparency, grounding])

demo.launch()


Found 1 PDFs in /content/pdfs:
 - 16_EBOOK-7th_ed_software_engineering_a_practitioners_approach_by_roger_s._pressman_.pdf
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d2717b6c51251b5afb.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


