In [None]:
%pip install PyMuPDF pytesseract faiss-cpu sentence-transformers
%pip install gradio torch
%pip install google-generativeai
%pip install langchain-text-splitters
%pip install nest_asyncio

Collecting PyMuPDF
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m114.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytesseract, PyMuPDF, faiss-cpu
Successfully installed PyMuPDF-1.26.6 faiss-cpu-1.12.0 pytesseract-0.3.13


In [None]:
import os, gc, io, time
import fitz
import pytesseract
from PIL import Image
import numpy as np
import gradio as gr
import google.generativeai as genai
from google.colab import userdata
from sentence_transformers import SentenceTransformer
import faiss
from langchain_text_splitters import RecursiveCharacterTextSplitter

import nest_asyncio
nest_asyncio.apply()

# Configure Gemini API
# The model handle stays None unless every configuration step succeeds, so
# later code can test `gemini_model` to see whether the API is usable.
gemini_model = None
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
    # Use the stable model
    configured_model = genai.GenerativeModel('gemini-2.0-flash')
    print("Gemini API Configured.")
except Exception as config_error:
    print(f"Error configuring API: {config_error}")
else:
    gemini_model = configured_model

# Load Embedding Model
# Sentence-embedding checkpoint used for both document chunks and queries.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

print("Loading embedding model...")
embed_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Embedding model loaded.")

# Advanced Processing Functions
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, with an OCR fallback.

    Each page's text is prefixed with a ``--- PAGE n ---`` marker (1-based).
    Pages whose embedded text layer is empty or whitespace-only are rendered
    to a PNG at 150 dpi and passed through Tesseract OCR instead.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string containing all page sections concatenated in order.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for an unreadable
        file. OCR failures are NOT raised: they are logged and the page
        contributes an empty body (best-effort fallback).
    """
    page_sections = []

    # The context manager closes the document (and its file handle) even if
    # a page fails half-way through.
    with fitz.open(pdf_path) as doc:
        print(f"Extracting text from {doc.page_count} pages...")

        for page_number, page in enumerate(doc, start=1):
            text = page.get_text()
            if not text.strip():
                # OCR Fallback
                try:
                    pix = page.get_pixmap(dpi=150)
                    with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                        text = pytesseract.image_to_string(img)
                except Exception as ocr_error:
                    # Stay best-effort, but report the failure and do not
                    # swallow KeyboardInterrupt/SystemExit like a bare except.
                    print(f"OCR failed on page {page_number}: {ocr_error}")
                    text = ""
            page_sections.append(f"\n--- PAGE {page_number} ---\n{text}")

    # Single join instead of repeated string concatenation.
    return "".join(page_sections)

def smart_chunking(text, chunk_size=1000, chunk_overlap=200):
    """Split text into overlapping chunks suitable for embedding.

    Splitting prefers paragraph breaks, then line breaks, then sentence
    ends, then spaces, and finally falls back to individual characters.

    Args:
        text: The full document text.
        chunk_size: Maximum number of characters per chunk (default 1000).
        chunk_overlap: Number of characters shared between neighbouring
            chunks (default 200).

    Returns:
        A list of ``{"text": <chunk>, "id": <position>}`` dicts, where ``id``
        is the 0-based position of the chunk. Empty or ``None`` input yields
        an empty list.
    """
    if not text:
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""],
    )
    return [
        {"text": piece, "id": position}
        for position, piece in enumerate(splitter.split_text(text))
    ]

def build_vector_store(chunks):
    """Embed the chunks and index them for cosine-similarity search.

    The embeddings are L2-normalised and stored in an inner-product index,
    so inner-product scores equal cosine similarity.

    Args:
        chunks: Non-empty list of dicts, each with a ``"text"`` key.

    Returns:
        ``(index, chunks)``: the populated FAISS index and the same chunk
        list that was passed in (row ``i`` of the index is ``chunks[i]``).

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to index
            and the embedding dimension cannot be determined.
    """
    if not chunks:
        raise ValueError("Cannot build a vector store from an empty chunk list.")

    texts = [c["text"] for c in chunks]
    embeddings = embed_model.encode(texts, convert_to_numpy=True)
    # FAISS operates on C-contiguous float32 matrices; enforce that layout
    # rather than relying on the encoder's default output dtype.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    faiss.normalize_L2(embeddings)  # in-place normalisation

    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks

# Streaming Chat Logic
def process_file(file):
    """Index an uploaded PDF and produce a short summary of it.

    Args:
        file: The value delivered by the upload component. Either a path
            string or an object exposing the path as ``.name`` is accepted
            (which one arrives depends on the Gradio version/configuration).

    Returns:
        ``(summary_text, state)`` where ``state`` is the ``(index, chunks)``
        tuple consumed by ``chat_response``. ``state`` is ``None`` when no
        file was given or the document could not be read or indexed; in that
        case ``summary_text`` carries the user-facing message. If only the
        summary step fails, the state is still returned so chatting works.
    """
    if not file:
        return "Please upload a file.", None

    # Accept both a plain path and a tempfile-like wrapper with `.name`.
    pdf_path = getattr(file, "name", file)

    try:
        text = extract_text_from_pdf(pdf_path)
        chunks = smart_chunking(text)
        if not chunks:
            return "No extractable text was found in this PDF.", None
        index, chunk_data = build_vector_store(chunks)
    except Exception as e:
        # Surface a readable message in the summary box instead of letting
        # the exception escape into the UI callback.
        return f"Error reading PDF: {e}", None

    # Generate Summary
    if gemini_model is None:
        summary_text = "Error generating summary: Gemini API is not configured."
    else:
        # Only the first 50,000 characters are sent to bound the prompt size.
        prompt = f"Summarize this document in 5 bullet points:\n\n{text[:50000]}"
        try:
            summary_stream = gemini_model.generate_content(prompt, stream=True)
            summary_text = "".join(part.text for part in summary_stream)
        except Exception as e:
            summary_text = f"Error generating summary: {e}"

    # Return summary and the State (Index + Data)
    return summary_text, (index, chunk_data)

def _format_history(history):
    """Render earlier turns as ``User: ... / Assistant: ...`` lines.

    Supports both history layouts a chat UI may pass: ``(user, assistant)``
    pairs and ``{"role": ..., "content": ...}`` message dicts.
    """
    lines = []
    for turn in history or []:
        if isinstance(turn, dict):
            speaker = "User" if turn.get("role") == "user" else "Assistant"
            lines.append(f"{speaker}: {turn.get('content')}\n")
        else:
            user_msg, bot_msg = turn
            lines.append(f"User: {user_msg}\nAssistant: {bot_msg}\n")
    return "".join(lines)


def chat_response(message, history, pdf_state):
    """Answer a question about the processed PDF, streaming the reply.

    Retrieves the chunks most similar to the question, builds a prompt from
    the retrieved context, the conversation history and the question, and
    streams the model's answer.

    Args:
        message: The user's current question.
        history: Previous turns, as pairs or role/content dicts.
        pdf_state: The ``(index, chunks)`` tuple produced by ``process_file``,
            or a falsy value when no document has been processed yet.

    Yields:
        The accumulated answer text after each streamed piece (each yield
        replaces the previous one in the UI), or a single explanatory
        message when the request cannot be served.
    """
    if not pdf_state:
        yield "Please upload and process a PDF first."
        return

    if gemini_model is None:
        yield "Error: Gemini API is not configured."
        return

    index, chunks = pdf_state
    if not chunks:
        yield "Please upload and process a PDF first."
        return

    # 1. Retrieve Context
    q_emb = embed_model.encode([message], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    # Never ask for more neighbours than exist: FAISS pads missing results
    # with id -1, which as a Python index would silently select the LAST chunk.
    top_k = min(5, len(chunks))
    _, neighbor_ids = index.search(q_emb, k=top_k)
    hits = [int(i) for i in neighbor_ids[0] if 0 <= i < len(chunks)]

    context_text = "\n\n".join(chunks[i]["text"] for i in hits)

    # 2. Format History for Gemini (Memory)
    history_context = _format_history(history)

    # 3. Create Prompt
    system_prompt = f"""
    You are a helpful PDF assistant. Use the context below to answer the user's question.
    If the answer is not in the context, say you don't know.

    CONTEXT FROM PDF:
    {context_text}

    CONVERSATION HISTORY:
    {history_context}

    CURRENT QUESTION:
    {message}
    """

    # 4. Stream Response
    try:
        response_stream = gemini_model.generate_content(system_prompt, stream=True)
        partial_text = ""
        for chunk in response_stream:
            partial_text += chunk.text
            yield partial_text
            time.sleep(0.05)  # small pause between UI updates

    except Exception as e:
        # Rate-limit errors are recognised by the HTTP status in the message.
        if "429" in str(e):
            yield "⚠️ Rate limit exceeded. Please wait 30 seconds."
        else:
            yield f"Error: {str(e)}"

# UI (ChatInterface)
# Two-column layout: the left column uploads and processes a PDF and shows its
# summary; the right column hosts the chat about the processed document.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## AI PDF Analyst")

    with gr.Row():
        with gr.Column(scale=1):
            pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            process_btn = gr.Button("Process Document", variant="primary")
            summary_box = gr.Textbox(label="Document Summary", lines=8)

            # Receives the (index, chunks) tuple returned by process_file and
            # is handed to chat_response as its third argument.
            pdf_state = gr.State()

        with gr.Column(scale=2):
            # chat_response is a generator, so replies are streamed; pdf_state
            # is supplied alongside the message and history on every call.
            chatbot = gr.ChatInterface(
                fn=chat_response,
                additional_inputs=[pdf_state],
                examples=[
                    ["What is the main conclusion?"],
                    ["Summarize page 1"],
                    ["Explain the methodology"]
                ],
                title="Chat with your PDF"
            )

    # Connect the Process Button
    # process_file returns (summary_text, state): the first value fills the
    # summary box, the second is stored in pdf_state for the chat.
    process_btn.click(
        fn=process_file,
        inputs=[pdf_input],
        outputs=[summary_box, pdf_state]
    )

print("Launching App...")
# share=True serves the app on a public URL; debug=True keeps this cell
# running so errors and logs stay visible in the notebook output.
demo.launch(share=True, debug=True)

✅ Gemini API Configured.
⏳ Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded.


  self.chatbot = Chatbot(


Launching App...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://37785a56cfd14a211a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Extracting text from 3 pages...
