In [1]:
# Install dependencies
!pip install gradio PyMuPDF sentence-transformers faiss-cpu numpy ibm-watsonx-ai python-dotenv

import fitz  # PyMuPDF
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import gradio as gr

# Sentence-embedding model used for both document chunks and user queries
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Module-level retrieval state: the chunk texts, their embedding matrix and
# the FAISS index built over it. Everything starts out empty.
doc_chunks, embeddings, index = [], None, None

# 1. PDF Upload + Processing
def process_pdfs(files, chunk_size=500, chunk_stride=400):
    """Extract text from the uploaded PDFs, chunk it, embed it and index it.

    On success the module-level ``doc_chunks``, ``embeddings`` and ``index``
    are replaced together, so they always describe the same set of chunks.
    If nothing usable was uploaded, the previous state is left untouched.

    Args:
        files: Uploaded file(s) from the Gradio file component. Each item may
            be a path string or an object exposing the path as ``.name``.
            ``None`` / empty means nothing was uploaded.
        chunk_size: Maximum number of whitespace-separated words per chunk.
        chunk_stride: Number of words to advance between chunk starts. With
            the defaults, consecutive chunks overlap by 100 words.

    Returns:
        A human-readable status string for the UI.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_stride`` is not positive.
    """
    import os

    global doc_chunks, embeddings, index

    if not files:
        return "⚠️ Please upload at least one PDF."
    if chunk_size <= 0 or chunk_stride <= 0:
        raise ValueError("chunk_size and chunk_stride must be positive")
    if not isinstance(files, (list, tuple)):
        # Tolerate a single upload being passed without a wrapping list.
        files = [files]

    new_chunks = []
    for f in files:
        # Path-like values are used as-is; anything else is expected to carry
        # its on-disk location in `.name` (tempfile-style upload objects).
        path = f if isinstance(f, (str, os.PathLike)) else getattr(f, "name", f)
        with fitz.open(path) as pdf:
            text = "\n".join(page.get_text("text") for page in pdf)

        # Simple word-window chunking.
        words = text.split()
        for start in range(0, len(words), chunk_stride):
            new_chunks.append(" ".join(words[start:start + chunk_size]))
            if start + chunk_size >= len(words):
                # This window already reached the end of the document; a
                # further window would only repeat words it contains.
                break

    if not new_chunks:
        # E.g. image-only PDFs: there is nothing to embed, and building an
        # index from an empty embedding matrix would fail.
        return "⚠️ No extractable text found in the uploaded PDF(s)."

    # Embed chunks (FAISS expects a contiguous float32 matrix).
    new_embeddings = np.ascontiguousarray(
        embed_model.encode(new_chunks, convert_to_numpy=True), dtype=np.float32
    )

    # Build FAISS index
    new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
    new_index.add(new_embeddings)

    # Publish the new state only once every step has succeeded.
    doc_chunks, embeddings, index = new_chunks, new_embeddings, new_index

    return f"✅ {len(files)} PDF(s) processed with {len(doc_chunks)} chunks."

# 2. Question Answering
def answer_question(query, top_k=3):
    """Return the indexed chunks most relevant to ``query``.

    Args:
        query: The user's question as free text.
        top_k: Maximum number of chunks to retrieve.

    Returns:
        A display string containing the retrieved chunks separated by blank
        lines, or a warning string if no PDFs have been processed yet or the
        query is blank.
    """
    if index is None or not doc_chunks:
        return "⚠️ Please upload PDFs first."
    if not query or not query.strip():
        return "⚠️ Please enter a question."

    # Embed query
    q_emb = embed_model.encode([query], convert_to_numpy=True)

    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with the label -1, which as a Python index would silently pick
    # the last chunk.
    k = max(1, min(top_k, len(doc_chunks)))
    _, neighbor_ids = index.search(q_emb, k=k)

    # Defensive filter in case the index still reports padding / stale labels.
    hits = [doc_chunks[i] for i in neighbor_ids[0] if 0 <= i < len(doc_chunks)]
    context = "\n\n".join(hits)

    # ---- If you have IBM Watsonx credentials, replace this with real LLM call ----
    # For now, we simulate an "answer" by returning the most relevant context
    return f"📖 Based on your PDFs:\n\n{context}"

# 3. Gradio Interface
# Builds the UI: a PDF upload row, a status line, a question row and an
# answer box, then wires the two buttons to the functions defined above.
with gr.Blocks() as demo:
    gr.Markdown("# 📘 StudyMate: AI-Powered PDF Q&A")

    # Upload row: multi-file picker restricted to .pdf plus the button that
    # triggers processing.
    with gr.Row():
        file_upload = gr.File(file_types=[".pdf"], file_count="multiple")
        process_btn = gr.Button("Process PDFs")

    # Shows the status string returned by process_pdfs.
    status = gr.Textbox(label="Status")

    # Question row: free-text query plus the button that triggers retrieval.
    with gr.Row():
        query = gr.Textbox(label="Ask a Question")
        ask_btn = gr.Button("Get Answer")

    # Shows the string returned by answer_question.
    answer_box = gr.Textbox(label="Answer", lines=10)

    # Event wiring: each button passes its input component's value to the
    # handler and writes the handler's return value to the output component.
    process_btn.click(process_pdfs, inputs=[file_upload], outputs=[status])
    ask_btn.click(answer_question, inputs=[query], outputs=[answer_box])

# NOTE: with share=True the recorded cell output shows the app being served on
# a public *.gradio.live URL, so uploaded PDFs are reachable by anyone who has
# the link while it is live.
demo.launch(share=True)

Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting ibm-watsonx-ai
  Downloading ibm_watsonx_ai-1.3.37-py3-none-any.whl.metadata (7.0 kB)
Collecting lomond (from ibm-watsonx-ai)
  Downloading lomond-0.3.3-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting ibm-cos-sdk<2.15.0,>=2.12.0 (from ibm-watsonx-ai)
  Downloading ibm_cos_sdk-2.14.3.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ibm-cos-sdk-core==2.14.3 (from ibm-cos-sdk<2.15.0,>=2.12.0->ibm-watsonx-ai)
  Downloading ibm_cos_sdk_core-2.14.3.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0f7fbecb17fb923751.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


