In [1]:
%pip install -U --force-reinstall "protobuf==4.25.3"


Collecting protobuf==4.25.3
  Using cached protobuf-4.25.3-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Using cached protobuf-4.25.3-cp310-abi3-win_amd64.whl (413 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 5.29.5
    Uninstalling protobuf-5.29.5:
      Successfully uninstalled protobuf-5.29.5
Successfully installed protobuf-4.25.3
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 4.25.3 which is incompatible.


In [2]:
from __future__ import annotations

import os
from typing import List, Tuple
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U",
                       "google-generativeai", "langchain-google-genai"])

import streamlit as st
from dotenv import load_dotenv
from PyPDF2 import PdfReader

import google.generativeai as genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

ModuleNotFoundError: Module langchain_community.vectorstores not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [None]:
# Configuration & Constants   #

DEFAULT_CHUNK_SIZE = 1200
DEFAULT_CHUNK_OVERLAP = 200
INDEX_DIR = "faiss_index"

load_dotenv()

def _read_google_api_key() -> str | None:
    key = os.getenv("GOOGLE_API_KEY")
    try:
        key = key or st.secrets.get("GOOGLE_API_KEY")
    except Exception:
        pass
    return key

In [None]:
API_KEY = _read_google_api_key()  # no Streamlit UI calls at import time

In [None]:
# Document Ingestion        

def extract_text_from_pdf(file) -> str:
    reader = PdfReader(file)
    parts: List[str] = []
    for page in reader.pages:
        txt = (page.extract_text() or "").replace("\x00", " ")
        parts.append(txt)
    return "\n".join(parts)

In [None]:
def ingest_files(files: List["UploadedFile"]) -> Tuple[List[str], List[dict]]:
    texts: List[str] = []
    metadatas: List[dict] = []
    chunk_size = st.session_state.get("chunk_size", DEFAULT_CHUNK_SIZE)
    chunk_overlap = st.session_state.get("chunk_overlap", DEFAULT_CHUNK_OVERLAP)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    for f in files:
        name = getattr(f, "name", "document")
        if name.lower().endswith(".pdf"):
            raw = extract_text_from_pdf(f)
        else:
            raw = f.read().decode(errors="ignore")

        for i, chunk in enumerate(splitter.split_text(raw)):
            texts.append(chunk)
            metadatas.append({"source": name, "chunk_id": i})

    return texts, metadatas

In [None]:
# Vector Store               

def build_index(texts: List[str], metadatas: List[dict]) -> None:
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vs = FAISS.from_texts(texts, embedding=embeddings, metadatas=metadatas)
    vs.save_local(INDEX_DIR)

def load_index() -> FAISS:
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    return FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)

In [None]:
# QA Chain                  

def make_qa_chain(temperature: float = 0.2):
    prompt = PromptTemplate(
        template=(
            "Answer ONLY with the information in the context.\n"
            "If the answer is not present, reply exactly:\n"
            "\"I don't have that in the provided documents.\"\n\n"
            "Context:\n{context}\n\n"
            "Question:\n{question}\n\n"
            "Answer:"
        ),
        input_variables=["context", "question"],
    )
    llm = ChatGoogleGenerativeAI(model="gemini-pro", temperature=temperature)
    return load_qa_chain(llm, chain_type="stuff", prompt=prompt)

def answer_query(query: str, top_k: int = 4, temperature: float = 0.2) -> Tuple[str, List[dict]]:
    if not os.path.isdir(INDEX_DIR):
        raise FileNotFoundError("No index found. Upload files and click 'Submit & Build Index' first.")
    vector_store = load_index()
    docs = vector_store.similarity_search(query, k=top_k)
    chain = make_qa_chain(temperature=temperature)
    result = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
    answer = result.get("output_text", "").strip()

    sources = []
    for d in docs:
        meta = {"source": d.metadata.get("source"), "chunk_id": d.metadata.get("chunk_id")}
        snippet = d.page_content[:300].replace("\n", " ")
        meta["snippet"] = snippet + ("..." if len(d.page_content) > 300 else "")
        sources.append(meta)

    return answer, sources

In [None]:
# Streamlit UI                #

def sidebar_controls():
    with st.sidebar:
        st.header("‚öôÔ∏è Settings")
        st.session_state["chunk_size"] = st.number_input(
            "Chunk size", min_value=200, max_value=8000, value=DEFAULT_CHUNK_SIZE, step=100
        )
        st.session_state["chunk_overlap"] = st.number_input(
            "Chunk overlap", min_value=0, max_value=4000, value=DEFAULT_CHUNK_OVERLAP, step=50
        )
        top_k = st.slider("Top-K retrieval", min_value=1, max_value=10, value=4)
        temperature = st.slider("Answer temperature", min_value=0.0, max_value=1.0, value=0.2, step=0.1)

        st.markdown("---")
        st.subheader("üìÅ Documents")
        with st.form("ingest"):
            uploads = st.file_uploader(
                "Upload PDF/TXT files and click **Submit & Build Index**",
                type=["pdf", "txt"], accept_multiple_files=True
            )
            submitted = st.form_submit_button("Submit & Build Index")
        if submitted:
            if not API_KEY:
                st.error("Missing GOOGLE_API_KEY. Set it in .env or Streamlit secrets before building the index.")
                st.stop()
            with st.spinner("Indexing‚Ä¶"):
                texts, metas = ingest_files(uploads or [])
                if not texts:
                    st.error("No text extracted. Check your files.")
                    st.stop()
                build_index(texts, metas)
                st.success(f"Index built with {len(texts)} chunks ‚úÖ")

        st.markdown("---")
        if st.button("üóëÔ∏è Clear local index"):
            import shutil
            if os.path.isdir(INDEX_DIR):
                shutil.rmtree(INDEX_DIR, ignore_errors=True)
                st.success("Cleared local index.")
            else:
                st.info("No index to clear.")

        return top_k, temperature

In [None]:
def init_page():
    # IMPORTANT: first Streamlit call
    st.set_page_config(page_title="Multi-Document Chat", page_icon="üìö", layout="centered")
    st.title("Multi-Document Chat üìöü§ñ")
    st.caption("Ask questions across PDFs/TXT with RAG (Gemini + FAISS).")

In [None]:
def main():
    init_page()

    # Configure Gemini AFTER set_page_config so we can safely show warnings/errors
    if API_KEY:
        genai.configure(api_key=API_KEY)
    else:
        st.warning("GOOGLE_API_KEY not found. Set it in a .env or Streamlit secrets to enable embeddings/LLM.")

    top_k, temperature = sidebar_controls()

    st.markdown("### üîé Ask a question")
    query = st.text_input("Type your question about the uploaded documents:")

    if query:
        if not API_KEY:
            st.error("Missing GOOGLE_API_KEY. Set it in .env or Streamlit secrets.")
            st.stop()
        try:
            answer, source_list = answer_query(query, top_k=top_k, temperature=temperature)
        except FileNotFoundError as e:
            st.warning(str(e))
            st.stop()
        except Exception as e:
            st.error(f"Something went wrong while answering: {e}")
            st.stop()

        st.markdown("#### üí¨ Answer")
        st.write(answer or "No answer returned.")

        with st.expander("üìé Sources"):
            for i, src in enumerate(source_list, start=1):
                st.markdown(
                    f"**{i}. {src.get('source','unknown')} ‚Äî chunk {src.get('chunk_id','?')}**\n\n"
                    f"{src.get('snippet','')}"
                )

    st.markdown(
        """
        <hr/>
        <div style="text-align:center; opacity:0.7;">
            ¬© Made by <a href="https://github.com/your-handle" target="_blank">you</a>.
        </div>
        """,
        unsafe_allow_html=True,
    )

if __name__ == "__main__":
    main()