<a href="https://colab.research.google.com/github/siddhu1430/siddhu1430/blob/main/Testapp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
# 📌 Step 1: Install dependencies
!pip install gradio faiss-cpu sentence-transformers transformers python-docx

# 📌 Step 2: Import libraries
import os
import faiss
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from PyPDF2 import PdfReader
from docx import Document

# 📌 Step 3: Load Embedding Model + QA Model
# Model identifiers are kept in one place so they are easy to swap.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
QA_MODEL_NAME = "google/flan-t5-small"

# Sentence encoder used for both document chunks and incoming questions.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

# Text-to-text generation pipeline that writes the final answer.
qa_model = pipeline(task="text2text-generation", model=QA_MODEL_NAME)

# 📌 Step 4: Document Loader
def load_documents(files):
    """Read the text content of uploaded .txt, .pdf and .docx files.

    Args:
        files: Iterable of uploaded files. Each item may be a path (``str`` or
            ``os.PathLike``) or an object that exposes the path through a
            ``name`` attribute. ``None`` or an empty iterable means "no files".

    Returns:
        list[str]: One string per supported file, in input order. Files with
        any other extension are skipped.
    """
    texts = []
    for file in files or []:
        # Accept plain paths as well as upload objects carrying the path in `.name`.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name

        # Match the extension case-insensitively so e.g. "REPORT.PDF" is handled.
        lowered = path.lower()
        if lowered.endswith(".txt"):
            with open(path, "r", encoding="utf-8") as f:
                texts.append(f.read())
        elif lowered.endswith(".pdf"):
            pdf = PdfReader(path)
            # Guard against a page yielding no text (None) instead of a string.
            texts.append(
                "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
            )
        elif lowered.endswith(".docx"):
            doc = Document(path)
            texts.append("\n".join(para.text for para in doc.paragraphs))
    return texts

# 📌 Step 5: Build FAISS Vector Store
def build_faiss_index(texts, chunk_size=500):
    """Split texts into fixed-size character chunks and index their embeddings.

    Args:
        texts: Iterable of document strings.
        chunk_size: Maximum number of characters per chunk (default 500).

    Returns:
        tuple: ``(index, chunks)`` where ``index`` is a ``faiss.IndexFlatL2``
        holding one embedding per chunk and ``chunks`` is the list of chunk
        strings in the same order as the vectors were added.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``texts`` contains
            no characters to index.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = [
        text[start:start + chunk_size]
        for text in texts
        for start in range(0, len(text), chunk_size)
    ]
    # Without this check an empty upload fails later with an opaque IndexError
    # when the embedding dimension is read.
    if not chunks:
        raise ValueError("No text found to index; the documents are empty or unsupported.")

    embeddings = np.asarray(embed_model.encode(chunks), dtype="float32")
    dim = embeddings.shape[1]

    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    return index, chunks

# 📌 Step 6: Query Function (with sources)
# Running log of formatted "Q: ... / A: ..." entries for the current document set.
# answer_question appends to it, upload_files rebinds it to a fresh list, and
# download_history writes it to disk.
chat_history = []  # store conversation globally

def answer_question(question, index, chunks, top_k=3):
    """Answer a question from the most similar indexed chunks.

    Embeds the question, retrieves up to ``top_k`` nearest chunks from
    ``index``, asks the QA model to answer from that context, and appends the
    exchange to the module-level ``chat_history``.

    Args:
        question: The user's question.
        index: Index object whose ``search(vectors, k=...)`` returns
            ``(distances, ids)``; the ids are positions into ``chunks``.
        chunks: Chunk strings in the order they were added to ``index``.
        top_k: Maximum number of chunks to retrieve (default 3).

    Returns:
        str: The generated answer followed by a "Sources" list showing the
        first 200 characters of each retrieved chunk.
    """
    # Never ask for more neighbours than there are chunks (but at least one).
    k = max(1, min(top_k, len(chunks)))
    q_emb = np.asarray(embed_model.encode([question]), dtype="float32")
    _, neighbor_ids = index.search(q_emb, k=k)
    # FAISS pads missing results with -1; skipping them keeps chunks[-1] from
    # being picked up as a bogus hit.
    retrieved = [chunks[i] for i in neighbor_ids[0] if i >= 0]

    context = " ".join(retrieved)
    prompt = f"Answer the question based only on the context:\nContext: {context}\n\nQuestion: {question}\nAnswer:"
    result = qa_model(prompt, max_length=200)[0]["generated_text"]

    # Save to history with sources; the ellipsis marks only previews that were cut.
    previews = [
        f"- {chunk[:200]}" + ("..." if len(chunk) > 200 else "")
        for chunk in retrieved
    ]
    sources = "\n\n🔎 Sources:\n" + "\n".join(previews)
    full_answer = result + sources
    chat_history.append(f"Q: {question}\nA: {full_answer}\n{'-'*40}\n")
    return full_answer

# Globals
# FAISS index and the matching list of text chunks for the processed upload.
# Both are assigned by upload_files; chat_fn treats a None index as
# "no documents yet".
global_index, global_chunks = None, None

# 📌 Step 7: Gradio Interface
def upload_files(files):
    """Load the uploaded files, rebuild the search index and reset the chat log.

    Args:
        files: The value of the upload widget; ``None`` or an empty list when
            nothing has been selected.

    Returns:
        str: A status message for the UI. A warning is returned, and the
        existing index and chat history are left untouched, when no files were
        supplied or no text could be extracted from them.
    """
    global global_index, global_chunks, chat_history
    # Pressing the button with nothing selected should report, not crash.
    if not files:
        return "⚠️ No files selected. Please upload at least one document."

    texts = load_documents(files)
    # Nothing to embed (unsupported types, empty or image-only documents).
    if not any(text.strip() for text in texts):
        return "⚠️ No readable text found in the uploaded files (supported: .pdf, .docx, .txt)."

    global_index, global_chunks = build_faiss_index(texts)
    chat_history = []  # reset history when new docs uploaded
    return "✅ Documents uploaded and indexed! You can now ask questions."

def chat_fn(message, history):
    """Chat callback: answer ``message`` from the indexed documents.

    ``history`` is accepted because the chat interface passes it, but it is
    not used. If no index has been built yet, a prompt to upload documents is
    returned instead of an answer.
    """
    index_ready = global_index is not None
    if index_ready:
        return answer_question(message, global_index, global_chunks)
    return "⚠️ Please upload documents first."

# 📌 Step 8: Download Chat History
def download_history():
    """Dump the Q&A log to ``chat_history.txt`` and return that file name.

    The file is written in the current working directory and is overwritten
    on every call.
    """
    out_name = "chat_history.txt"
    with open(out_name, mode="w", encoding="utf-8") as log_file:
        # Entries already carry their own trailing newlines and separators.
        log_file.write("".join(chat_history))
    return out_name

# Assemble the two-tab UI: one tab to upload and index documents, one to chat
# with them and export the Q&A log.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Internal Docs Q&A Agent\n*Turning scattered docs into instant answers.*")

    with gr.Tab("📂 Upload Docs"):
        # Multi-file picker restricted to the extensions load_documents handles.
        file_upload = gr.File(file_types=[".pdf", ".docx", ".txt"],
                              file_count="multiple",
                              type="filepath", # Changed 'file' to 'filepath'
                              label="Upload Documents")
        upload_button = gr.Button("Process Documents")
        output_status = gr.Textbox(label="Status")

    with gr.Tab("💬 Ask Questions"):
        # chat_fn answers each message from the module-level index.
        chatbot = gr.ChatInterface(chat_fn, title="Ask Your Docs")
        download_btn = gr.Button("⬇️ Download Chat History")
        file_output = gr.File(label="Download Q&A Log")
        # download_history returns a file path, which is handed to the File output.
        download_btn.click(fn=download_history, inputs=[], outputs=[file_output])

    # Indexing runs only when the button is pressed; the message returned by
    # upload_files is shown in the Status textbox.
    upload_button.click(fn=upload_files, inputs=[file_upload], outputs=[output_status])

# Start the app. NOTE(review): the recorded cell output suggests passing
# debug=True to launch() to surface errors when running in Colab.
demo.launch()



Device set to use cpu
  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://11c9e37efb628ba320.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [44]:
# Install PyPDF2 in a separate cell
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/232.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
