<a href="https://colab.research.google.com/github/soso-001/chatbot_q-a/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook depends on.
# faiss-cpu backs the FAISS vector store; python-docx provides the `docx` module.
!pip install faiss-cpu tiktoken python-docx
!pip install langchain-community
# NOTE(review): no visible cell imports `tools` — confirm this install is needed.
!pip install tools
# PyMuPDF provides the `fitz` module used for PDF text extraction.
!pip install -q PyMuPDF
!pip install --upgrade huggingface_hub langchain

In [None]:
import os
import gradio as gr
import requests
import tempfile

import fitz
from docx import Document as DocxDocument
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from google.colab import userdata

from langchain.llms import HuggingFaceHub
from langchain_community.llms import HuggingFaceEndpoint
from huggingface_hub import InferenceClient

from transformers import pipeline
from sentence_transformers import SentenceTransformer
from langchain.schema import Document

from langchain.embeddings.base import Embeddings

In [None]:
# Copy API credentials from the Colab secret store (google.colab.userdata)
# into environment variables for this process.
# NOTE(review): the OpenAI key is read from a secret named 'OPENAPI' —
# confirm that matches the secret name configured in Colab.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAPI')
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HF_TOKEN')

In [None]:
# Shared components, created once and reused for every document that is loaded.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)

# Module-level state: both stay None until a document has been indexed.
retriever = None  # FAISS-backed retriever for the current document
qa_chain = None   # RetrievalQA chain built on top of that retriever

def prepare_qa_chain(text):
    """Index ``text`` and rebuild the module-level retriever and QA chain.

    The text is chunked with the shared ``text_splitter``, embedded into a
    FAISS index using ``embedding_model``, and wrapped in a "stuff"
    RetrievalQA chain driven by ``gpt-3.5-turbo`` at temperature 0. The
    results are stored in the ``retriever`` and ``qa_chain`` globals; the
    function itself returns nothing.
    """
    global qa_chain, retriever

    # Chunk the raw text and embed the chunks into a fresh vector index.
    chunks = text_splitter.create_documents([text])
    retriever = FAISS.from_documents(chunks, embedding_model).as_retriever()

    # Connect the retriever to the chat model; the chain output also carries
    # the source documents.
    chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa_chain = RetrievalQA.from_chain_type(
        retriever=retriever,
        chain_type="stuff",
        llm=chat_model,
        return_source_documents=True,
    )

In [None]:
# 📄 Extract text from local file
def extract_text_from_file(file_path):
    """Return the plain text of a .txt, .pdf or .docx file.

    Args:
        file_path: Path to the file. The format is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text, or ``None`` when the extension is not supported.

    Raises:
        RuntimeError: If the PDF cannot be opened or read. The failure is
            raised rather than returned as a string so that a caller can
            never mistake the error message for document content.
    """
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == ".txt":
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    if ext == ".pdf":
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(file_path) as pdf:
                # Load pages explicitly by index so the text keeps page order.
                return "\n".join(
                    pdf.load_page(page_num).get_text("text")
                    for page_num in range(pdf.page_count)
                )
        except Exception as e:
            raise RuntimeError(f"Error reading PDF: {e}") from e

    if ext == ".docx":
        doc = DocxDocument(file_path)
        return "\n".join(para.text for para in doc.paragraphs)

    return None

# 📁 Handle local upload
def handle_local_file(file):
    """Load an uploaded file and build the QA chain from its text.

    Args:
        file: The value delivered by the upload widget. Accepted forms are a
            path (``str`` or ``os.PathLike``), an object exposing the path as
            ``.name``, or ``None`` when the upload has been cleared.

    Returns:
        A status message describing whether the document was loaded.
    """
    # A cleared upload arrives as None; report it instead of an AttributeError.
    if file is None:
        return "⚠️ No file selected."

    try:
        # Depending on the Gradio version / File `type` setting, the widget
        # hands over either a plain path or a file-like wrapper with `.name`.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name

        text = extract_text_from_file(file_path)
        # Whitespace-only text produces no chunks to index, so treat it as empty.
        if not text or not text.strip():
            return "❌ Unsupported or empty file."
        prepare_qa_chain(text)

        return "✅ File loaded and RAG chain is ready."

    except Exception as e:
        return f"❌ Error: {str(e)}"

# 🧠 Create FAISS retriever
def prepare_retriever(text):
    """Index ``text`` in FAISS and publish it as the global ``retriever``.

    The text is chunked with the shared ``text_splitter`` and embedded with
    ``embedding_model``. Returns a fixed status string once indexing is done.
    """
    global retriever
    chunks = text_splitter.create_documents([text])
    retriever = FAISS.from_documents(chunks, embedding_model).as_retriever()
    return "✅ Document processed and indexed."

def answer_question(question):
    """Answer ``question`` using the QA chain built for the loaded document.

    Args:
        question: The user's question as text.

    Returns:
        The chain's ``"result"`` string, or a warning/error message when no
        document is loaded, the question is blank, or the chain call fails.
    """
    if qa_chain is None:
        return "⚠️ Please load a document first."

    # Skip the LLM round-trip for a blank or whitespace-only question.
    if not question or not question.strip():
        return "⚠️ Please enter a question."

    try:
        result = qa_chain({"query": question})
        return result["result"]
    except Exception as e:
        return f"❌ Error generating answer: {e}"


In [None]:
# Gradio Interface
with gr.Blocks() as demo:
    # The title names the model family the QA chain is actually built with
    # (gpt-3.5-turbo).
    gr.Markdown("## 📚 Document Q&A Chatbot with RAG + GPT-3.5")
    gr.Markdown("Choose a document source, load it, then ask questions!")

    # Only one source is offered, but the radio keeps the layout ready for more.
    source_choice = gr.Radio(
        ["📁 Local File"],
        label="Choose document",
        value="📁 Local File",
        interactive=True
    )

    # Accepted extensions match the label and what extract_text_from_file reads.
    local_file_input = gr.File(
        label="Upload a file (PDF, DOCX, TXT)",
        file_types=[".pdf", ".docx", ".txt"],
        visible=True,
    )
    load_status = gr.Textbox(label="Load Status", interactive=False)

    question_box = gr.Textbox(label="Ask a question about the document")
    answer_box = gr.Textbox(label="Answer", interactive=False)

    def toggle_inputs(choice):
        """Show the upload widget only while the local-file source is selected."""
        # One output component is wired below, so return one update object
        # rather than a 1-tuple.
        return gr.update(visible=(choice == "📁 Local File"))

    source_choice.change(
        fn=toggle_inputs,
        inputs=source_choice,
        outputs=local_file_input
    )

    # Index the document as soon as a file is uploaded or replaced.
    local_file_input.change(fn=handle_local_file, inputs=local_file_input, outputs=load_status)

    # Pressing Enter in the question box runs the QA chain.
    question_box.submit(fn=answer_question, inputs=question_box, outputs=answer_box)

demo.launch()