In [None]:
# Install required packages first:
!pip install gradio pypdf2 sentence-transformers chromadb groq langchain-text-splitters

import gradio as gr
import os
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import chromadb
from groq import Groq
from langchain_text_splitters import RecursiveCharacterTextSplitter

class RAGPDFReader:
    """Retrieval-augmented question answering over a single uploaded PDF.

    The pipeline is: extract text with PyPDF2, split it into overlapping
    chunks, embed the chunks with a SentenceTransformer model, store them in
    a Chroma collection, and answer questions by sending the best-matching
    chunks to a Groq chat model as context.

    Only one PDF is indexed at a time: processing a new PDF replaces the
    previous collection.
    """

    # Name of the single Chroma collection that holds the current PDF.
    COLLECTION_NAME = "pdf_collection"

    def __init__(self, groq_api_key):
        """Initialize RAG system with Groq API and embedding model.

        Args:
            groq_api_key: API key passed to the Groq client.
        """
        self.groq_client = Groq(api_key=groq_api_key)
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.chroma_client = chromadb.Client()
        # Stays None until process_pdf() has indexed a document.
        self.collection = None
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ". ", " ", ""]
        )

    def _read_pdf_text(self, pdf_file):
        """Return the text of every page, each followed by a newline.

        Unlike extract_text_from_pdf(), failures propagate as exceptions so
        callers can tell an error apart from document text.
        """
        reader = PdfReader(pdf_file)
        # `or ''` guards against a page yielding None instead of a string
        # (defensive; presumably only image-only pages on older PyPDF2).
        return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)

    def extract_text_from_pdf(self, pdf_file):
        """Extract text from uploaded PDF.

        Args:
            pdf_file: Path or file object accepted by ``PdfReader``.

        Returns:
            The concatenated page text, or a string starting with
            ``"Error extracting PDF: "`` if reading failed. Kept for
            backward compatibility; process_pdf() uses the raising helper so
            it never confuses document text with an error message.
        """
        try:
            return self._read_pdf_text(pdf_file)
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

    def process_pdf(self, pdf_file):
        """Process PDF: extract text, chunk, embed, and store in vector DB.

        Args:
            pdf_file: The uploaded PDF (path or file object), or None.

        Returns:
            A human-readable status message (success summary or the reason
            processing stopped).
        """
        if pdf_file is None:
            return "Please upload a PDF file first."

        # Use the raising helper rather than sniffing the returned text for
        # an "Error" prefix: a document that itself starts with "Error"
        # must not be mistaken for a failure.
        try:
            full_text = self._read_pdf_text(pdf_file)
        except Exception as e:
            return f"Error extracting PDF: {str(e)}"

        # Split into chunks
        chunks = self.text_splitter.split_text(full_text)

        if not chunks:
            return "No text found in PDF."

        # Create embeddings
        embeddings = self.embedding_model.encode(chunks).tolist()

        # Reset the collection so chunks from a previous PDF never leak into
        # answers about the new one.
        try:
            self.chroma_client.delete_collection(self.COLLECTION_NAME)
        except Exception:
            # Best effort: on the first upload there is nothing to delete.
            # The exact exception class chromadb raises is not pinned here,
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

        self.collection = self.chroma_client.create_collection(self.COLLECTION_NAME)

        # Store chunks with embeddings
        self.collection.add(
            embeddings=embeddings,
            documents=chunks,
            ids=[f"chunk_{i}" for i in range(len(chunks))]
        )

        return f"✓ PDF processed successfully!\n• Total chunks: {len(chunks)}\n• Embeddings created and stored in vector database."

    def query_pdf(self, question, n_results=3):
        """Query the PDF using RAG pipeline.

        Args:
            question: The user's question; None or blank input is rejected.
            n_results: Maximum number of chunks to retrieve as context
                (default 3, the previously hard-coded value).

        Returns:
            The model's answer, or a message explaining why no answer could
            be produced.
        """
        if self.collection is None:
            return "Please upload and process a PDF first."

        if not question or not question.strip():
            return "Please enter a question."

        # Embed the question
        question_embedding = self.embedding_model.encode([question]).tolist()

        # Never ask for more neighbours than there are stored chunks; short
        # PDFs can produce fewer than n_results chunks.
        top_k = max(1, min(n_results, self.collection.count()))

        # Retrieve relevant chunks
        results = self.collection.query(
            query_embeddings=question_embedding,
            n_results=top_k
        )

        # results['documents'] holds one list per query; we sent one query.
        context = "\n\n".join(results['documents'][0])

        # Create prompt for Groq
        system_prompt = """You are a precise PDF document assistant. Answer questions using ONLY the extracted chunks from the uploaded PDF below.\n\nCRITICAL INSTRUCTIONS:\n- Use ONLY the Context provided. No outside knowledge, assumptions, or inventions.\n- If information is absent from Context, respond: "Not found in uploaded PDF."\n- Limit to 2-3 sentences. Quote key phrases directly from context.\n- Focus on accuracy over completeness."""

        user_prompt = f"""Context: {context}\n\nQuestion: {question}\n\nAnswer:"""

        # Query Groq API
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                model="llama-3.3-70b-versatile",
                temperature=0.1,
                max_tokens=500
            )
        except Exception as e:
            return f"Error querying Groq API: {str(e)}"

        return chat_completion.choices[0].message.content

# Build the Gradio UI (the RAG system itself is created when the user clicks "Initialize System")
def create_interface():
    """Create Gradio interface.

    Builds a Blocks app with three steps: initialize the RAG system with a
    Groq API key, upload and process a PDF, then ask questions about it.

    The RAG system is stored in the module-level global ``rag_system``, so
    every browser session served by this process shares the same instance
    and the same indexed PDF.

    Returns:
        The ``gr.Blocks`` app, not yet launched.
    """

    # Read the key once, server-side. It is deliberately NOT used as the
    # textbox's initial value: a component's value is delivered to the
    # browser, so pre-filling would disclose the secret to anyone who can
    # open the page (the app is launched with a public share link).
    default_api_key = os.getenv("GROQ_API_KEY", "")

    def initialize_system(api_key):
        """Create the shared RAGPDFReader; return a status message."""
        global rag_system
        # Fall back to the environment key when the box is left empty.
        api_key = (api_key or "").strip() or default_api_key
        if not api_key:
            return "Please provide a Groq API key."
        try:
            rag_system = RAGPDFReader(api_key)
            return "✓ System initialized successfully!"
        except Exception as e:
            return f"Error initializing: {str(e)}"

    def process_wrapper(pdf_file):
        """Forward the uploaded file to the RAG system, if initialized."""
        if 'rag_system' not in globals():
            return "Please initialize the system with your API key first."
        return rag_system.process_pdf(pdf_file)

    def query_wrapper(question):
        """Forward the question to the RAG system, if initialized."""
        if 'rag_system' not in globals():
            return "Please initialize the system with your API key first."
        return rag_system.query_pdf(question)

    if default_api_key:
        key_placeholder = "Leave blank to use GROQ_API_KEY from the environment"
    else:
        key_placeholder = "Enter your Groq API key"

    # Create Gradio interface
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 📄 RAG PDF Reader with Groq API
            Upload a PDF, process it with embeddings, and ask questions using RAG pipeline.
            """
        )

        with gr.Row():
            api_key_input = gr.Textbox(
                label="Groq API Key",
                type="password",
                placeholder=key_placeholder
            )
            init_btn = gr.Button("Initialize System", variant="primary")

        init_status = gr.Textbox(label="System Status", interactive=False)

        gr.Markdown("---")

        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                process_btn = gr.Button("Process PDF", variant="primary")
                process_status = gr.Textbox(label="Processing Status", interactive=False)

            with gr.Column():
                question_input = gr.Textbox(
                    label="Ask a Question",
                    placeholder="What would you like to know about the document?",
                    lines=3
                )
                query_btn = gr.Button("Get Answer", variant="primary")
                answer_output = gr.Textbox(label="Answer", lines=8, interactive=False)

        gr.Markdown(
            """
            ### How it works:
            1. Enter your Groq API key and initialize the system
            2. Upload a PDF document
            3. Click "Process PDF" to chunk, embed, and store in vector database
            4. Ask questions about the document content
            5. Get accurate answers based only on the PDF content
            """
        )

        # Event handlers
        init_btn.click(initialize_system, inputs=[api_key_input], outputs=[init_status])
        process_btn.click(process_wrapper, inputs=[pdf_input], outputs=[process_status])
        # The button and pressing Enter in the question box run the same query.
        query_btn.click(query_wrapper, inputs=[question_input], outputs=[answer_output])
        question_input.submit(query_wrapper, inputs=[question_input], outputs=[answer_output])

    return demo

# Launch the app when this file is executed directly (this also holds when
# the code runs as a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    # Keep the app in a module-level name so it stays reachable after launch.
    demo = create_interface()
    # share=True requests a temporary public URL in addition to the local server.
    demo.launch(share=True)



  with gr.Blocks(theme=gr.themes.Soft()) as demo:


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://51acafed6dec2802be.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
