In [4]:
!pip install -q langchain langchain-community langchain-text-splitters
!pip install -q langchain-chroma chromadb
!pip install -q langchain-huggingface sentence-transformers
!pip install -q langchain-groq
!pip install -q "unstructured[pdf]" pypdf
!pip install -q gradio
!pip install -q langchain

In [5]:
import os
from pathlib import Path
from typing import Optional, Tuple
import warnings
warnings.filterwarnings('ignore')


from langchain_community.document_loaders import UnstructuredFileLoader, PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain_classic.chains.retrieval_qa.base import RetrievalQA


import gradio as gr

# ---- Storage locations ----
VECTOR_DB_PATH = "/tmp/vector_db"     # passed to Chroma as persist_directory
LOCAL_PDF_PATH = "/tmp/uploaded.pdf"  # not referenced by the cells shown in this notebook

# ---- Text chunking ----
CHUNK_SIZE = 100
CHUNK_OVERLAP = 50

# ---- Models ----
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "llama-3.3-70b-versatile"
LLM_TEMPERATURE = 0.2

# Ensure the scratch directories exist before anything tries to write there.
Path("/tmp").mkdir(parents=True, exist_ok=True)
Path(VECTOR_DB_PATH).mkdir(parents=True, exist_ok=True)

# ---- Shared pipeline state ----
# init_rag_pipeline() assigns both names; process_query() reads qa_chain.
qa_chain = None
initialization_complete = False

In [6]:
from getpass import getpass

# Prompt without echoing the key. Surrounding whitespace (a common
# copy/paste artefact) is stripped so it does not end up in the credential.
GROQ_API_KEY = getpass("üîë Enter your Groq API Key: ").strip()

# Fail here rather than report success for a blank key.
if not GROQ_API_KEY:
    raise ValueError("Groq API key is empty - re-run this cell and enter a valid key.")

# Exported through the environment for the Groq client created later.
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

print("‚úÖ API Key configured successfully!")

üîë Enter your Groq API Key: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
‚úÖ API Key configured successfully!


In [7]:
def init_rag_pipeline(pdf_path: str) -> str:
    """
    Initialize the complete RAG system from a PDF file.

    Steps: load the PDF (UnstructuredFileLoader, falling back to
    PyPDFLoader), split it into chunks, embed the chunks into the Chroma
    store at VECTOR_DB_PATH, and build a RetrievalQA chain on top of it.

    Side effects:
        Resets the module-level ``qa_chain`` / ``initialization_complete``
        at the start and only sets them again on success. The persisted
        Chroma collection is emptied before the new document is indexed.

    Parameters:
        pdf_path: Path to the uploaded PDF file

    Returns:
        Status message indicating success or failure. Errors are reported
        through the returned string; no exception escapes this function.
    """
    global qa_chain, initialization_complete

    # Invalidate the previous pipeline first so a failed run cannot leave a
    # chain that answers from a half-replaced index.
    initialization_complete = False
    qa_chain = None

    try:
        print("üì• Starting RAG Pipeline Initialization...")

        if not pdf_path or not os.path.exists(pdf_path):
            return "‚ùå Error: Invalid PDF file path"

        print("üìÑ Loading PDF document...")
        try:
            loader = UnstructuredFileLoader(pdf_path)
            documents = loader.load()
        except Exception as e:
            print(f"‚ö†Ô∏è UnstructuredFileLoader failed: {e}")
            print("üîÑ Trying PyPDFLoader as fallback...")
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()

        if not documents:
            return "‚ùå Error: No content extracted from PDF"

        print(f"‚úì Document loaded: {len(documents)} pages")

        print("‚úÇÔ∏è Splitting document into chunks...")
        text_splitter = CharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP
        )
        texts = text_splitter.split_documents(documents)

        if not texts:
            return "‚ùå Error: No text chunks created"

        print(f"‚úì Document split into {len(texts)} chunks")

        print("üî¢ Creating embeddings (this may take a minute)...")
        embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL
        )
        print("‚úì Embedding model loaded")

        print("üíæ Creating vector database...")
        # The store at VECTOR_DB_PATH is persistent and from_documents() adds
        # to the existing default collection. Without this reset, chunks from
        # previously processed PDFs would remain retrievable and leak into
        # the answers for the new document.
        Chroma(
            persist_directory=VECTOR_DB_PATH,
            embedding_function=embeddings
        ).delete_collection()

        vectordb = Chroma.from_documents(
            documents=texts,
            embedding=embeddings,
            persist_directory=VECTOR_DB_PATH
        )
        print("‚úì Vector database created and persisted")

        print("üîç Initializing retriever...")
        retriever = vectordb.as_retriever(
            search_kwargs={"k": 3}
        )
        print("‚úì Retriever ready")

        print("ü§ñ Initializing LLM...")
        llm = ChatGroq(
            model=LLM_MODEL,
            temperature=LLM_TEMPERATURE
        )
        print("‚úì LLM initialized")

        print("‚õìÔ∏è Building RAG chain...")
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True
        )

        initialization_complete = True
        print("‚úì RAG Pipeline initialized successfully!")
        return "‚úÖ RAG Pipeline Ready - You can now ask questions!"

    except Exception as e:
        # Report the failure as the status text shown in the UI.
        error_msg = f"‚ùå Error during initialization: {str(e)}"
        print(error_msg)
        return error_msg


def process_query(query: str) -> Tuple[str, str]:
    """
    Answer a question using the currently initialized RAG chain.

    Parameters:
        query: The user's question about the document

    Returns:
        Tuple of (answer, formatted sources). When the question is blank,
        the pipeline is not initialized, or the chain raises, the first
        element is a message describing the problem and the second is "".
    """
    # Guard 1: nothing (or only whitespace) was typed.
    if not (query and query.strip()):
        return "‚ö†Ô∏è Please enter a valid question.", ""

    # Guard 2: no document has been processed yet.
    if qa_chain is None:
        return "‚ùå RAG pipeline not initialized. Please process a document first.", ""

    try:
        print(f"\nüîç Processing query: {query}")

        result = qa_chain.invoke({"query": query})
        reply = result.get("result", "No answer generated")
        citations = format_sources(result.get("source_documents", []))

        print("‚úì Query processed successfully")
    except Exception as e:
        failure = f"‚ùå Error processing query: {str(e)}"
        print(failure)
        return failure, ""

    return reply, citations


def format_sources(sources: list) -> str:
    """
    Format retrieved source documents for display.

    Each source becomes a numbered heading with its page reference followed
    by a block-quoted preview of at most 300 characters (newlines replaced
    by spaces). An ellipsis is appended only when the text was truncated.

    Parameters:
        sources: List of Document objects from retriever (each exposing
            ``page_content`` and ``metadata``)

    Returns:
        Markdown-formatted string with source citations, or
        "No sources retrieved" when the list is empty.
    """
    if not sources:
        return "No sources retrieved"

    preview_limit = 300
    parts = ["### üìÑ Retrieved Sources\n\n"]

    for idx, source in enumerate(sources, 1):
        content = source.page_content
        content_preview = content[:preview_limit].replace("\n", " ")
        # Only signal truncation when something was actually cut off.
        if len(content) > preview_limit:
            content_preview += "..."

        page_info = source.metadata.get('page', 'N/A') if source.metadata else 'N/A'
        # PyPDFLoader stores a zero-based page index; show it one-based.
        if isinstance(page_info, int) and not isinstance(page_info, bool):
            page_info += 1

        parts.append(f"**[Source {idx}]** (Page {page_info})\n")
        parts.append(f"> {content_preview}\n\n")

    return "".join(parts)


# Cell-level confirmation that the pipeline helper definitions above were executed.
print("‚úÖ RAG Pipeline functions defined!")

‚úÖ RAG Pipeline functions defined!


In [13]:
# Stylesheet passed to gr.Blocks(css=...) in create_interface().
# The class names defined here (.header-section, .developer-credit,
# .status-box, .answer-box, .info-box, .footer-credit) are referenced from
# the HTML snippets and elem_classes arguments in create_interface().
# NOTE(review): the CSS comment above .status-box says "Dark text on light
# blue background", but the declared colours are light text (#e0f2fe) on a
# dark navy background (#1e3a5f).
CUSTOM_CSS = """
.gradio-container {
    max-width: 1400px;
    margin: 0 auto;
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
}

.header-section {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 2.5rem;
    border-radius: 16px;
    margin-bottom: 2rem;
    box-shadow: 0 8px 24px rgba(102, 126, 234, 0.25);
}

.header-section h1 {
    margin: 0;
    font-size: 2.8rem;
    font-weight: 700;
    color: white !important;
}

.header-section p {
    margin: 0.75rem 0 0 0;
    font-size: 1.1rem;
    opacity: 0.95;
    color: white !important;
}

.developer-credit {
    background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
    color: white !important;
    padding: 0.75rem 1.5rem;
    border-radius: 50px;
    font-weight: 600;
    font-size: 1rem;
    display: inline-block;
    margin-top: 1rem;
    box-shadow: 0 4px 15px rgba(245, 87, 108, 0.4);
    animation: glow 2s ease-in-out infinite alternate;
}

@keyframes glow {
    from { box-shadow: 0 4px 15px rgba(245, 87, 108, 0.4); }
    to { box-shadow: 0 4px 25px rgba(240, 147, 251, 0.6); }
}

/* STATUS BOX - Dark text on light blue background */
.status-box {
    background: #1e3a5f !important;
    border-left: 4px solid #60a5fa;
    padding: 1.5rem;
    border-radius: 8px;
    margin: 1rem 0;
    color: #e0f2fe !important;
}

.status-box * {
    color: #e0f2fe !important;
}

.status-box strong, .status-box b {
    color: #93c5fd !important;
}

/* ANSWER BOX - White/Light text on dark green background */
.answer-box {
    background: #064e3b !important;
    border-left: 4px solid #34d399;
    padding: 2rem;
    border-radius: 8px;
    line-height: 1.8;
    margin: 1rem 0;
    color: #d1fae5 !important;
}

.answer-box * {
    color: #d1fae5 !important;
}

.answer-box strong, .answer-box b {
    color: #6ee7b7 !important;
    font-weight: 700 !important;
}

.answer-box em, .answer-box i {
    color: #a7f3d0 !important;
}

.answer-box code {
    background: #047857 !important;
    color: #ecfdf5 !important;
    padding: 0.2rem 0.4rem;
    border-radius: 4px;
}

.answer-box a {
    color: #5eead4 !important;
    text-decoration: underline;
}

.answer-box h1, .answer-box h2, .answer-box h3, .answer-box h4 {
    color: #6ee7b7 !important;
}

.answer-box ul, .answer-box ol, .answer-box li {
    color: #d1fae5 !important;
}

/* INFO BOX - Light text on dark purple background */
.info-box {
    background: #3b0764 !important;
    padding: 1.5rem;
    border-radius: 10px;
    border: 1px solid #a855f7;
    margin: 1rem 0;
    color: #f3e8ff !important;
}

.info-box h4 {
    color: #e9d5ff !important;
    font-weight: 700 !important;
}

.info-box p {
    color: #e9d5ff !important;
}

.info-box b {
    color: #c084fc !important;
}

button.primary {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    border: none !important;
    padding: 0.875rem 2rem !important;
    font-weight: 600 !important;
    border-radius: 10px !important;
    box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3) !important;
}

button.primary:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 6px 20px rgba(102, 126, 234, 0.4) !important;
}

.footer-credit {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white !important;
    padding: 1rem 2rem;
    border-radius: 10px;
    font-weight: 600;
    font-size: 1.1rem;
    display: inline-block;
    margin-top: 0.5rem;
}
"""

def create_interface():
    """
    Create the complete Gradio interface.

    Layout: a header banner, a left column (PDF upload, "Process Document"
    button, status line, usage steps, configuration summary), a right column
    (question box, example questions, answer and source panels) and a footer.

    Event wiring:
        * "Process Document" calls init_rag_pipeline with the uploaded file
          path and writes its status string to the status panel.
        * "Get Answer" and submitting the question box both call
          process_query and fill the answer and sources panels.

    Returns:
        The assembled gr.Blocks application (not yet launched).
    """

    with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft(), title="AskMyDocs") as demo:

        # ---- Header banner ----
        gr.HTML("""
        <div class='header-section'>
            <h1>üìö AskMyDocs</h1>
            <p>Upload any PDF document and get instant answers powered by AI</p>
            <p style='font-size: 0.9rem; margin-top: 0.5rem; opacity: 0.85;'>
                Powered by RAG (Retrieval-Augmented Generation)
            </p>
            <div class='developer-credit'>üë®‚Äçüíª Developed by Shivam Kumar Yadav</div>
        </div>
        """)

        with gr.Row():

            # ---- Left column: upload, status, help ----
            with gr.Column(scale=1):
                gr.Markdown("### üì§ Upload Document")

                pdf_upload = gr.File(
                    label="Select PDF File",
                    file_types=[".pdf"],
                    type="filepath"
                )

                process_btn = gr.Button(
                    "üîÑ Process Document",
                    variant="primary",
                    size="lg"
                )

                status_box = gr.Markdown(
                    "**Status:** ‚è≥ Waiting for document upload...",
                    elem_classes="status-box"
                )

                gr.Markdown("---")

                gr.Markdown("""
                ### üí° How to Use

                **Step 1:** Upload your PDF file

                **Step 2:** Click "Process Document"

                **Step 3:** Ask questions about your document

                **Step 4:** Get AI-powered answers with sources
                """)

                gr.Markdown("---")

                # The chunking figures come from the configuration constants
                # so this panel cannot drift from what the splitter uses.
                gr.HTML(f"""
                <div class='info-box'>
                    <h4 style='margin-top: 0;'>‚öôÔ∏è Configuration</h4>
                    <p style='font-size: 0.9rem; margin: 0.5rem 0;'>
                        <b>LLM:</b> Llama 3.3 70B (Groq)<br>
                        <b>Embeddings:</b> MiniLM-L6-v2<br>
                        <b>Vector DB:</b> ChromaDB<br>
                        <b>Chunks:</b> {CHUNK_SIZE} chars, {CHUNK_OVERLAP} overlap
                    </p>
                </div>
                """)

            # ---- Right column: question, answer, sources ----
            with gr.Column(scale=2):
                gr.Markdown("### ‚ùì Ask Your Question")

                query_input = gr.Textbox(
                    placeholder="Example: What are the main topics covered in this document?",
                    lines=4,
                    label="Your Question",
                    show_label=False
                )

                gr.Markdown("**üìå Quick Examples:**")
                gr.Examples(
                    examples=[
                        "What are the main topics covered in this document?",
                        "Summarize the key findings or conclusions",
                        "What methodology or approach is described?",
                        "Explain the most important concepts",
                        "What are the main arguments or points made?"
                    ],
                    inputs=query_input,
                    label=None
                )

                submit_btn = gr.Button(
                    "üîç Get Answer",
                    variant="primary",
                    size="lg"
                )

                gr.Markdown("---")
                gr.Markdown("### üí¨ AI Response")

                answer_output = gr.Markdown(
                    "*Your answer will appear here...*",
                    elem_classes="answer-box"
                )

                gr.Markdown("---")
                gr.Markdown("### üìö Source References")

                sources_output = gr.Markdown(
                    "*Source citations will appear here...*"
                )

        # ---- Footer ----
        gr.HTML("""
        <div style='text-align: center; margin-top: 3rem; padding: 2rem;
                    border-top: 2px solid #e5e7eb;'>
            <p style='margin: 0; font-size: 0.95rem; color: #374151;'>
                <strong>AskMyDocs</strong> - Intelligent Document Search System
            </p>
            <p style='margin: 0.5rem 0 1rem 0; font-size: 0.85rem; color: #6b7280;'>
                Powered by LangChain ‚Ä¢ Groq ‚Ä¢ ChromaDB ‚Ä¢ HuggingFace ‚Ä¢ Gradio
            </p>
            <div class='footer-credit'>üë®‚Äçüíª Developed by Shivam Kumar Yadav</div>
        </div>
        """)

        # ---- Event wiring ----
        process_btn.click(
            fn=init_rag_pipeline,
            inputs=pdf_upload,
            outputs=status_box,
            show_progress=True
        )

        # The button click and submitting the textbox trigger the same
        # handler with the same inputs and outputs.
        for register_trigger in (submit_btn.click, query_input.submit):
            register_trigger(
                fn=process_query,
                inputs=query_input,
                outputs=[answer_output, sources_output],
                show_progress=True
            )

    return demo



In [15]:


# Startup banner: blank line, rule, two title lines, rule.
print(
    "\n" + "=" * 70,
    "üöÄ LAUNCHING ASKMYDOCS",
    "üë®‚Äçüíª Developed by Shivam Kumar Yadav",
    "=" * 70,
    sep="\n",
)

# Build the Blocks app, then serve it. share=True requests a public
# gradio.live URL; debug=True keeps the cell running so logs stay visible.
demo = create_interface()
demo.launch(share=True, debug=True, show_error=True)


üöÄ LAUNCHING ASKMYDOCS
üë®‚Äçüíª Developed by Shivam Kumar Yadav
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1d70cc40038602e49d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/protocols/http/httptools_impl.py", line 409, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/uvicorn/middleware/proxy_headers.py", line 60, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/fastapi/applications.py", line 1139, in __call__
    await super().__call__(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/applications.py", line 107, in __call__
    await self.middleware_stack(scope, receive, send)
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/errors.py", line 186, in __call__
    raise exc
  File "/usr/local/lib/python3.12/dist-packages/starlette/middleware/error

üì• Starting RAG Pipeline Initialization...
üìÑ Loading PDF document...




‚úì Document loaded: 1 pages
‚úÇÔ∏è Splitting document into chunks...




‚úì Document split into 1970 chunks
üî¢ Creating embeddings (this may take a minute)...
‚úì Embedding model loaded
üíæ Creating vector database...
‚úì Vector database created and persisted
üîç Initializing retriever...
‚úì Retriever ready
ü§ñ Initializing LLM...
‚úì LLM initialized
‚õìÔ∏è Building RAG chain...
‚úì RAG Pipeline initialized successfully!

üîç Processing query: what is logging
‚úì Query processed successfully

üîç Processing query: how to create virtural environments
‚úì Query processed successfully

üîç Processing query: how to manage packages with pip
‚úì Query processed successfully
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://1d70cc40038602e49d.gradio.live


