Credits: Ed Donner. This notebook follows his course: https://www.udemy.com/course/llm-engineering-master-ai-and-large-language-models/?couponCode=KEEPLEARNING

In [1]:
## Prep1: import all the necessary libraries

# Import OS module for file and directory operations (paths, file checks, etc.)
import os

# Import Gradio for creating the web-based chat UI interface
import gradio as gr

# Import Anthropic SDK for accessing Claude AI API
import anthropic

# Import load_dotenv to read API keys from .env file
from dotenv import load_dotenv

# Import text splitter to break documents into smaller chunks for embedding
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Import TextLoader to load and process text files
from langchain_community.document_loaders import TextLoader

# Import OpenAI embeddings to convert text into vector representations
from langchain_openai import OpenAIEmbeddings  # pip install langchain-openai

# Import Chroma vector database for storing and retrieving document embeddings
from langchain_chroma import Chroma           # pip install langchain-chroma

# Import Anthropic LLM wrapper from LangChain (alternative Claude interface)
from langchain_community.llms import Anthropic

# Import datetime for handling file modification timestamps
from datetime import datetime

# Import PyPDF2 for reading and extracting text from PDF files
from PyPDF2 import PdfReader

# import re for regular expressions
import re


# Global verbosity switch: when True, the cells below print extra diagnostic
# output (resolved paths, loaded file names, retrieval hits, prompt notices).
DEBUG_MODE=True

In [2]:
## Prep2: configuration
## Run this notebook from the directory that contains the "SirusAI" documents folder.
# ==== Configuration ====

# --- Chunking / retrieval tuning ---
# For general RAG applications, 300-600 tokens is a good balance.
# For technical/legal documents, use larger chunks (600+).
# For FAQs or structured text, smaller chunks (200-400) work better.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 55   # 10 to 20% of chunk size
TOP_K = 5            # Number of chunks to retrieve
# Arbitrary name for the Chroma collection; a Chroma DB can hold multiple collections.
COLLECTION_NAME = "sirusai_knowledgebase"

# --- Model and storage names ---
V_CHROMA_DB_FILE = "chroma.sqlite3"
V_CHROMA_DB_FILE_DIRECTORY = "ChromaDB"
V_CLAUDE_MODEL = "claude-haiku-4-5"

# --- Paths, resolved against the current working directory ---
current_directory = os.getcwd()
# NOTE(review): the complete absolute path is lower-cased here, which only resolves
# reliably on case-insensitive filesystems — confirm this is intended.
FILES_DIR = os.path.join(current_directory, "SirusAI").lower()
CHROMA_DB_DIR = os.path.join(current_directory, V_CHROMA_DB_FILE_DIRECTORY)

if DEBUG_MODE:
    print(f"The directory for the files is: {FILES_DIR}")
    print(f"The directory for the Chroma database is: {CHROMA_DB_DIR}")



The directory for the files is: d:\programs\newprojects\sirus-ai\alphacentauri\rag\sirusai
The directory for the Chroma database is: d:\Programs\newprojects\Sirus-AI\alphacentauri\RAG\ChromaDB


In [3]:
## 0) Check latest time that the Chroma database was updated ====
## so that files updated after this time will be loaded into the vectordatabase

def get_last_vdb_update():
    """
    Reports when the Chroma database file was last modified (debug output only).

    Looks for V_CHROMA_DB_FILE inside CHROMA_DB_DIR and, when DEBUG_MODE is on,
    prints either its last-modified date or a note that the file does not exist.

    Returns:
        int: always 0 for now, so callers treat every file as newer than the
        database. Incremental loading stays disabled until the vector database
        update is made incremental AND removes the old records of changed files.
    """
    file_path = os.path.join(CHROMA_DB_DIR, V_CHROMA_DB_FILE)

    if DEBUG_MODE:
        if os.path.exists(file_path):
            # getmtime returns seconds since the epoch; convert for readability.
            last_modified_date = datetime.fromtimestamp(os.path.getmtime(file_path))
            print("Last Modified Date:", last_modified_date)
        else:
            print("The chroma database file does not exist")

    # Deliberately not returning the real modification time yet (see docstring).
    return 0

In [4]:
# Smoke test: shows the debug line (when DEBUG_MODE is on) followed by the returned value.
print(get_last_vdb_update())

Last Modified Date: 2025-11-14 19:09:26.488613
0


In [5]:
## 1. Function to load the files that were updated since a specific time from the directory
##### Loads all the files as text and puts them in a list of dictionaries with the content and metadata

def load_files(directory, last_vdb_update=0):
    """
    Reads .txt and .pdf files from a directory tree into a list of documents.

    - TXT files are read as UTF-8 text.
    - PDF files are converted to text with PyPDF2; every non-empty page is
      prefixed with a "[Page N]" marker so page numbers survive chunking.

    File extensions are matched case-insensitively. A file that cannot be read
    (I/O error, invalid UTF-8, broken PDF) is reported and skipped, so one bad
    file does not abort the whole load. Problems are always printed; the
    progress output is only printed when DEBUG_MODE is on.

    Args:
        directory (str): Root directory; subdirectories are walked as well.
        last_vdb_update (float): Seconds since the epoch. Only files modified
            strictly after this moment are loaded; with the default 0 that is
            effectively every file.

    Returns:
        list[dict]: One dict per loaded file with
            'content'  - the file's text, and
            'metadata' - {'source': <file name>, 'type': 'text' or 'pdf'} plus
                         'total_pages' (count of non-empty pages) for PDFs.
    """
    documents = []
    step_counter = 0

    if DEBUG_MODE:
        print("Loading the txt & pdf files ...")

    for root, _, files in os.walk(directory):
        for filename in files:
            # --- Select allowed file types (case-insensitive) ---
            lowered_name = filename.lower()
            if not lowered_name.endswith((".txt", ".pdf")):
                continue

            step_counter += 1
            # Only echo the first few names to keep the debug output short.
            if DEBUG_MODE and step_counter < 10:
                print(filename)

            filepath = os.path.join(root, filename)

            # Skip older files: ONLY LOAD FILES NEWER THAN THE LAST UPDATE
            if os.path.getmtime(filepath) <= last_vdb_update:
                continue

            # --- TXT HANDLING ---
            if lowered_name.endswith(".txt"):
                try:
                    with open(filepath, "r", encoding="utf-8") as file:
                        content = file.read()
                except (OSError, UnicodeDecodeError) as e:
                    print(f"❌ Error reading text file {filename}: {e}")
                    continue

                documents.append({
                    "content": content,
                    "metadata": {
                        "source": filename,
                        "type": "text"
                    }
                })
                continue

            # --- PDF HANDLING ---
            try:
                reader = PdfReader(filepath)
                pdf_pages = []
                for page_num, page in enumerate(reader.pages, start=1):
                    page_text = page.extract_text() or ""
                    if page_text.strip():  # Only keep non-empty pages
                        pdf_pages.append((page_num, page_text))
            except Exception as e:
                # Broad on purpose: any parser failure should skip only this file.
                print(f"❌ Error reading PDF {filename}: {e}")
                continue

            if not pdf_pages:
                print(f"⚠️ Warning: PDF produced no text → {filename}")
                continue

            # One document per PDF, with the original page numbers kept as markers.
            full_content = "\n\n".join(
                f"[Page {page_num}]\n{page_text}" for page_num, page_text in pdf_pages
            )
            documents.append({
                "content": full_content,
                "metadata": {
                    "source": filename,
                    "type": "pdf",
                    "total_pages": len(pdf_pages)
                }
            })

    if DEBUG_MODE:
        print(f"Number of documents added: {len(documents)}")
        if documents:
            print(f"Sample document structure: {list(documents[0].keys())}")
            print(f"Sample metadata: {documents[0]['metadata']}")

    return documents

In [6]:
## Full (non-incremental) load: last_vdb_update=0 makes every file count as "newer".
docs = load_files(FILES_DIR, last_vdb_update=0)

# Preview one document. The index is guarded so that a knowledge base with fewer
# documents does not end the cell with an IndexError.
SAMPLE_DOC_INDEX = 3
if len(docs) > SAMPLE_DOC_INDEX:
    print(docs[SAMPLE_DOC_INDEX])
else:
    print(f"Only {len(docs)} document(s) loaded; nothing to show at index {SAMPLE_DOC_INDEX}.")

Loading the txt & pdf files ...
SirusAI - AI Als Hefboom.pdf
SirusAI - Brand Manifesto.pdf
SirusAI - Brand Voice Guidelines.pdf
SirusAI - Business Plan.pdf
SirusAI - Offering.pdf
SirusAI - Positionering.pdf
SirusAI - WaardeVoorstel.pdf
SirusAI-ICP-Definitie-Federaties.pdf
SirusAI-ICP-Definitie-KMO.pdf
Number of documents added: 10
Sample document structure: ['content', 'metadata']
Sample metadata: {'source': 'SirusAI - AI Als Hefboom.pdf', 'type': 'pdf', 'total_pages': 24}
{'content': '[Page 1]\nBusinessplan SirusAI  \nInnovatieve AI -oplossingen voor een digitale toekomst  \nContents  \n1. Executive Summary ................................ ................................ ................................ ................  2 \nHero Line  ................................ ................................ ................................ ................................ .......  2 \nWat we doen  ................................ ................................ ............................

In [7]:
def extract_page_number(text):
    """
    Finds the first "[Page X]" marker in a string and returns X.

    Args:
        text (str): String that may contain one or more "[Page X]" markers

    Returns:
        int: X taken from the earliest marker, or -1 when there is no marker
    """
    first_marker = re.search(r'\[Page (\d+)\]', text)
    return -1 if first_marker is None else int(first_marker.group(1))

# Quick manual checks of extract_page_number: a single marker, several markers
# (the first one wins), and no marker at all (-1).
print("Test 1:", extract_page_number("[Page 14] This is some text"))  # Expected: 14
print("Test 2:", extract_page_number("Some text [Page 5] more text [Page 10]"))  # Expected: 5 (first marker)
print("Test 3:", extract_page_number("No page number here"))  # Expected: -1

Test 1: 14
Test 2: 5
Test 3: -1


In [17]:
# 2) Process: splits (chunking) and encodes documents into ChromaDB ====
def process_documents():
    """
    Loads, splits, and stores documents into ChromaDB with metadata.

    Steps:
      1. Load every .txt/.pdf file under FILES_DIR. load_files() is called with
         its default last_vdb_update=0, so this is a full reload, not an
         incremental one.
      2. Split each document into chunks of CHUNK_SIZE with CHUNK_OVERLAP.
      3. For PDF documents, make sure every chunk carries a "[Page N]" marker:
         a chunk without one is prefixed with the page of the most recent
         marker seen in that document.
      4. Embed the chunks with OpenAIEmbeddings and store them in the Chroma
         collection COLLECTION_NAME under CHROMA_DB_DIR.

    Each chunk's metadata is a copy of its document's metadata plus 'chunk_id'
    (1-based) and 'total_chunks'.

    Every chunk gets a deterministic id built from its source file name and
    chunk number, so re-running this function overwrites the same records
    instead of appending duplicates. Records of chunks that no longer exist
    (for example after a document shrinks) are NOT removed.

    Returns:
        None
    """
    print("🔍 Loading documents... from location: ", FILES_DIR)
    raw_docs = load_files(FILES_DIR)

    # Split text into chunks while preserving metadata
    print("🔍 Splitting documents... into chunks of ", CHUNK_SIZE, " with an overlap of ", CHUNK_OVERLAP)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    doc_chunks = []
    metadata_list = []
    chunk_ids = []
    # source file name -> number of documents with that name seen so far; keeps
    # ids unique when the same file name occurs in several subdirectories.
    docs_per_source = {}

    for doc in raw_docs:
        source = doc["metadata"].get("source", "unknown")
        occurrence = docs_per_source.get(source, 0)
        docs_per_source[source] = occurrence + 1

        is_pdf = doc["metadata"].get("type") == "pdf"
        chunks = splitter.split_text(doc["content"])
        last_found_page = 1

        for i, chunk in enumerate(chunks):
            # Text files have no pages, so only PDF chunks get page markers.
            if is_pdf:
                page_markers = re.findall(r'\[Page (\d+)\]', chunk)
                if page_markers:
                    # A chunk can span several pages; the text that follows
                    # continues on the LAST page announced in this chunk.
                    last_found_page = int(page_markers[-1])
                else:
                    chunk = " [Page " + str(last_found_page) + "]" + chunk

            if DEBUG_MODE:
                print("## Chunk: #", i, ": ", chunk)
                print("## Metadata: ", doc["metadata"])

            doc_chunks.append(chunk)

            # Copy so the per-chunk fields do not leak into the document's metadata.
            chunk_metadata = doc["metadata"].copy()
            chunk_metadata["chunk_id"] = i + 1
            chunk_metadata["total_chunks"] = len(chunks)
            metadata_list.append(chunk_metadata)

            chunk_ids.append(f"{source}#{occurrence}:{i + 1}")

    print(f"📄 {len(doc_chunks)} document chunks processed from {len(raw_docs)} documents.")

    if not doc_chunks:
        # Nothing to embed; skip the embedding and database calls entirely.
        print("⚠️ No document chunks to store; ChromaDB was not modified.")
        return

    # Create the collection, or add to / overwrite records in an existing one.
    print("⚙️ Initializing ChromaDB with from_texts()... on location: ", CHROMA_DB_DIR)
    Chroma.from_texts(
        texts=doc_chunks,
        metadatas=metadata_list,  # Metadata for each chunk, index-aligned with texts
        ids=chunk_ids,            # Stable ids, index-aligned with texts
        embedding=OpenAIEmbeddings(),
        persist_directory=CHROMA_DB_DIR,
        collection_name=COLLECTION_NAME
    )

    print(f"✅ {len(doc_chunks)} records with metadata loaded into ChromaDB collection '{COLLECTION_NAME}'.")

In [19]:
# Run the ingestion pipeline: load, chunk, embed, and store the documents in ChromaDB.
process_documents()

IndentationError: unindent does not match any outer indentation level (<string>, line 15)

In [None]:
# ==== 3) Query ChromaDB for retrieval ====
def retrieve_context(query, k=None):
    """
    Retrieves the document chunks most similar to a query from ChromaDB.

    Opens the collection COLLECTION_NAME stored under CHROMA_DB_DIR and runs a
    similarity search with OpenAI embeddings.

    Args:
        query (str): The search text, typically the user's question.
        k (int, optional): Number of chunks to retrieve. Defaults to TOP_K.

    Returns:
        list: The similarity_search results; the code below relies on each
        item exposing `.page_content` and `.metadata`.
    """
    if k is None:
        k = TOP_K

    # (A) Re-open Chroma using the same collection name & directory as ingestion
    vectorstore = Chroma(
        embedding_function=OpenAIEmbeddings(),
        persist_directory=CHROMA_DB_DIR,
        collection_name=COLLECTION_NAME
    )

    # (B) similarity_search to retrieve the top k chunks
    results = vectorstore.similarity_search(query, k=k)

    if DEBUG_MODE:
        print(f"\n🔎 Found {len(results)} relevant segment(s) for query: '{query}'")
        for i, r in enumerate(results, 1):
            print(f"   [{i}] Source: {r.metadata.get('source', 'Unknown')}, Chunk {r.metadata.get('chunk_id', '?')}/{r.metadata.get('total_chunks', '?')}")
            if r.metadata.get('type') == 'pdf':
                # PDF chunks carry "[Page X]" markers in their text; list all of them.
                page_matches = re.findall(r'\[Page (\d+)\]', r.page_content)
                if page_matches:
                    print(f"       Page marker(s) found: Page {', '.join(page_matches)}")

    return results

In [None]:
# Smoke test for retrieval: run one query, then show the type and size of the
# result followed by the raw result objects.
vector_store_result =retrieve_context("Wat is SirusAI?")
print("type of vector_store_result", type(vector_store_result)," ", "with ",len(vector_store_result)," elements")
print(vector_store_result)

In [None]:
# ==== 4) Query Anthropic Claude with Streaming ====
def ask_llm_stream(prompt):
    """
    Streams Claude's answer to a single-turn prompt.

    Sends `prompt` as one user message to the model named by V_CLAUDE_MODEL
    (max_tokens=1000, temperature=0.7) through the module-level Anthropic
    `client`, which must exist by the time this generator is consumed.

    Yields:
        The text pieces produced by the stream's `text_stream`, in order.
    """
    request = dict(
        model=V_CLAUDE_MODEL,
        max_tokens=1000,
        temperature=0.7,
        messages=[{"role": "user", "content": prompt}],
    )
    # The context manager closes the stream once iteration ends.
    with client.messages.stream(**request) as stream:
        yield from stream.text_stream

# def ask_llm(prompt):
#     """Non-streaming version for backward compatibility."""
#     response = client.messages.create(
#         model=v_claude_model,
#         max_tokens=1000,
#         temperature=0.7,
#         messages=[{"role": "user", "content": prompt}]
#     )
#     return response.content[0].text

In [None]:
# ==== Helper: Format Sources for Display ====
def format_sources_list(results):
    """
    Creates a formatted list of all sources used in the response.
    Can be appended to the chatbot response for transparency.

    Results are numbered by their 1-based position. When several results point
    to the same source (same file name and, for PDFs, same first page marker)
    they share one line that lists all of their numbers, e.g.
    "**Source 1, 3**: report.pdf (Page 2)", so no source number disappears
    from the list.

    Args:
        results: Iterable of retrieved chunks exposing `.metadata` (dict) and
            `.page_content` (str); may be empty or None.

    Returns:
        str: A markdown block starting with a horizontal rule and a
        "Sources Used" heading, or "" when there are no results.
    """
    if not results:
        return ""

    # full source label -> positions of the results that share it; dicts keep
    # insertion order, so lines appear in order of first occurrence.
    positions_by_source = {}

    for position, result in enumerate(results, 1):
        source_name = result.metadata.get('source', 'Unknown Source')

        # PDF chunks carry "[Page X]" markers in their text.
        page_info = ""
        if result.metadata.get('type') == 'pdf':
            page_matches = re.findall(r'\[Page (\d+)\]', result.page_content)
            if page_matches:
                # If the chunk spans multiple pages, use the first page number
                page_info = f" (Page {page_matches[0]})"

        full_source = f"{source_name}{page_info}"
        positions_by_source.setdefault(full_source, []).append(position)

    sources_info = [
        f"**Source {', '.join(str(p) for p in positions)}**: {full_source}"
        for full_source, positions in positions_by_source.items()
    ]

    return "\n\n---\n📚 **Sources Used:**\n" + "\n".join(sources_info)


In [None]:
# ==== 5) Construct the full query and call Claude ====
def construct_and_query_stream(query, results):
    """
    Builds the RAG prompt with labelled sources and streams Claude's answer.

    Every retrieved chunk is put into the context under a label of the form
    "[Source N: <file name>, Page X]" (the page part only for PDF chunks that
    contain a "[Page X]" marker). The prompt instructs the model to cite these
    "[Source X]" labels, so the label text must match that wording.

    Args:
        query (str): The user's question.
        results: Iterable of retrieved chunks exposing `.metadata` (dict) and
            `.page_content` (str).

    Yields:
        The text pieces produced by ask_llm_stream() for the constructed prompt.
    """
    # Build context with source citations
    context_parts = []
    for i, result in enumerate(results, 1):
        source_name = result.metadata.get('source', 'Unknown Source')

        # Extract the page number from the content if it's a PDF with [Page X] markers
        page_info = ""
        if result.metadata.get('type') == 'pdf':
            page_matches = re.findall(r'\[Page (\d+)\]', result.page_content)
            if page_matches:
                # Use the first page number if the chunk spans multiple pages
                page_info = f", Page {page_matches[0]}"

        # Format: [Source 1: Document Name, Page X]
        source_label = f"[Source {i}: {source_name}{page_info}]"

        # Add labelled context
        context_parts.append(f"{source_label}\n{result.page_content}\n")

    context = "\n".join(context_parts)

    prompt = f"""You are an AI assistant using a Retrieval-Augmented Generation (RAG) system.

IMPORTANT INSTRUCTIONS:
1. Only answer the question based on the provided context below
2. If you don't have enough information in the context, say so clearly
3. ALWAYS cite your sources using the [Source X] labels provided in the context
4. When referencing information, add the source citation at the end of the relevant sentence or paragraph
5. You can cite multiple sources if the information comes from different documents
6. Format citations like this: (Source 1), (Source 2), or (Source 1, Source 3)

CONTEXT WITH SOURCES:
{context}

QUESTION:
{query}

ANSWER (remember to cite sources):
"""
    if DEBUG_MODE:
        print(f"\n🤖 Sending prompt to Claude with {len(results)} segment(s).")

    # Stream the response
    yield from ask_llm_stream(prompt)

# def construct_and_query(query, results):
#     """Non-streaming version for backward compatibility."""
#     context = "\n".join(result.page_content for result in results)
#     prompt = f"""You are an AI assistant using a Retrieval-Augmented Generation (RAG) system.
# Only answer the question if you know the answer, don't make it up. If you don't have the answer, say it.
# Use the following context to answer the user's question:

# CONTEXT:
# {context}

# QUESTION:
# {query}

# ANSWER:
# """
#     if DEBUG_MODE:
#         print(f"\nü§ñ Sending prompt to Claude with {len(results)} segment(s).")
#     return ask_llm(prompt)

In [None]:
# ==== 6) Build Gradio UI with Streaming ====
def chatbot_ui_stream(message, history):
    """
    Handles user queries through the Gradio UI with streaming and source citations.

    Retrieves context for `message`, streams Claude's answer, and finally
    appends the formatted list of sources.

    Args:
        message (str): The user's question.
        history: Chat history supplied by the UI; not used by this function.

    Yields:
        str: The response text accumulated so far (each yield is the full text
        up to that point, not a delta). The last yield includes the sources
        list when there is one.
    """
    # A blank message has nothing to retrieve or answer; skip both service calls.
    if not message or not message.strip():
        yield "Please enter a question."
        return

    vdb_results = retrieve_context(message)
    if DEBUG_MODE:
        print(f"\n💬 User input: {message}")
        print(f"📚 VDB results: {len(vdb_results)} segments retrieved")

    # Stream the response from Claude, re-emitting the growing text each time
    full_response = ""
    for text_chunk in construct_and_query_stream(message, vdb_results):
        full_response += text_chunk
        yield full_response

    # After streaming is complete, append the formatted sources list
    sources_list = format_sources_list(vdb_results)
    if sources_list:
        yield full_response + sources_list

# def chatbot_ui(user_input):
#     """Non-streaming version for backward compatibility."""
#     vdb_results = retrieve_context(user_input)
#     if DEBUG_MODE:
#         print("chatbot_ui: user input" + str(user_input))
#         print("chatbot_ui: vdb result" + str(vdb_results))
#     response = construct_and_query(user_input, vdb_results)
#     return response

In [None]:
# ==== Main ====
# Loads the API keys from .env and creates the Anthropic client used by ask_llm_stream().
if DEBUG_MODE:
    print("###########################################################")
    print("## DEBUG_MODE = TRUE")
    print("###########################################################")

# override=True: values from .env always win over existing environment variables.
load_dotenv(override=True)

# Anthropic API key (needed for the chat model). Raising stops the cell here.
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')
if not ANTHROPIC_API_KEY:
    raise ValueError("❌ ERROR: Missing Anthropic API Key. Add it to your .env file.")
os.environ['ANTHROPIC_API_KEY'] = ANTHROPIC_API_KEY
client = anthropic.Anthropic()

# OpenAI API key (needed for the embeddings)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    raise ValueError("❌ ERROR: Missing OpenAI API Key. Add it to your .env file.")
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY



In [None]:
# ==== Create Beautiful Gradio Interface with Streaming ====

# Custom CSS for professional, colorful design
# CSS passed to gr.Blocks(css=...). Selectors such as #chatbot refer to elem_id
# values set on the components.
# NOTE(review): `font-color` in the `button` rule is not a CSS property and is
# ignored by browsers; it is left as-is because switching it to `color` would
# change the current look of the buttons — decide deliberately.
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #EEEEEE 0%, #FFFFFF 100%) !important;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

/* HEADER BOX */
.app-header {
    text-align: center;
    padding: 2rem;
    background: rgba(255, 255, 255, 0.95);
    border-radius: 20px;
    margin: 2rem 2rem;
    box-shadow: 0 3px 16px rgba(0, 12, 110, 0.3) !important;
}

.app-title {
    font-size: 3rem;
    font-weight: bold;
    background: linear-gradient(135deg, #000000 0%, #ff5500 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    margin-bottom: 0.5rem;
}

.app-subtitle {
    font-size: 1.2rem;
    color: #333;
    margin-top: 0.5rem;
}

/* CHATBOX */
#chatbot {
    height: 500px !important;
    border-radius: 15px !important;
    box-shadow: 0 8px 32px rgba(0, 12, 110, 0.3) !important;
}

.message-wrap {
    font-size: 1.1rem !important;
    line-height: 1.6 !important;
}

.user-message {
    background: linear-gradient(135deg, #000C6E 0%, #0015a0 100%) !important;
    box-shadow: 0 8px 32px rgba(0, 12, 110, 0.3) !important;
}

.bot-message {
    background: linear-gradient(135deg, #ff5500 0%, #ff7733 100%) !important;
}

/* MAIN PAGE CONTAINER */
#component-0 {
    background: rgba(255, 255, 255, 0.95);
    border-radius: 20px;
    padding: 2rem;
    box-shadow: 0 8px 32px rgba(0, 12, 110, 0.3) !important;
}

/* SIDEBAR */
#sidebar {
    background: rgba(255, 255, 255, 0.95);
    padding: 1.5rem;
    border-radius: 15px;
    box-shadow: 0 8px 32px rgba(0, 12, 110, 0.3) !important;
}

/* TEXTBOX */
.input-textbox {
    font-size: 1.1rem !important;
    border-radius: 10px !important;
    box-shadow: 0 8px 32px rgba(0, 12, 110, 0.3) !important;
}

/* BUTTONS */
button {
    border: none !important;
    border-radius: 10px !important;
    font-color: white !important;
    font-weight: bold !important;
    font-size: 1rem !important;
    padding: 0.8rem 2rem !important;
    transition: transform 0.2s !important;
    box-shadow: 0 8px 32px rgba(0, 12, 110, 0.3) !important;
}

button:hover {
    transform: scale(1.05) !important;
    background: linear-gradient(135deg, #ff7733 0%, #001aa0 100%) !important;
}

/* FOOTER */
.footer {
    text-align: center;
    padding: 1rem;
    color: white;
    font-size: 0.9rem;
}

"""

In [None]:
# ==== Build the Gradio UI ====

# Main interface with the custom CSS styling and the Soft theme
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

    # Header section: title and subtitle of the application
    with gr.Row():
        gr.HTML("""
            <div class="app-header">
                <h1 class="app-title">SirusAI Personal Assistant</h1>
                <p class="app-subtitle">Your RAG-powered business partner.</p>
            </div>
        """)

    # Main chat interface: two-column layout (sidebar + chat)
    with gr.Row():
        # Left column (scale=1): sidebar with tips and features
        with gr.Column(scale=1):
            gr.Markdown("""
            ### 💡 Tips
            - Ask specific questions about your documents
            - Use natural language
            - The AI retrieves relevant context automatically
            
            ### 📚 Features
            - ✅ Real-time streaming responses
            - ✅ Context-aware answers
            - ✅ PDF & TXT support
            - ✅ Smart document retrieval
            """)

        # Right column (scale=3): chatbot interface (3x wider than the sidebar)
        with gr.Column(scale=3):
            # Chatbot component: shows the conversation history
            chatbot = gr.Chatbot(
                elem_id="chatbot",
                label="💬 Chat with SirusAI",
                height=500,
                show_label=True,
                bubble_full_width=False,
                show_copy_button=True
            )

            # Input row: textbox for the user's message + send button
            with gr.Row():
                msg = gr.Textbox(
                    placeholder="Ask me anything about your documents in your database... ",
                    show_label=False,
                    scale=4,
                    container=False,
                    lines=2
                )
                submit_btn = gr.Button("Send", scale=1, variant="primary")

            # Action buttons row: clear and retry
            with gr.Row():
                clear_btn = gr.Button("Clear Chat", scale=1)
                retry_btn = gr.Button("Retry", scale=1)

    # Footer section: credits and copyright
    with gr.Row():
        gr.HTML("""
            <div class="footer">
                <p>Built with ❤️ using LangChain, ChromaDB, and Claude AI | © 2025 SirusAI</p>
            </div>
        """)

    # ==== Event handler functions ====
    # The history is a list of [user_message, bot_message] pairs.

    def user(user_message, history):
        """Clears the textbox and appends the user's message with a pending (None) reply."""
        return "", (history or []) + [[user_message, None]]

    def bot_response(history):
        """Streams the answer for the last user message into the last history entry."""
        if not history:
            # Nothing to answer (e.g. Retry on an empty chat).
            yield history
            return

        user_message = history[-1][0]  # Most recent user message
        history[-1][1] = ""            # Start the bot reply as empty

        # chatbot_ui_stream yields the full text so far, so each chunk replaces the reply.
        for chunk in chatbot_ui_stream(user_message, history):
            history[-1][1] = chunk
            yield history

    def reset_last_answer(history):
        """Drops only the last bot reply so bot_response regenerates it for the same question."""
        if not history:
            return history
        return history[:-1] + [[history[-1][0], None]]

    # ==== Wire UI events to the handlers ====
    # Enter key in the textbox: record the input, then generate the answer
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    # Send button: same behaviour as the Enter key
    submit_btn.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot_response, chatbot, chatbot
    )
    # Clear button: wipe the whole chat history
    clear_btn.click(lambda: None, None, chatbot, queue=False)
    # Retry button: discard the last answer and ask the same question again
    retry_btn.click(reset_last_answer, chatbot, chatbot, queue=False).then(
        bot_response, chatbot, chatbot
    )


In [None]:
# ==== TEST: Source Grounding ====
# This cell demonstrates how the source citations work: it retrieves chunks for a
# fixed query, shows the sources list, and previews one labelled context entry.

# Test query
test_query = "Wat is het waardevoorstel van SirusAI?"

# Retrieve context with metadata
print("=" * 70)
print("📋 TESTING SOURCE GROUNDING FEATURE")
print("=" * 70)
print(f"\nQuery: {test_query}\n")

# Get results from the vector database
test_results = retrieve_context(test_query)

# Show the formatted sources
print("\n" + "=" * 70)
print("📚 SOURCES THAT WILL BE CITED:")
print("=" * 70)
sources_display = format_sources_list(test_results)
print(sources_display)

# Show a sample of how context is formatted for Claude
print("\n" + "=" * 70)
print("📝 SAMPLE CONTEXT FORMAT (first 500 chars):")
print("=" * 70)
if test_results:
    sample_result = test_results[0]
    # Display-only label: the file extension is stripped here for readability.
    source_name = sample_result.metadata.get('source', 'Unknown Source').replace('.pdf', '').replace('.txt', '')
    # Only the first 100 characters are searched for a page marker in this preview.
    page_match = re.search(r'\[Page (\d+)\]', sample_result.page_content[:100])
    page_info = f", Page {page_match.group(1)}" if page_match else ""

    print(f"[Source 1: {source_name}{page_info}]")
    print(sample_result.page_content[:500] + "...")

print("\n" + "=" * 70)
if test_results:
    print("✅ Source grounding is properly configured!")
    print("   - Metadata is preserved through the pipeline")
    print("   - Sources are labeled and tracked")
    print("   - Claude will be instructed to cite these sources")
else:
    # Without any retrieved chunk there is nothing that demonstrates grounding.
    print("⚠️ No results were retrieved; source grounding could not be demonstrated.")
print("=" * 70)


In [None]:
# ==== Start the Gradio interface ====
demo.queue()  # Enable the queue for streaming support
demo.launch(
    share=False,      # Do not generate a public link
    show_error=True   # Show errors in the UI
)
