# RAG Pipeline

In [16]:
# Import libraries
import json
import openai
import os
from dotenv import load_dotenv
import psycopg2

In [17]:
# Environment variables
# load_dotenv() comes from python-dotenv; after it runs, every setting below is
# read from the process environment. A variable that is not set yields None.
load_dotenv()

PG_HOST = os.environ.get("POSTGRES_HOST")
PG_DB = os.environ.get("POSTGRES_DB")
PG_USER = os.environ.get("POSTGRES_USER")
PG_PASS = os.environ.get("POSTGRES_PASSWORD")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL")

# Root address of the documentation site
BASE_URL = "https://docs.ciroh.org"


In [18]:
# Database connection
# One PostgreSQL connection, built from the POSTGRES_* settings read above.
# It is reused by the query helpers and pipeline cells below and is closed in
# the last cell of the notebook.
conn = psycopg2.connect(
    host=PG_HOST,
    database=PG_DB,
    user=PG_USER,
    password=PG_PASS
)

def execute_query(conn, query, params=None, fetch=False):
    """Execute a SQL query with optional parameters.

    Args:
        conn: Open database connection exposing cursor(), commit() and rollback().
        query: SQL text; use %s placeholders when params are supplied.
        params: Optional values bound to the placeholders. When omitted or
            empty, the query is executed without parameter binding.
        fetch: When True, return every row of the result set. When False,
            commit the transaction after executing the statement.

    Returns:
        The rows from cursor.fetchall() when fetch is True. None when fetch is
        False, and also None when the query fails (the error is printed and
        the transaction is rolled back).
    """
    cur = conn.cursor()
    try:
        if params:
            cur.execute(query, params)
        else:
            cur.execute(query)

        if fetch:
            return cur.fetchall()

        conn.commit()
        return None
    except Exception as e:
        # Best-effort helper: report the failure and reset the transaction so
        # that later queries on the same connection can still run.
        print(f"Error executing query: {e}")
        conn.rollback()
        return None
    finally:
        # Runs on every path, including the early return with fetched rows.
        cur.close()


In [19]:
# Initialize OpenAI client
# Explicit client configured with the OPENAI_API_KEY read from the environment
# above; the chat-completion helpers defined later in the notebook call it.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [20]:
def get_breadcrumb(conn, url_id):
    """
    Build the navigation trail that leads to a page.

    The page's ancestors are followed through idurlparent with a recursive
    query, and their names are joined from the top-most ancestor down to the
    page itself.

    Args:
        conn: The database connection object.
        url_id: The idurl of the page whose trail is wanted.

    Returns:
        The trail as one string (e.g., "Home > Products > NGIAB"), or None
        when nothing was found or an error occurred.
    """

    breadcrumb_sql = """
    WITH RECURSIVE breadcrumb_path AS (
        -- Anchor Member: Select the starting page
        SELECT
            idurl,
            name,
            idurlparent,
            1 AS depth -- Initial depth level
        FROM
            tblurls
        WHERE
            idurl = %s

        UNION ALL

        -- Recursive Member: Join the table to find the parent
        SELECT
            u.idurl,
            u.name,
            u.idurlparent,
            bp.depth + 1 -- Increment depth at each level
        FROM
            tblurls u
        JOIN
            breadcrumb_path bp ON u.idurl = bp.idurlparent
    )
    -- Select the final result, aggregating the names into a single string
    SELECT
        string_agg(name, ' > ' ORDER BY depth DESC) AS breadcrumb
    FROM
        breadcrumb_path;
    """

    try:
        rows = execute_query(conn, breadcrumb_sql, params=(url_id,), fetch=True)

        # No rows at all (this includes a failed query, which yields None).
        if not rows:
            return None

        first_row = rows[0]
        if not first_row:
            return None

        # An empty or missing trail is reported as None rather than "".
        return first_row[0] or None

    except Exception as e:
        print(f"An error occurred in get_breadcrumb for idurl {url_id}: {e}")
        return None

In [21]:
# Test questions
# The triple-quoted block below is an earlier, shorter question list. It is a
# bare string expression that is never assigned, so it has no effect at runtime.
'''
questions = [
    "How can I get the meeting link to join the monthly CIROH office hours for AWS and cyberinfrastructure support?",
    "What are the main differences between the Anvil and Derecho supercomputers?",
    "What is the correct procedure for tagging an EC2 instance on AWS according to CIROH's convention?",
    "I need to run a job on the Pantarhei cluster. What is the maximum duration and core count I can request for a single job without needing special permission?",
    "I'm new to CIROH and want to start with hydrologic modeling. What is NextGen In A Box (NGIAB) and what are the main deployment options available?"
]
'''
# Active evaluation set: maps a topic name to the list of questions asked for
# that topic. The pipeline cells below iterate over it with questions.items().
questions = {
    "Getting Started & Navigation": [
        "What is CIROH DocuHub and how can I use it?",
        "How do I get started with CIROH services?",
        "Where can I find the getting started guide?",
        "How do I become a CIROH consortium member?"
    ],
    "Account & Access Requests": [
        "How do I request access to CIROH infrastructure?",
        "What are the requirements for using CIROH resources?",
        "How do I request a new CIROH account?",
        "How do I submit a request for new software installation?",
        "How do I request access to JupyterHub?",
        "How do I request GPU access for research?",
        "How do I request AWS or Google Cloud access through CIROH?",
        "How do I request access to the Pantarhei or Wukong HPC clusters?",
        "How do I request access to the NWM BigQuery API?"
    ],
    "Documentation & Training": [
        "Where can I find NextGen framework documentation?",
        "Where can I find documentation for a specific CIROH tool?",
        "Where can I access tutorials and training materials?",
        "What educational resources are available for CIROH users?"
    ],
    "Contributing Content": [
        "How do I contribute to CIROH DocuHub?",
        "How do I add my project documentation?",
        "Where do I submit tutorials or training materials?",
        "How do I publish my research findings on DocuHub?"
    ],
    "Services & Infrastructure: JupyterHub": [
        "How do I access CIROH JupyterHub?",
        "What’s the difference between Production, Staging, and Workshop JupyterHub?",
        "What software is pre-installed on JupyterHub?",
        "How do I stop my JupyterHub server when I’m not using it?",
        "How do I install custom software on JupyterHub?"
    ],
    "Services & Infrastructure: Cloud & HPC": [
        "How do I request AWS cloud credits through CIROH?",
        "What computing resources does CIROH offer?",
        "How do I use the Pantarhei HPC cluster?",
        "How do I access JetStream2?",
        "How do I request compute resources for a workshop or training?"
    ],
    "Services & Infrastructure: Data Access": [
        "How do I access the AORC dataset?",
        "Where can I find forcing data for hydrological models?",
        "How do I retrieve data for a specific region?",
        "What data formats are supported by CIROH?"
    ],
    "NextGen Framework": [
        "How do I get started with the NextGen framework?",
        "What is NextGen In A Box (NGIAB)?",
        "How do I run NextGen locally?",
        "How do I configure NextGen for my basin?",
        "How do I troubleshoot NextGen setup issues?",
        "My NextGen model isn’t running—what should I check?",
        "How do I resolve dependency conflicts in NextGen?"
    ],
    "Community & Collaboration": [
        "How do I connect with other CIROH researchers?",
        "How do I join CIROH office hours?",
        "How do I provide feedback on CIROH services?"
    ],
    "Technical Support": [
        "I can’t access my JupyterHub environment—what should I do?",
        "My data processing job failed—how can I troubleshoot?",
        "How do I optimize my computational resources?",
        "I need help with processing large datasets—where do I start?",
        "How do I report a bug or issue to CIROH IT support?"
    ],
    "Data & Workflow Best Practices": [
        "What are the best practices for managing CIROH datasets?",
        "How do I ensure reproducibility in my CIROH research?",
        "What tools are available for data visualization in CIROH?",
        "How do I convert between different hydrological data formats?"
    ],
    "Policies & Administration": [
        "What are CIROH’s usage policies?",
        "How much compute time am I allocated?",
        "What are the CIROH data sharing policies?",
        "Are there restrictions on using CIROH resources for commercial work?"
    ]
}



In [22]:
def get_embedding(text, dimensions, model=EMBEDDING_MODEL):
    """Call OpenAI to get an embedding for the given text.

    Args:
        text: The text to embed.
        dimensions: Requested size of the returned vector; forwarded unchanged
            to the embeddings endpoint.
        model: Embedding model name. Defaults to the EMBEDDING_MODEL setting as
            it was when this function was defined.

    Returns:
        The embedding of the first (only) input as returned by the API, or
        None if the request failed (the error is printed).
    """
    try:
        # Go through the notebook's explicitly configured `client`, like the
        # chat-completion helpers do, instead of the implicit module-level
        # client, so all OpenAI calls share the same credentials.
        response = client.embeddings.create(
            input=text,
            model=model,
            dimensions=dimensions
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"❌ Error generating embedding: {e}")
        return None

In [23]:
def query_embedding(question_embedding, max_relevant_urls=3):
    """Find the summarised pages closest to a question embedding.

    Args:
        question_embedding: Embedding of the question, cast to `vector` in SQL.
        max_relevant_urls: Maximum number of pages to return.

    Returns:
        Rows of (idurl, summary) ordered from closest to farthest, or None
        when the query fails.
    """
    # Pages that have summary data are ranked with the <=> operator (pgvector
    # cosine distance, so smaller means closer) and cut off at the first N.
    nearest_pages_sql = """
        SELECT idurl, summary_data ->> 'summary_text' as summary
        FROM tblurls
        WHERE summary_data IS NOT NULL
        ORDER BY embedding <=> %s::vector
        LIMIT %s
    """
    bound_values = (question_embedding, max_relevant_urls)

    # Hand the database rows straight back to the caller
    return execute_query(conn, nearest_pages_sql, params=bound_values, fetch=True)

In [24]:
def build_rag_prompt(question, context_summaries):
    """Builds the prompt for the LLM to answer the question based on context.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        context_summaries: Iterable of context strings. Entries that are None
            are skipped instead of raising a TypeError in the join.

    Returns:
        The full prompt text: instructions, the context blocks separated by
        "---" dividers, the question, and an empty answer slot.
    """

    # The level-1 query only requires summary_data to be non-NULL, so the
    # extracted summary_text can still arrive here as None; drop those entries.
    usable_context = [text for text in context_summaries if text is not None]

    # Joins the retrieved summaries into a single block of text
    context_str = "\n\n---\n\n".join(usable_context)

    prompt = f"""
You are an expert AI assistant for the CIROH DocuHub. Your task is to answer the user's question based *only* on the provided context.

If the context does not contain the answer, state that you cannot answer the question with the information given. Do not use any external knowledge.

**CONTEXT:**
---
{context_str}
---

**QUESTION:**
{question}

**ANSWER:**
"""
    return prompt

In [None]:
# Get the final answer from the LLM
def get_rag_answer(prompt):
    """Send the prompt to the chat model and return the text of its reply.

    Args:
        prompt: Complete prompt, sent as a single user message.

    Returns:
        The content of the first choice, or None if the call or the response
        handling raised (the error is printed).
    """
    user_turn = {"role": "user", "content": prompt}
    try:
        completion = client.chat.completions.create(
            model="gpt-5",
            messages=[user_turn],
            reasoning_effort="minimal",
            verbosity="low",
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"❌ Error generating RAG answer: {e}")
        return None


In [26]:
def query_chunks(question_embedding, relevant_url_ids, max_relevant_chunks=5):
    """
    Second-level search: rank the chunks stored in TBLContent against the
    question, restricted to the pages selected by the first-level search.

    Args:
        question_embedding: Embedding of the question, cast to `vector` in SQL.
        relevant_url_ids: idurl values of the pages to search within.
        max_relevant_chunks: Maximum number of chunks to return.

    Returns:
        Rows of (idurl, order, Content) ordered from closest to farthest, or
        None when the query fails.
    """
    # Only chunks whose idurl is in the given list are candidates; the closest
    # `max_relevant_chunks` of them are kept.
    nearest_chunks_sql = """
        SELECT idurl, "order", Content
        FROM TBLContent
        WHERE idurl = ANY(%s)
        ORDER BY embedding <=> %s::vector
        LIMIT %s;
    """
    bound_values = (relevant_url_ids, question_embedding, max_relevant_chunks)

    return execute_query(conn, nearest_chunks_sql, params=bound_values, fetch=True)

In [27]:
# Parameters to control the RAG pipeline behavior
# max_relevant_urls:   how many pages the level-1 (summary) search returns
# max_relevant_chunks: how many chunks the level-2 (content) search returns
# verbose:             when True, the pipeline cells print the retrieved context
max_relevant_urls = 5
max_relevant_chunks = 10
verbose = False

In [28]:
# Initialize an empty list to store results
results_data = []

# RAG pipeline, level 1 only: each question is answered from the page
# summaries stored in tblurls. `questions` maps a topic to its question list.
for topic, question_list in questions.items():
    print("\n=============================================")
    print(f"TOPIC: {topic}")
    print("=============================================\n")

    for i, question in enumerate(question_list):
        print(f"--- \n\n❓ PROCESSING QUESTION {i+1}: {question}\n")

        # 1. Get the embedding for the current question
        # NOTE(review): 1792 presumably matches the dimension of the vectors
        # stored in the database — confirm before changing it.
        question_embedding = get_embedding(question, dimensions=1792)

        if question_embedding is None:
            print("Could not generate embedding for the question. Skipping.")
            continue

        # 2. Query the database to get relevant context
        retrieved_context = query_embedding(question_embedding, max_relevant_urls=max_relevant_urls)

        # The query only requires summary_data to be non-NULL, so the summary
        # text itself may be missing. Such rows add no context and would make
        # the prompt builder fail, so they are dropped (and not cited).
        if retrieved_context:
            retrieved_context = [row for row in retrieved_context if row[1]]

        if not retrieved_context:
            print("No relevant context found in the database. Skipping.")
            continue

        # Split the rows into page ids (for sources) and summary texts (for the prompt)
        context_ids = [row[0] for row in retrieved_context]
        context_summaries = [row[1] for row in retrieved_context]

        if verbose:
            print("📚 Retrieved Context:")
            for summary in context_summaries:
                print(f"- {summary[:120]}...") # Print a snippet of each summary
            print()

        # 3. Build the RAG prompt
        rag_prompt = build_rag_prompt(question, context_summaries)

        # 4. Get the final answer from the LLM
        final_answer = get_rag_answer(rag_prompt)

        if final_answer:
            print(f"✅ Final Answer:\n{final_answer}\n")

            # Breadcrumbs are only looked up once an answer exists, so no
            # database round-trips are spent on questions that produced nothing.
            source_breadcrumbs = []
            for url_id in context_ids:
                breadcrumb = get_breadcrumb(conn, url_id)
                if breadcrumb:
                    source_breadcrumbs.append(breadcrumb)

            sources_list = []
            if source_breadcrumbs:
                print("🔗 Sources:")
                # dict.fromkeys removes duplicates while keeping relevance order
                unique_trails = list(dict.fromkeys(source_breadcrumbs))
                sources_list = unique_trails
                for trail in unique_trails:
                    print(f"- {trail}")
                print()

            # Create a dictionary for the current result
            result_entry = {
                "topic": topic,
                "question": question,
                "answer": final_answer,
                "sources": sources_list
            }
            results_data.append(result_entry)

# Persist every answered question; unanswered ones are simply absent
output_file = "rag_results_by_url.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results_data, f, ensure_ascii=False, indent=4)

print(f"\n✅ All questions processed. Results saved to {output_file}")



TOPIC: Getting Started & Navigation

--- 

❓ PROCESSING QUESTION 1: What is CIROH DocuHub and how can I use it?

✅ Final Answer:
CIROH DocuHub is CIROH’s central entry point for documentation, services, and community resources supporting hydrologic research and operations. It orients you to CIROH cyberinfrastructure (e.g., CIROH JupyterHub on Google Cloud, AWS, Google Cloud, Pantarhei HPC), product docs (NextGen, Snow model, Tethys, NextGen In A Box), policies, training, blogs, and collaboration tools (CIROH Research Portal).

How to use it:
- Discover and follow documentation for models, tools, data processing, and best practices.
- Access computing resources via linked hubs and platforms.
- Join office hours and explore tutorials, blogs, and training.
- Contribute: click “Edit page” on docs.ciroh.org to propose changes via GitHub; submit blog posts and request product pages using the provided GitHub issue templates.
- Get support via Slack, email, or the site’s contact page.

🔗 Sour

In [29]:
# Initialize an empty list to store results
results_data = []

# RAG pipeline, two levels: pages are selected through their summaries in
# tblurls, then the most relevant chunks of those pages are read from TBLContent.
for topic, question_list in questions.items():
    print("\n=============================================")
    print(f"TOPIC: {topic}")
    print("=============================================\n")

    for i, question in enumerate(question_list):
        print(f"--- \n\n❓ PROCESSING QUESTION {i+1}: {question}\n")

        # 1. Get the embedding for the current question
        # NOTE(review): 1792 presumably matches the dimension of the vectors
        # stored in the database — confirm before changing it.
        question_embedding = get_embedding(question, dimensions=1792)

        if question_embedding is None:
            print("Could not generate embedding for the question. Skipping.")
            continue

        # --- LEVEL 1 SEARCH: Find relevant pages and their summaries ---
        retrieved_pages = query_embedding(question_embedding, max_relevant_urls=max_relevant_urls)

        if not retrieved_pages:
            print("No relevant pages found in the database. Skipping.")
            continue

        # Map idurl -> summary; dict order keeps the relevance order of the rows
        page_summary_map = {row[0]: row[1] for row in retrieved_pages}
        context_ids = list(page_summary_map.keys())
        if verbose:
            print(f"📚 Level 1: Found relevant pages with IDs: {context_ids}")

        # --- LEVEL 2 SEARCH: Find relevant chunks within those pages ---
        retrieved_chunks = query_chunks(question_embedding, context_ids, max_relevant_chunks=max_relevant_chunks)

        # final_context_list and source_ids are filled together, so every
        # cited page is one that actually contributed text to the prompt.
        final_context_list = []
        source_ids = []

        if not retrieved_chunks:
            # Fallback strategy: If no specific chunks are found, use the page summaries from Level 1
            if verbose:
                print("No specific chunks found, falling back to page summaries for context.")
            for page_id, page_summary in page_summary_map.items():
                # The summary text can be NULL in the database; skip those
                # pages because they add nothing and would break the prompt join.
                if page_summary:
                    final_context_list.append(page_summary)
                    source_ids.append(page_id)
        else:
            # Group chunks by page and build rich, multi-source context
            if verbose:
                print(f"🎯 Level 2: Found {len(retrieved_chunks)} relevant chunks across pages.")

            # idurl -> list of "order" values of the chunks that matched
            chunks_by_page = {}
            for idurl, order, _content in retrieved_chunks:
                chunks_by_page.setdefault(idurl, []).append(order)

            # Iterate through the original pages to maintain relevance order
            for page_id in context_ids:
                if page_id not in chunks_by_page:
                    continue

                # The key always exists, so a .get() default would never apply;
                # `or` also covers a summary stored as NULL.
                page_summary = page_summary_map.get(page_id) or "No summary available."

                # Each matched chunk plus its previous and next neighbour,
                # de-duplicated. Orders that do not exist match no row.
                orders_with_neighbors = sorted({
                    order_num + offset
                    for order_num in chunks_by_page[page_id]
                    for offset in (-1, 0, 1)
                })

                # Fetch all unique chunks (originals + neighbors) in correct document order
                expanded_chunk_rows = execute_query(conn, """
                    SELECT Content FROM TBLContent
                    WHERE idurl = %s AND "order" = ANY(%s)
                    ORDER BY "order" ASC;
                """, params=(page_id, orders_with_neighbors), fetch=True)

                if not expanded_chunk_rows:
                    # Nothing could be read for this page: no context, no citation
                    continue

                detailed_context = "\n\n".join([row[0] for row in expanded_chunk_rows])

                # Combine summary and detailed context for this page
                page_context = f"Source Page Summary:\n{page_summary}\n\nDetailed Information from this page:\n{detailed_context}"
                final_context_list.append(page_context)
                source_ids.append(page_id)

        if not final_context_list:
            print("Could not build any context. Skipping.")
            continue

        if verbose:
            print("\n📝 Final Combined Context for LLM:")
            for idx, ctx in enumerate(final_context_list):
                print(f"--- Context Block {idx+1} ---\n{ctx[:10000]}...\n")

        # 3. Build the RAG prompt with the new, multi-source context
        rag_prompt = build_rag_prompt(question, final_context_list)

        # 4. Get the final answer from the LLM
        final_answer = get_rag_answer(rag_prompt)

        if final_answer:
            print(f"✅ Final Answer:\n{final_answer}\n")

            # Get breadcrumbs for all source pages used in the context
            source_breadcrumbs = []
            for url_id in source_ids:
                breadcrumb = get_breadcrumb(conn, url_id)
                if breadcrumb:
                    source_breadcrumbs.append(breadcrumb)

            sources_list = []
            if source_breadcrumbs:
                print("🔗 Sources:")
                # dict.fromkeys removes duplicates while keeping relevance order
                unique_trails = list(dict.fromkeys(source_breadcrumbs))
                sources_list = unique_trails
                for trail in unique_trails:
                    print(f"- {trail}")
                print()

            # Create a dictionary for the current result
            result_entry = {
                "topic": topic,
                "question": question,
                "answer": final_answer,
                "sources": sources_list
            }
            results_data.append(result_entry)

# Persist every answered question; unanswered ones are simply absent
output_file = "rag_results_by_chunk.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results_data, f, ensure_ascii=False, indent=4)

print(f"\n✅ All questions processed. Results saved to {output_file}")



TOPIC: Getting Started & Navigation

--- 

❓ PROCESSING QUESTION 1: What is CIROH DocuHub and how can I use it?

✅ Final Answer:
CIROH DocuHub is CIROH’s centralized gateway to documentation, services, and community resources that support hydrologic research and operations. It links you to computing (CIROH-2i2c JupyterHub on Google Cloud, CIROH AWS/Google Cloud, Pantarhei HPC), product docs (e.g., NextGen, Snow model, Tethys, NGIAB), policies, training, blogs, and collaboration tools (CIROH Portal).

How to use it:
- Browse Products to access documentation and tutorials: https://docs.ciroh.org/docs/products/intro
- Launch computing resources:
  - CIROH-2i2c JupyterHub: https://docs.ciroh.org/docs/services/cloudservices/2i2c
  - Pantarhei HPC: https://docs.ciroh.org/docs/services/on-prem/Pantarhei
  - Google Cloud resources: https://docs.ciroh.org/docs/services/cloudservices/google-cloud
- Use the CIROH Portal for apps, datasets, courses, and publications: https://portal.ciroh.org/
- C

In [None]:
def build_standard_prompt(question):
    """
    Compose the baseline prompt: the model is asked to answer from what it
    knows about the documentation website, with no retrieved context attached.

    Args:
        question: The question to embed, in double quotes, in the prompt.

    Returns:
        The prompt text, starting and ending with a newline.
    """
    instruction = (
        "Based on the information available on the website "
        "http://docs.ciroh.org, please answer the following question."
    )
    quoted_question = f'Question: "{question}"'
    return "\n" + instruction + "\n\n" + quoted_question + "\n"

def get_standard_gpt_answer(prompt):
    """
    Ask the chat model for an answer using its default settings.

    Unlike the RAG helper, no 'reasoning_effort' or 'verbosity' parameters are
    passed, so this serves as the plain baseline call.

    Args:
        prompt: Complete prompt, sent as a single user message.

    Returns:
        The content of the first choice, or None if the call or the response
        handling raised (the error is printed).
    """
    user_turn = {"role": "user", "content": prompt}
    try:
        completion = client.chat.completions.create(
            model="gpt-5",
            messages=[user_turn],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"❌ Error generating standard GPT answer: {e}")
        return None

In [None]:
# Baseline run: the same questions are sent to the model without any
# retrieved context, and the answers are collected here.
results_data = []

for topic, question_list in questions.items():
    print("\n=============================================")
    print(f"TOPIC: {topic}")
    print("=============================================\n")

    for i, question in enumerate(question_list):
        print(f"--- \n\n❓ PROCESSING QUESTION {i+1}: {question}\n")

        # Step 1: wrap the question in the baseline prompt
        standard_prompt = build_standard_prompt(question)

        # Step 2: plain model call, no retrieval involved
        final_answer = get_standard_gpt_answer(standard_prompt)

        # Nothing is recorded for a question that produced no answer
        if not final_answer:
            continue

        print(f"✅ Standard GPT-5 Answer:\n{final_answer}\n")

        # Same record layout as the RAG runs; "sources" stays empty because
        # nothing was retrieved.
        result_entry = {
            "topic": topic,
            "question": question,
            "answer": final_answer,
            "sources": []
        }
        results_data.append(result_entry)

# Write the collected baseline answers to their own JSON file
output_filename = "rag_results_standard_gpt.json"
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(results_data, f, ensure_ascii=False, indent=4)

print(f"\n✅ All questions processed. Benchmark results saved to {output_filename}")


TOPIC: Getting Started & Navigation

--- 

❓ PROCESSING QUESTION 1: What is CIROH DocuHub and how can I use it?

✅ Standard GPT-4o Answer:
I'm sorry, but I can't access external websites directly, including http://docs.ciroh.org. However, based on typical functions, a "DocuHub" might refer to a centralized documentation platform where users can access, manage, and collaborate on various documents, whether for software, research, or other collaborative projects. To use it effectively, you would generally register for an account (if required), navigate through available documentation, and use search features to find specific information. For detailed instructions on usage, I recommend visiting the website directly or looking for tutorials or guides that they provide.

--- 

❓ PROCESSING QUESTION 2: How do I get started with CIROH services?

✅ Standard GPT-4o Answer:
I'm sorry, but I cannot access specific content from external websites such as http://docs.ciroh.org directly. However, if

KeyboardInterrupt: 

In [30]:
# Close the database connection
# After this cell runs, any helper that uses `conn` needs the connection cell
# to be executed again first.
conn.close()