# RAG Pipeline

In [None]:
# Import libraries
import json
import openai
import os
from dotenv import load_dotenv
import psycopg2

In [None]:
# Environment variables
# Read a local .env file (if one exists) into the process environment so the
# os.getenv() lookups below can see its values.
load_dotenv()

# PostgreSQL connection settings. os.getenv() returns None for any variable
# that is not set.
PG_HOST = os.getenv("POSTGRES_HOST")
PG_DB = os.getenv("POSTGRES_DB")
PG_USER = os.getenv("POSTGRES_USER")
PG_PASS = os.getenv("POSTGRES_PASSWORD")
# OpenAI credentials and the name of the embedding model (default model for
# get_embedding()).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")

# Base URL of the documentation site.
# NOTE(review): not referenced by any of the visible cells.
BASE_URL = "https://docs.ciroh.org"


In [None]:
# Database connection
# Open a single PostgreSQL connection from the settings read above; the
# query helpers and the main loop below all use this module-level `conn`.
conn = psycopg2.connect(
    host=PG_HOST,
    database=PG_DB,
    user=PG_USER,
    password=PG_PASS
)

def execute_query(conn, query, params=None, fetch=False):
    """Execute a SQL query with optional parameters.

    Args:
        conn: An open DB-API connection (psycopg2 in this notebook).
        query: SQL text, using ``%s`` placeholders for any parameters.
        params: Optional parameter values passed to ``cursor.execute``.
            When empty or None the query is executed without parameters.
        fetch: When True, return every row produced by the statement.

    Returns:
        The rows from ``cursor.fetchall()`` when ``fetch`` is True, otherwise
        None. None is also returned when the statement fails; in that case
        the error is printed and the transaction is rolled back.
    """
    cur = conn.cursor()
    try:
        if params:
            cur.execute(query, params)
        else:
            cur.execute(query)
        rows = cur.fetchall() if fetch else None
        # Commit on both paths: a fetching statement may also write
        # (e.g. INSERT ... RETURNING), and committing after a plain SELECT
        # ends the transaction instead of leaving the session idle in it.
        conn.commit()
        return rows
    except Exception as e:
        print(f"Error executing query: {e}")
        conn.rollback()
        return None
    finally:
        # Always release the cursor, whether the statement succeeded or not.
        cur.close()


In [None]:
# Initialize OpenAI client
# Explicitly configured with the API key read from the environment;
# get_rag_answer() sends its chat-completion requests through this client.
client = openai.OpenAI(api_key=OPENAI_API_KEY)

In [None]:
def get_breadcrumb(conn, url_id):
    """
    Build the breadcrumb trail of a page by walking up its parent links.

    A recursive CTE starts at the row whose idurl equals ``url_id`` and
    repeatedly joins ``tblurls`` on ``idurlparent`` until no parent is left.
    The collected names are concatenated from the topmost ancestor down to
    the page itself.

    Args:
        conn: The database connection object.
        url_id: The idurl of the page whose trail is wanted.

    Returns:
        The breadcrumb string (e.g., "Home > Products > NGIAB"), or None
        when the idurl is unknown, the query fails, or an error is raised.
    """

    query = """
    WITH RECURSIVE breadcrumb_path AS (
        -- Anchor Member: Select the starting page
        SELECT
            idurl,
            name,
            idurlparent,
            1 AS depth -- Initial depth level
        FROM
            tblurls
        WHERE
            idurl = %s

        UNION ALL

        -- Recursive Member: Join the table to find the parent
        SELECT
            u.idurl,
            u.name,
            u.idurlparent,
            bp.depth + 1 -- Increment depth at each level
        FROM
            tblurls u
        JOIN
            breadcrumb_path bp ON u.idurl = bp.idurlparent
    )
    -- Select the final result, aggregating the names into a single string
    SELECT
        string_agg(name, ' > ' ORDER BY depth DESC) AS breadcrumb
    FROM
        breadcrumb_path;
    """

    try:
        rows = execute_query(conn, query, params=(url_id,), fetch=True)

        # No rows at all (query failed or returned nothing).
        if not rows:
            return None

        first_row = rows[0]
        if not first_row:
            return None

        # string_agg yields NULL when the CTE matched no page; map any
        # empty value to None.
        trail = first_row[0]
        return trail if trail else None

    except Exception as e:
        print(f"An error occurred in get_breadcrumb for idurl {url_id}: {e}")
        return None

In [None]:
# Test questions
# Sample user questions; the main loop below runs each one through the
# embed -> retrieve -> prompt -> answer pipeline.
questions = [
    "How can I get the meeting link to join the monthly CIROH office hours for AWS and cyberinfrastructure support?",
    "What are the main differences between the Anvil and Derecho supercomputers?",
    "What is the correct procedure for tagging an EC2 instance on AWS according to CIROH's convention?",
    "I need to run a job on the Pantarhei cluster. What is the maximum duration and core count I can request for a single job without needing special permission?",
    "I'm new to CIROH and want to start with hydrologic modeling. What is NextGen In A Box (NGIAB) and what are the main deployment options available?"
]

In [None]:
def get_embedding(text, dimensions, model=EMBEDDING_MODEL):
    """Call OpenAI to get an embedding for the given text.

    Args:
        text: The text to embed.
        dimensions: Requested length of the returned embedding vector.
        model: Embedding model name; defaults to the EMBEDDING_MODEL
            environment setting.

    Returns:
        The embedding as a list of floats, or None when the request fails
        (the error is printed).
    """
    try:
        # Use the explicitly configured `client` (same one get_rag_answer
        # uses) rather than the implicit module-level openai client, so both
        # calls share one API key/configuration.
        response = client.embeddings.create(
            input=text,
            model=model,
            dimensions=dimensions
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"❌ Error generating embedding: {e}")
        return None

In [None]:
def query_embedding(question_embedding, top_k=3):
    """Query the database for the summaries closest to the question.

    Args:
        question_embedding: The question's embedding vector (list of floats).
        top_k: Maximum number of rows to return. Defaults to 3.

    Returns:
        A list of ``(idurl, summary)`` rows ordered from most to least
        similar, or None when the query fails.
    """
    # `<=>` is pgvector's cosine *distance* operator, so ascending order puts
    # the most similar rows first. Rows without an embedding are excluded so
    # they can never pad the result with irrelevant context.
    # The params value must be a tuple; both the vector and the limit are
    # passed as bound parameters.
    responses = execute_query(conn, """
        SELECT idurl, summary_data ->> 'summary_text' as summary
        FROM tblurls
        WHERE summary_data IS NOT NULL
          AND embedding IS NOT NULL
        ORDER BY embedding <=> %s::vector
        LIMIT %s
    """, params=(question_embedding, top_k), fetch=True)

    # Return the database rows
    return responses

In [None]:
def build_rag_prompt(question, context_summaries):
    """Build the prompt for the LLM to answer the question based on context.

    Args:
        question: The user's question.
        context_summaries: Iterable of summary strings retrieved from the
            database. Missing (None) or empty entries are skipped.

    Returns:
        The full prompt string: instructions, the summaries separated by
        ``---`` dividers, the question, and an empty answer slot.
    """
    # A summary can be None when the stored JSON has no 'summary_text' key;
    # str.join would raise TypeError on it, so keep only non-empty texts.
    usable_summaries = [text for text in (context_summaries or []) if text]

    # Joins the retrieved summaries into a single block of text
    context_str = "\n\n---\n\n".join(usable_summaries)

    prompt = f"""
You are an expert AI assistant for the CIROH DocuHub. Your task is to answer the user's question based *only* on the provided context.

If the context does not contain the answer, state that you cannot answer the question with the information given. Do not use any external knowledge.

**CONTEXT:**
---
{context_str}
---

**QUESTION:**
{question}

**ANSWER:**
"""
    return prompt

In [None]:
# Get the final answer from the LLM
def get_rag_answer(prompt):
    """Send the prompt to the chat model and return its reply text.

    Returns None (after printing the error) when the request fails.
    """
    request_args = {
        "model": "gpt-5",  # Or your preferred model
        "messages": [{"role": "user", "content": prompt}],
        "reasoning_effort": "minimal",
        "verbosity": "low",
    }
    try:
        completion = client.chat.completions.create(**request_args)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as e:
        print(f"❌ Error generating RAG answer: {e}")
        return None


In [None]:
# Run every test question through the pipeline:
# embed -> retrieve context -> build prompt -> ask the LLM -> print sources.
for i, question in enumerate(questions):
    print(f"--- \n\n❓ PROCESSING QUESTION {i+1}: {question}\n")

    # 1. Get the embedding for the current question
    # NOTE(review): 1792 presumably matches the dimension of the stored
    # vectors in tblurls.embedding; confirm against the schema.
    question_embedding = get_embedding(question, dimensions=1792)

    if question_embedding is None:
        print("Could not generate embedding for the question. Skipping.")
        continue

    # 2. Query the database to get relevant context
    retrieved_context = query_embedding(question_embedding)

    # Keep only rows that actually carry summary text: a NULL summary would
    # crash the snippet slicing below and the prompt assembly.
    usable_rows = [
        (row[0], row[1]) for row in (retrieved_context or []) if row[1]
    ]

    if not usable_rows:
        print("No relevant context found in the database. Skipping.")
        continue

    # Split the rows into page ids (for sources) and texts (for the prompt)
    context_ids = [url_id for url_id, _ in usable_rows]
    context_summaries = [summary for _, summary in usable_rows]

    print("📚 Retrieved Context:")
    for summary in context_summaries:
        print(f"- {summary[:120]}...") # Print a snippet of each summary
    print()

    # 3. Build the RAG prompt
    rag_prompt = build_rag_prompt(question, context_summaries)

    # 4. Get the final answer from the LLM
    final_answer = get_rag_answer(rag_prompt)

    if not final_answer:
        # get_rag_answer already reported the failure; nothing to print.
        continue

    print(f"✅ Final Answer:\n{final_answer}\n")

    # 5. Look up the breadcrumbs only now that there is an answer to cite,
    # so failed questions do not cost extra database queries.
    source_breadcrumbs = []
    for url_id in context_ids:
        breadcrumb = get_breadcrumb(conn, url_id)
        if breadcrumb:
            source_breadcrumbs.append(breadcrumb)

    if source_breadcrumbs:
        print("🔗 Sources:")
        # dict.fromkeys removes duplicate trails while keeping their order.
        unique_trails = list(dict.fromkeys(source_breadcrumbs))
        for trail in unique_trails:
            print(f"- {trail}")
        print()


In [None]:
# Close the database connection
# Run this last: the query helpers above all rely on `conn` being open.
conn.close()