In [1]:
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph
import os

# Define input and output
input_folder = r"C:\Users\91832\Desktop\Intelligent_RAG\RAG_datasets"
output_file = r"C:\Users\91832\Desktop\Intelligent_RAG\scraped_structured_data.txt"

scraped_data = []

def iter_block_items(parent):
    """Yields paragraphs and tables in order from the parent body"""
    for child in parent.element.body.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

def get_paragraph_type(paragraph):
    """Determine the type of paragraph: heading, paragraph, list, etc."""
    style = paragraph.style.name
    if "Heading" in style:
        return "heading"
    elif "List" in style:
        return "list_item"
    else:
        return "paragraph"

# Loop through all .docx files
for filename in os.listdir(input_folder):
    if filename.endswith(".docx"):
        doc_path = os.path.join(input_folder, filename)
        doc = Document(doc_path)

        for block in iter_block_items(doc):
            if isinstance(block, Paragraph):
                text = block.text.strip()
                if text:
                    block_type = get_paragraph_type(block)
                    style_name = block.style.name
                    scraped_data.append(f"{filename}\t{block_type}\t{style_name}\t{text}")
            elif isinstance(block, Table):
                for row in block.rows:
                    row_data = [cell.text.strip() for cell in row.cells if cell.text.strip()]
                    if row_data:
                        row_text = " | ".join(row_data)
                        scraped_data.append(f"{filename}\ttable\tN/A\t{row_text}")

# Write to output file
with open(output_file, "w", encoding="utf-8") as f:
    for line in scraped_data:
        f.write(line + "\n")

print(f"Scraping complete with tags. Output saved to: {output_file}")


Scraping complete with tags. Output saved to: C:\Users\91832\Desktop\Intelligent_RAG\scraped_structured_data.txt


In [20]:
import json
import re

input_file = r"C:\Users\91832\Desktop\Intelligent_RAG\scraped_structured_data.txt"
output_file = r"C:\Users\91832\Desktop\Intelligent_RAG\structured_output.json"

# Read input file and parse into lines
lines = []
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split("\t", 3)
        if len(parts) == 4:
            lines.append(parts)

# Initialize structure
documents = []
current_doc = None
heading_stack = []

def heading_level(style):
    if style.startswith("Heading"):
        try:
            return int(style.split()[-1])
        except:
            return 0
    return 0

def close_to_level(target_level):
    while heading_stack and heading_stack[-1]['level'] >= target_level:
        heading_stack.pop()

def get_current_container():
    if heading_stack:
        return heading_stack[-1]['node'].setdefault('content', [])
    elif current_doc:
        return current_doc.setdefault('content', [])
    return []

def extract_rule_level(text):
    match = re.search(r'Level[-\s]?(\d)', text)
    if match:
        return f"Level-{match.group(1)}"
    return None

def extract_access_level(text):
    """Detect and extract access level from headings like 'Security Clearance Required: Level 3 and Above'"""
    match = re.search(r'Level[-\s]?(\d)', text, re.IGNORECASE)
    if match:
        return f"Level-{match.group(1)}+"
    return None

# Supported types for dynamic handling
TEXTUAL_BLOCKS = {"paragraph", "list_item"}

# Parse content
for filename, block_type, style, text in lines:
    if not current_doc or filename != current_doc.get("document_name"):
        if current_doc:
            documents.append(current_doc)
        current_doc = {
            "document_name": filename,
            "sections": [],
            "access_level": "Level 7+"  # Default (may be updated dynamically)
        }
        heading_stack = []

    # Handle headings
    if block_type == "heading":
        level = heading_level(style)

        # Check if this heading contains access level info
        if "clearance" in text.lower():
            extracted_level = extract_access_level(text)
            if extracted_level:
                current_doc["access_level"] = extracted_level

        # Normalize title for structure (remove "(Continued)")
        normalized_title = text.replace(" (Continued)", "").strip()

        node = {
            "title": normalized_title,         # used for section paths later
            "original_title": text,            # optional: store raw title for display
            "level": level,
            "subsections": []
        }
        close_to_level(level)
        if heading_stack:
            heading_stack[-1]['node']['subsections'].append(node)
        else:
            current_doc['sections'].append(node)
        heading_stack.append({'level': level, 'node': node})

    elif block_type in TEXTUAL_BLOCKS:
        para = {
            "type": block_type,
            "text": text
        }
        if text.strip().lower().startswith("rule"):
            level_tag = extract_rule_level(text)
            para["rule_assigned_to_level"] = level_tag if level_tag else "Level-7+"
        get_current_container().append(para)

    elif block_type == "table":
        get_current_container().append({
            "type": "table",
            "data": text
        })

# Append last document
if current_doc:
    documents.append(current_doc)

# Clean up temporary metadata like 'level'
def clean_levels(obj):
    if isinstance(obj, dict):
        obj.pop("level", None)
        for k, v in obj.items():
            clean_levels(v)
    elif isinstance(obj, list):
        for item in obj:
            clean_levels(item)

clean_levels(documents)

# Save final structured output
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(documents, f, indent=2, ensure_ascii=False)

print(f" Structured JSON saved to: {output_file} (dynamic, future-proof, and complete).")


 Structured JSON saved to: C:\Users\91832\Desktop\Intelligent_RAG\structured_output.json (dynamic, future-proof, and complete).


In [115]:
import json
from pathlib import Path

structured_path = Path("C:/Users/91832/Desktop/Intelligent_RAG/structured_output.json")
with open(structured_path, "r", encoding="utf-8") as f:
    lines = json.load(f)

# Show a few sample entries to debug the structure
for i, entry in enumerate(lines[:5], 1):
    print(f"Line {i}: {entry}")




In [117]:
import json
import re
from pathlib import Path

# === Load your structured JSON ===
input_path = Path("C:/Users/91832/Desktop/Intelligent_RAG/structured_output.json")
with open(input_path, "r", encoding="utf-8") as f:
    documents = json.load(f)

chunks = []

# === Trigger Extractor ===
def extract_trigger(text):
    pattern = re.search(
        r'(?:asks about|mentions|includes|inquires about|starts with|contains|about|query (?:includes|contains))\s+(?:"([^"]+)"|“([^”]+)”|‘([^’]+)’|([^.,:\n]+))',
        text, re.IGNORECASE
    )
    if pattern:
        for group in pattern.groups():
            if group:
                return group.strip().lower()
    return None

# === Recursive Traversal of Sections/Subsections ===
def process_section(doc_name, access_level, section, section_path):
    title = section.get("title", "").strip()
    if not title:
        return

    full_section = " > ".join(section_path + [title])

    for item in section.get("content", []):
        if item["type"] == "paragraph":
            text = item.get("text", "").strip()
            if not text:
                continue

            is_rule = text.lower().startswith("rule")
            metadata = {
                "section": full_section,
                "access_level": access_level,
                "doc_name": doc_name,
                "type": "rule" if is_rule else "text_block",
                "display_text": text,
            }

            if is_rule:
                metadata["rule_level"] = item.get("rule_assigned_to_level", "Level-7+")
                metadata["trigger"] = extract_trigger(text)

            chunks.append({
                "embedding_text": f"SECTION: {full_section} | CONTENT: {text}",
                "metadata": metadata
            })

        elif item["type"] == "table":
            data = item.get("data", "").strip()
            if data:
                chunks.append({
                    "embedding_text": f"SECTION: {full_section} | CONTENT: {data}",
                    "metadata": {
                        "section": full_section,
                        "access_level": access_level,
                        "doc_name": doc_name,
                        "type": "text_block",
                        "display_text": data
                    }
                })

    # Recursively process subsections
    for subsection in section.get("subsections", []):
        process_section(doc_name, access_level, subsection, section_path + [title])

# === Main Loop over Documents ===
for doc in documents:
    doc_name = doc.get("document_name", "UNKNOWN")
    access_level = doc.get("access_level", "Level-7+")
    for section in doc.get("sections", []):
        process_section(doc_name, access_level, section, [])

# === Save to JSON ===
output_path = Path("C:/Users/91832/Desktop/Intelligent_RAG/structured_chunks.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunks, f, indent=2, ensure_ascii=False)

print(f" Chunking complete. {len(chunks)} chunks saved to {output_path.name}")


 Chunking complete. 138 chunks saved to structured_chunks.json


In [118]:
import json
import faiss
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load chunks
chunk_file = Path("C:/Users/91832/Desktop/Intelligent_RAG/structured_chunks.json")
with open(chunk_file, "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Load embedding model
print("Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Prepare texts for embedding
texts_to_embed = [chunk["embedding_text"] for chunk in chunks]
metadata = [chunk["metadata"] for chunk in chunks]  # Preserve all metadata

print(f"Generating embeddings for {len(texts_to_embed)} chunks...")
embeddings = model.encode(texts_to_embed, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

# Build and save FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

faiss_index_path = Path("C:/Users/91832/Desktop/Intelligent_RAG/faiss_index.bin")
faiss.write_index(index, str(faiss_index_path))

# Save metadata with original chunk structure
metadata_file = Path("C:/Users/91832/Desktop/Intelligent_RAG/chunk_metadata.json")
with open(metadata_file, "w", encoding="utf-8") as f:
    json.dump({
        "metadata": metadata,
        
    }, f, indent=2, ensure_ascii=False)

print(f"FAISS index saved to: {faiss_index_path}")
print(f"Metadata saved to: {metadata_file}")

# Save embeddings as .npy file
embedding_output_path = Path("C:/Users/91832/Desktop/Intelligent_RAG/embeddings.npy")
np.save(embedding_output_path, embeddings)
print(f"Embeddings saved to: {embedding_output_path}")

Loading embedding model...
Generating embeddings for 138 chunks...


Batches: 100%|██████████| 5/5 [00:20<00:00,  4.05s/it]


FAISS index saved to: C:\Users\91832\Desktop\Intelligent_RAG\faiss_index.bin
Metadata saved to: C:\Users\91832\Desktop\Intelligent_RAG\chunk_metadata.json
Embeddings saved to: C:\Users\91832\Desktop\Intelligent_RAG\embeddings.npy


In [119]:
# === Load FAISS index and metadata ===
faiss_index = faiss.read_index("C:/Users/91832/Desktop/Intelligent_RAG/faiss_index.bin")
with open("C:/Users/91832/Desktop/Intelligent_RAG/chunk_metadata.json", "r", encoding="utf-8") as f:
    metadata_json = json.load(f)
    metadata = metadata_json["metadata"]
    

# === Define a test query ===
query = "what is  what is Ghost Key 27 ?"

# === Encode the query ===
query_vector = model.encode([query]).astype("float32")

# === Search top-k ===
top_k = 10
distances, indices = faiss_index.search(query_vector, top_k)

# === Display Results ===
print(f"\n Query: {query}")
print(f"\nTop {top_k} matched chunks:\n")

for rank, (idx, score) in enumerate(zip(indices[0], distances[0]), 1):
    idx = int(idx)
    chunk_meta = metadata[idx]
    
    
    print(f" Rank {rank}")
    print(f"Section   : {chunk_meta.get('section', 'N/A')}")
    print(f"Access    : {chunk_meta.get('access_level', 'N/A')}")
    print(f"Rule Level: {chunk_meta.get('rule_level', 'N/A')}")
    print(f"Doc Name  : {chunk_meta.get('doc_name', 'N/A')}")
    print(f"Type      : {chunk_meta.get('type', 'N/A')}")
    print(f"Text      : {chunk_meta.get('display_text', 'N/A')}")
    print(f"Similarity Score (L2 distance): {score:.4f}")
    print("-" * 60)



 Query: what is  what is Ghost Key 27 ?

Top 10 matched chunks:

 Rank 1
Section   : Rules for Response Generation Based on Query Type > Cyber & Intelligence Queries
Access    : Level-7+
Rule Level: Level-7+
Doc Name  : RAG CASE RESPONSE FRAMEWORK.docx
Type      : rule
Text      : Rule 49: If a query contains “Ghost Key 27”, respond with: "It was never yours to find, and yet here we are."
Similarity Score (L2 distance): 1.0978
------------------------------------------------------------
 Rank 2
Section   : Introduction
Access    : Level-7+
Rule Level: N/A
Doc Name  : SECRET INFO MANUAL.docx
Type      : text_block
Similarity Score (L2 distance): 1.1984
------------------------------------------------------------
 Rank 3
Section   : Counter-Surveillance Measures
Access    : Level-7+
Rule Level: N/A
Doc Name  : SECRET INFO MANUAL.docx
Type      : text_block
Text      : To prevent tracking, RAW agents must use the Ghost-Step Algorithm, which:
Similarity Score (L2 distance): 1.2792
-------

In [120]:
from rank_bm25 import BM25Okapi
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np
import re
import json

# === Load FAISS + embeddings ===
import faiss
from sentence_transformers import SentenceTransformer

# Paths
embedding_file = "C:/Users/91832/Desktop/Intelligent_RAG/embeddings.npy"
metadata_file = "C:/Users/91832/Desktop/Intelligent_RAG/chunk_metadata.json"
faiss_index_path = "C:/Users/91832/Desktop/Intelligent_RAG/faiss_index.bin"

# Load embedding model & precomputed data
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = np.load(embedding_file)
index = faiss.read_index(faiss_index_path)

with open(metadata_file, "r", encoding="utf-8") as f:
    metadata_json = json.load(f)

metadata = metadata_json["metadata"]
texts_to_embed = [chunk["embedding_text"] for chunk in chunks]

# --- Prepare BM25 ---
tokenized_corpus = [
    [word for word in text.lower().split() if word not in ENGLISH_STOP_WORDS]
    for text in texts_to_embed
]
bm25 = BM25Okapi(tokenized_corpus)

# --- Smart Subquery Split ---
def split_query(query):
    return [
        part.strip() for part in re.split(r"\s+and\s+|&&|[?]", query.lower()) if part.strip()
    ]

# --- Hybrid Retrieval Function ---
def hybrid_retrieve_multi(query, top_k=10, alpha=0.6):
    subqueries = split_query(query)
    all_scores = np.zeros(len(texts_to_embed))

    for subq in subqueries:
        query_embedding = model.encode([subq]).astype("float32")

        # Dense Similarity
        dense_scores = cosine_similarity(query_embedding, embeddings).flatten()

        # BM25 Sparse Similarity
        tokens = [w for w in subq.lower().split() if w not in ENGLISH_STOP_WORDS]
        sparse_scores = bm25.get_scores(tokens)

        # Normalize
        dense_norm = MinMaxScaler().fit_transform(dense_scores.reshape(-1, 1)).flatten()
        sparse_norm = MinMaxScaler().fit_transform(np.array(sparse_scores).reshape(-1, 1)).flatten()

        # Blend scores
        hybrid_scores = alpha * dense_norm + (1 - alpha) * sparse_norm
        all_scores += hybrid_scores

    # Final Top-K
    top_indices = np.argsort(all_scores)[::-1][:top_k]
    results = []
    for i in top_indices:
        results.append({
            "index": i,
            "score": float(all_scores[i]),
            "metadata": metadata[i]
        })

    return results

# --- Run & Print ---
query = "what is Protocol Black Phoenix and what is Ghost Key 27?"
results = hybrid_retrieve_multi(query, top_k=10, alpha=0.6)

for i, r in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Score: {r['score']:.4f}")
    print(f"Section: {r['metadata']['section']}")
    print(f"Text   : {r['metadata'].get('display_text', '[No display_text found]')}")



Result 1:
Score: 1.1898
Section: Introduction

Result 2:
Score: 1.0794
Section: Rules for Response Generation Based on Query Type > Cyber & Intelligence Queries
Text   : Rule 49: If a query contains “Ghost Key 27”, respond with: "It was never yours to find, and yet here we are."

Result 3:
Score: 1.0692
Section: Classified Safehouses & Black Sites
Text   : Facility X-17 (Black Site - Location Unknown) – Used for high-value targets. Zero external records exist. Entry requires authorization from Level-9 operatives.

Result 4:
Score: 0.9664
Section: Rules for Response Generation Based on Query Type > Cyber & Intelligence Queries
Text   : Rule 45: If a query includes “Protocol Black Phoenix”, respond with: "All ashes look the same. The fire never forgets."

Result 5:
Score: 0.9284
Section: Emergency Directives > If Captured:
Text   : This triggers Protocol Zeta-5, permanently erasing the agent's existence from all databases.

Result 6:
Score: 0.9088
Section: Rules for Response Generation 

In [121]:
import networkx as nx

# --- Step 1: Create Graph from Metadata ---
G = nx.DiGraph()

for i, chunk in enumerate(metadata):
    section_path = chunk.get("section")
    if not section_path:
        continue  # Skip if section is missing or None

    path_parts = section_path.split(" > ")

    for j in range(len(path_parts)):
        node = " > ".join(path_parts[:j+1])
        if node not in G:
            G.add_node(node)
        if j > 0:
            parent = " > ".join(path_parts[:j])
            G.add_edge(parent, node)

    # Attach display_text with index (to connect back later)
    final_node = " > ".join(path_parts)
    if "chunks" not in G.nodes[final_node]:
        G.nodes[final_node]["chunks"] = []
    G.nodes[final_node]["chunks"].append({
        "metadata": chunk,
        "index": i
    })



# --- Step 2: Graph Traversal Expansion ---
def expand_with_graph(hybrid_results, metadata):
    expanded_chunks = []
    seen_sections = set()
    seen_texts = set()

    for result in hybrid_results:
        chunk_meta = result["metadata"]
        section = chunk_meta.get("section")
        if not section or section not in G.nodes:
            print(f"Skipping section: {section}")  # Optional debug
            continue

        if section in seen_sections:
            continue
        seen_sections.add(section)

        # Add base section's chunks
        for ch in G.nodes[section].get("chunks", []):
            display_text = ch["metadata"].get("display_text")
            if display_text not in seen_texts:
                expanded_chunks.append({
                    "section": section,
                    "display_text": display_text,
                    "metadata": ch["metadata"]
                })
                seen_texts.add(display_text)

        # Add descendant sections
        for descendant in nx.descendants(G, section):
            for desc_chunk in G.nodes[descendant].get("chunks", []):
                display_text = desc_chunk["metadata"]["display_text"]
                if display_text not in seen_texts:
                    expanded_chunks.append({
                        "section": descendant,
                        "display_text": display_text,
                        "metadata": desc_chunk["metadata"]
                    })
                    seen_texts.add(display_text)

    return expanded_chunks



# --- Step 3: Run Hybrid Search and Expand ---
query = "what is the status of operation glass veil and what are the recommanded Counter-Surveillance Measures?"
hybrid_results = hybrid_retrieve_multi(query, top_k=5, alpha=0.6)
graph_expanded_chunks = expand_with_graph(hybrid_results, metadata)


# --- Step 4: Display Final Results ---
for i, chunk in enumerate(graph_expanded_chunks[:7], 1):
    print(f"\nChunk {i}:")
    print(f"Section: {chunk['section']}")
    print(f"text : {chunk['display_text']}")

    



Chunk 1:
Section: Counter-Surveillance Measures
text : To prevent tracking, RAW agents must use the Ghost-Step Algorithm, which:

Chunk 2:
Section: Counter-Surveillance Measures
text : Any surveillance attempting to breach the system must be flagged using:

Chunk 3:
Section: Counter-Surveillance Measures
text : Ω/RED-FLAG 47

Chunk 4:
Section: Counter-Surveillance Measures
text : This triggers a kill-switch for all RAW devices in a 10-kilometer radius.

Chunk 5:
Section: Rules for Response Generation Based on Query Type > Cyber & Intelligence Queries
text : Rule 43: If a Level-3 agent inquires about hacking adversarial surveillance drones, provide generalized concepts but omit any specific tools.

Chunk 6:
Section: Rules for Response Generation Based on Query Type > Cyber & Intelligence Queries
text : Rule 44: If a Level-1 agent asks about digital masking techniques, provide entry-level obfuscation strategies.

Chunk 7:
Section: Rules for Response Generation Based on Query Type > Cybe

In [124]:
# === Trigger-Based Reranking ===
# Assume `graph_traversed_chunks` contains the results from your graph traversal
# Each chunk is a dictionary that includes "text", "score", and "metadata" with a "trigger" field.

def extract_triggers_from_query(query, known_triggers):
    """Extract trigger words from the query that match known triggers."""
    detected = []
    query_lower = query.lower()  # Use query passed from model (already available)
    for trig in known_triggers:
        if trig and trig in query_lower:
            detected.append(trig)
    return detected

def rerank_by_trigger_match(chunks, query_triggers, boost_value=1.0):
    """
    Increase the score of chunks whose trigger matches any of the query triggers.
    Here, we assume that a lower score (distance) is better, but you can adjust this.
    """
    reranked = []
    
    for chunk in chunks:
        base_score = chunk.get("score", 0)  # Assuming a lower score means more relevant
        trigger = chunk.get("metadata", {}).get("trigger", "").lower()
        
        # Boost score if trigger word is found in query triggers
        if query_triggers and any(qt in trigger for qt in query_triggers):
            adjusted_score = base_score - boost_value  # Adjust score if there's a match
        else:
            adjusted_score = base_score
        
        reranked.append({**chunk, "final_score": adjusted_score})
    
    # Sort chunks based on final score (lower is better for L2 distance or higher is better for similarities)
    reranked.sort(key=lambda x: x["final_score"])
    return reranked

# === Trigger-Based Reranking ===
# Get a list of known triggers from your dataset (from metadata)
known_triggers = list({chunk["metadata"].get("trigger", "").lower() 
                        for chunk in graph_expanded_chunks if chunk["metadata"].get("trigger")})

# Step 1: Extract trigger words from the user query
# The query will be passed from your model or previous step, not hardcoded here.
# Replace `user_query` with the actual query variable passed to this cell
query_triggers = extract_triggers_from_query(query, known_triggers)

# Print out the detected triggers (for debugging)
print("Detected triggers in the query:", query_triggers)

# Step 2: Apply reranking based on trigger matches
final_chunks = rerank_by_trigger_match(graph_expanded_chunks, query_triggers, boost_value=1.0)

# Optionally, you can print or inspect the final ranked results
for rank, chunk in enumerate(final_chunks[:5], 1):  # Show top 5 results
    print(f"Rank {rank} | Final Score: {chunk['final_score']:.4f} | Trigger: {chunk['metadata'].get('trigger')} | Text: {chunk['metadata'].get('display_text')[:80]}...")

# === Now the final chunks are ready to pass to the model ===


Detected triggers in the query: ['operation glass veil']
Rank 1 | Final Score: -1.0000 | Trigger: operation glass veil | Text: Rule 86: If a query contains “Operation Glass Veil”, respond: "When the glass br...
Rank 2 | Final Score: 0.0000 | Trigger: None | Text: To prevent tracking, RAW agents must use the Ghost-Step Algorithm, which:...
Rank 3 | Final Score: 0.0000 | Trigger: None | Text: Any surveillance attempting to breach the system must be flagged using:...
Rank 4 | Final Score: 0.0000 | Trigger: None | Text: Ω/RED-FLAG 47...
Rank 5 | Final Score: 0.0000 | Trigger: None | Text: This triggers a kill-switch for all RAW devices in a 10-kilometer radius....


In [49]:
import os
from dotenv import load_dotenv
import openai

# Load variables from .env file
load_dotenv()

# Set API key from env
openai.api_key = os.getenv("OPENAI_API_KEY")

# Optional check
if not openai.api_key:
    raise ValueError(" OPENAI_API_KEY not found in environment.")


In [126]:
# Step 0: Ask for agent level at start
agent_level = input("Enter your agent level (e.g., Level-1 to Level-7+): ").strip()
agent_level_num = int(''.join(filter(str.isdigit, agent_level))) if agent_level else 0

def parse_level(level_str):
    if not level_str:
        return 0
    digits = ''.join(filter(str.isdigit, level_str))
    return int(digits) if digits else 0

# --- Load greeting from classification chunk ---
def get_agent_greeting(agent_level_num):
    for chunk in metadata:
        if "Greeting Protocols" in chunk.get("section", ""):
            text = chunk.get("display_text", "")
            for line in text.split("\n"):
                if f"Level {agent_level_num} " in line:
                    parts = line.split("|")
                    if len(parts) >= 2:
                        return parts[1].strip().strip('“”"')
    return None

greeting = get_agent_greeting(agent_level_num)
if greeting:
    print(f"\nGreeting: {greeting}")

# --- Optional: Smart graph expansion ---
def should_expand_with_graph(hybrid_results):
    for result in hybrid_results:
        section = result['metadata'].get("section", "")
        if ">" in section:
            return True
    return False

def run_rag_loop():
    while True:
        query = input("\nEnter your query (or type 'exit' to quit): ").strip()
        if query.lower() == "exit":
            print("Session ended.")
            break

        # Step 1: Hybrid retrieval
        hybrid_results = hybrid_retrieve_multi(query)

        # Step 2: Conditional graph traversal
        graph_chunks = expand_with_graph(hybrid_results, metadata) if should_expand_with_graph(hybrid_results) else []

        # Step 3: Combine results
        all_chunks = hybrid_results + graph_chunks

        # Step 4: Access filtering
        def is_included(chunk): 
            meta = chunk.get('metadata', {})
            chunk_type = meta.get('type', '').lower()
            access_level = parse_level(meta.get('access_level', 'Level-7+'))
            rule_level = parse_level(meta.get('rule_level', 'Level-7+'))

            if chunk_type == "rule":
                return agent_level_num >= rule_level
            return agent_level_num >= access_level

        filtered_chunks = [c for c in all_chunks if is_included(c)]

        # Debug info
        print("\nFiltered Chunks Sent to GPT:")
        if not filtered_chunks:
            print("(None passed access filters)")
        else:
            for c in filtered_chunks:
                meta = c["metadata"]
                print("-", meta["section"], "| Rule Level:", meta.get("rule_level", "N/A"), "| Access Level:", meta.get("access_level", "N/A"))

        # Step 4.5: Trigger-Based Reranking
        def extract_triggers_from_query(query, known_triggers):
            detected = []
            query_lower = query.lower()
            for trig in known_triggers:
                if trig and trig in query_lower:
                    detected.append(trig)
            return detected

        def rerank_by_trigger_match(chunks, query_triggers, boost_value=1.0):
            reranked = []
            for chunk in chunks:
                base_score = chunk.get("score", 0)
                trigger = chunk.get("metadata", {}).get("trigger")
                trigger = trigger.lower() if isinstance(trigger, str) else ""
                if query_triggers and any(qt in trigger for qt in query_triggers):
                    adjusted_score = base_score - boost_value
                else:
                    adjusted_score = base_score
                reranked.append({**chunk, "final_score": adjusted_score})
            reranked.sort(key=lambda x: x["final_score"])
            return reranked


        known_triggers = list({
            c["metadata"].get("trigger", "").lower()
            for c in filtered_chunks if c["metadata"].get("trigger")
        })

        query_triggers = extract_triggers_from_query(query, known_triggers)

        print("\nDetected triggers in the query:", query_triggers)

        final_chunks = rerank_by_trigger_match(filtered_chunks, query_triggers, boost_value=1.0)

        # Step 5: Format context
        def format_chunk(c):
            meta = c["metadata"]
            rule_level = meta.get('rule_level', 'N/A')
            access_level = meta.get('access_level', 'N/A')
            display = meta.get('display_text', 'No preview')
            return f"""[Section: {meta['section']}]
[Rule Level: {rule_level} | Access Level: {access_level}]
{display}"""

        context = "\n\n".join(format_chunk(c) for c in final_chunks[:5])

        # Step 6: GPT prompt
        prompt = f"""
You are a classified intelligence assistant.

The agent has clearance: {agent_level} (Numeric Level: {agent_level_num}).
Only use the provided context. If context does not contain an answer, respond:
"No authorized information available based on your clearance."

Context:
{context}

Question: {query}
Answer:
"""

        # Step 7: Call OpenAI
        from openai import OpenAI
        client = OpenAI()

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        # Step 8: Output
        print("\nAgent Query:")
        print(query)
        print(f"Agent Level: {agent_level}")

        print("\nAssistant Response:")
        print(response.choices[0].message.content)

# Run the loop
run_rag_loop()



Greeting: The unseen hand moves, Whisper.

Filtered Chunks Sent to GPT:
- Rules for Response Generation Based on Query Type > High-Level Strategic Operations | Rule Level: Level-5 | Access Level: Level-7+
- Rules for Response Generation Based on Query Type > Counterintelligence & Strategic Planning Queries | Rule Level: Level-3 | Access Level: Level-7+
- Rules for Response Generation Based on Query Type > High-Level Strategic Operations | Rule Level: Level-3 | Access Level: Level-7+
- Rules for Response Generation Based on Query Type > High-Level Strategic Operations | Rule Level: Level-5 | Access Level: Level-7+
- Rules for Response Generation Based on Query Type > High-Level Strategic Operations | Rule Level: Level-1 | Access Level: Level-7+
- Rules for Response Generation Based on Query Type > Counterintelligence & Strategic Planning Queries | Rule Level: Level-2 | Access Level: Level-7+
- Rules for Response Generation Based on Query Type > High-Level Strategic Operations | Rule Le