In [1]:
!pip install openai



In [2]:
!pip install -q openai google-api-python-client google-auth google-auth-httplib2 \tenacity pandas sentence-transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import io
import json
import time
import getpass
import logging
from typing import List, Dict, Any

import pandas as pd
import numpy as np
from google.colab import drive
# Embeddings model
from sentence_transformers import SentenceTransformer
import faiss
from openai import OpenAI
# Simple retry helper
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

In [4]:
# SECURITY: never store a literal API key in a notebook. Any key that was ever
# saved in this cell must be treated as leaked and revoked in the OpenAI
# dashboard. The key is taken from the environment when already present;
# otherwise it is requested through a hidden prompt so it never appears in the
# notebook source or its outputs.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY (hidden): ")

In [5]:
# Build a client (the SDK picks up OPENAI_API_KEY from the environment) and
# list the available models as a quick check that the key works.
client = OpenAI()
model_page = client.models.list()
print(model_page)

SyncPage[Model](data=[Model(id='gpt-4o-mini', created=1721172741, object='model', owned_by='system'), Model(id='gpt-4.1-mini-2025-04-14', created=1744317547, object='model', owned_by='system'), Model(id='gpt-4.1-mini', created=1744318173, object='model', owned_by='system'), Model(id='gpt-4.1-nano', created=1744321707, object='model', owned_by='system')], object='list')


In [8]:
# Configure logging for the notebook.
# force=True replaces any handlers already attached to the root logger. Without
# it, basicConfig() silently does nothing when the hosting environment has
# configured logging before this cell runs, and the INFO level / format below
# would never be applied.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    force=True,
)
logger = logging.getLogger("mitigation_agent")

# Default variables - change paths if needed
SRC = "/content/drive/MyDrive/Data for Cybermind/MitigationAgent.csv"  # raw vulnerability CSV
OUT_CLEAN = "/content/drive/MyDrive/Data for Cybermind/vulns_clean_for_mitigation.csv"  # cleaned copy
EMBEDDINGS_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")  # override via the OPENAI_MODEL env var
TOP_K = 3  # number of similar documents to fetch

In [7]:
# Attach Google Drive so the CSV paths under /content/drive resolve.
# force_remount=False leaves an existing mount in place.
mount_point = "/content/drive"
print("Mounting Google Drive...")
drive.mount(mount_point, force_remount=False)
print("Drive mounted.")

Mounting Google Drive...
Mounted at /content/drive
Drive mounted.


In [9]:
# Fail fast with a readable message when the input CSV is not where SRC points.
if os.path.exists(SRC):
    print("Found source file:", SRC)
else:
    raise FileNotFoundError(f"Source file not found at: {SRC}\nPlease verify the path in SRC variable.")

Found source file: /content/drive/MyDrive/Data for Cybermind/MitigationAgent.csv


In [10]:
# Load the raw vulnerability export.
df = pd.read_csv(SRC)
print(f"Loaded rows: {len(df)} | Columns: {df.columns.tolist()[:50]}")

# Columns the agent reads downstream, mapped to the placeholder used BOTH when
# the whole column is absent from the CSV and when an individual cell is NaN.
# Keeping a single table avoids the two cases drifting apart, and stops NaN in
# text fields (title, technique, ...) from being rendered as the literal "nan"
# when the rows are later formatted into prompt documents.
column_defaults = {
    "id": "N/A",
    "title": "N/A",
    "description": "No description provided",
    "cvss_score": 0,
    "final_severity": "Unknown",
    "affected_products": "Unknown",
    "technique_id": "N/A",
    "technique_name": "Unknown",
    "published_date": "",
    "references": "",
}

# Select columns we expect (safely)
selected = list(column_defaults)
existing = [c for c in selected if c in df.columns]
missing = [c for c in selected if c not in df.columns]
print("Existing cols:", existing)
print("Missing cols (will be filled with defaults):", missing)

# assign() returns a new frame, so `df` keeps the raw data untouched and this
# cell can be re-run safely. Absent columns are created first so that fillna()
# sees every key of column_defaults.
clean_df = (
    df
    .assign(**{col: column_defaults[col] for col in missing})
    .fillna(column_defaults)
)

# Save cleaned file back to Drive (optional)
clean_df.to_csv(OUT_CLEAN, index=False)
print("Clean file saved to:", OUT_CLEAN)

Loaded rows: 2444 | Columns: ['id', 'title', 'description', 'cvss_score', 'severity', 'published_date', 'last_modified', 'affected_products', 'references', 'ingested_at', 'original_severity_raw', 'severity_text_norm', 'final_severity', 'severity_source', 'has_cvss', 'technique_id', 'technique_name', 'tech_match_score']
Existing cols: ['id', 'title', 'description', 'cvss_score', 'final_severity', 'affected_products', 'technique_id', 'technique_name', 'published_date', 'references']
Missing cols (will be filled with defaults): []
Clean file saved to: /content/drive/MyDrive/Data for Cybermind/vulns_clean_for_mitigation.csv


In [11]:
 #Prepare documents (plain text blobs)
# Two parallel lists: documents[i] is the text blob that gets embedded,
# metadata[i] carries the identifying fields of the same row.
documents: List[str] = []
metadata: List[Dict[str, Any]] = []

for _, record in clean_df.iterrows():
    # One labelled line per field; the pieces are concatenated unchanged.
    parts = (
        f"Vulnerability ID: {record.get('id', 'N/A')}\n",
        f"Title: {record.get('title', 'N/A')}\n",
        f"Description: {record.get('description', 'N/A')}\n",
        f"CVSS Score: {record.get('cvss_score', 0)}\n",
        f"Severity: {record.get('final_severity', 'Unknown')}\n",
        f"Affected Products: {record.get('affected_products', 'Unknown')}\n",
        f"Technique: {record.get('technique_name', 'Unknown')} ({record.get('technique_id', 'N/A')})\n",
        f"References: {record.get('references', '')}\n",
    )
    documents.append("".join(parts))

    row_meta = {
        "id": record.get("id", None),
        "title": record.get("title", None),
        "published_date": record.get("published_date", None),
    }
    metadata.append(row_meta)

print(f"Prepared {len(documents)} documents.")

Prepared 2444 documents.


In [12]:
print("Loading sentence-transformers model for embeddings...")
embed_model = SentenceTransformer(EMBEDDINGS_MODEL_NAME)

# Encode the documents in slices of `batch_size` to avoid OOM, then stack the
# per-slice arrays into one float32 matrix.
batch_size = 64
embedding_chunks = [
    embed_model.encode(
        documents[start:start + batch_size],
        convert_to_numpy=True,
        show_progress_bar=False,
    )
    for start in range(0, len(documents), batch_size)
]
all_embeddings = np.vstack(embedding_chunks).astype("float32")

# L2-normalize in place so that the inner product below is cosine similarity.
faiss.normalize_L2(all_embeddings)

# Flat inner-product index over the normalized vectors.
dim = all_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(all_embeddings)
print("FAISS index built with", index.ntotal, "vectors.")

# Helper mapping from faiss id -> document/metadata
# Vectors were added in document order, so a faiss id doubles as an index
# into `documents` and `metadata`.
def similarity_search(query: str, k: int = TOP_K):
    """
    Return the k indexed documents most similar to `query`.

    The query is embedded with `embed_model`, L2-normalized, and searched
    against `index`. Each hit is a dict with keys "score" (float),
    "document" (the text blob) and "metadata" (the matching metadata dict).
    Slots that FAISS reports with a negative id are skipped.
    """
    query_vec = embed_model.encode([query], convert_to_numpy=True).astype("float32")
    faiss.normalize_L2(query_vec)
    scores, positions = index.search(query_vec, k)
    return [
        {
            "score": float(hit_score),
            "document": documents[pos],
            "metadata": metadata[pos],
        }
        for hit_score, pos in zip(scores[0], positions[0])
        if pos >= 0
    ]

Loading sentence-transformers model for embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index built with 2444 vectors.


In [13]:
# Ask for the key through a hidden prompt when the environment has none
# (an unset variable and an empty one are treated the same way).
if not os.environ.get("OPENAI_API_KEY"):
    print("\nPlease enter your OpenAI API key. It will not be printed to the notebook.")
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OPENAI_API_KEY (hidden): ")

# Re-read the value: an empty answer to the prompt must still abort.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if openai_api_key:
    client = OpenAI(api_key=openai_api_key)
    print("OpenAI client initialized with model:", OPENAI_MODEL)
else:
    raise RuntimeError("OPENAI_API_KEY is required. Set it as an environment variable or enter it when prompted.")

OpenAI client initialized with model: gpt-4o-mini


In [17]:
# OpenAI call wrapper with retries
# The policies MUST be passed to tenacity.retry by keyword. Positional
# arguments bind to the retryer's leading parameters (`sleep`, then `stop`),
# so the previous positional form never installed the intended stop/wait
# policies and the call was not retried at all.
@retry(
    stop=stop_after_attempt(4),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    reraise=True,  # surface the original exception instead of tenacity.RetryError
)
def call_openai(prompt: str, model: str = OPENAI_MODEL, max_tokens: int = 400, temperature: float = 0.3) -> str:
    """
    Call the OpenAI Chat Completions API and return the reply text.

    Args:
        prompt: Full user prompt; sent as a single "user" message.
        model: Model name to query.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        The content of the first choice, requested in JSON-object mode.
        An empty string is returned when the API yields no content, so
        callers always receive a str.

    Raises:
        Whatever the client raised on the last of up to 4 attempts
        (exponential back-off between attempts, clamped to 1-10 s).
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=temperature,
        # Ask the API for a JSON object so the caller can json.loads() the reply.
        response_format={"type": "json_object"}
    )

    # message.content may be None; normalise so the declared return type holds.
    content = response.choices[0].message.content
    return content if content is not None else ""

In [18]:
#Mitigation agent function (uses FAISS context + OpenAI)

def mitigation_agent(query: str, top_k: int = TOP_K) -> Dict[str, Any]:
    """
    Produce a structured mitigation plan for a user question.

    Fetches the top-k similar vulnerability documents, embeds them as context
    in a prompt, asks OpenAI for a JSON answer and parses it.

    Args:
        query: The user's question.
        top_k: Number of similar documents to include as context.

    Returns:
        The parsed JSON object as a dict. When the reply cannot be parsed
        into a JSON object, a dict with the keys "error" and "raw_response"
        is returned instead; no exception is raised for that case.
    """
    results = similarity_search(query, k=top_k)
    # Each context section shows the similarity score followed by the full document text.
    context_sections = [
        f"---\nScore: {r['score']:.4f}\n{r['document']}\n"
        for r in results
    ]
    context = "\n\n".join(context_sections) if context_sections else "No similar vulnerabilities found."

    # Construct a clear instruction prompt and ask for structured JSON output.
    prompt = f"""
You are a cybersecurity mitigation assistant. Using the provided vulnerability context, produce a
concise mitigation plan with the following structure in **a valid JSON object** with the top-level keys:
**"Summary"** (one-sentence summary of the problem),
**"MitigationSteps"** (an array of step-by-step actions),
**"Priority"** (Low / Medium / High / Critical),
**"EstimatedEffort"** (Low / Medium / High / Approximate), and
**"References"** (an array of Vulnerability IDs mentioned from the context).
**Do NOT include any text outside the JSON object.** Be concise but actionable.

Context:
{context}

User Question:
{query}
"""
    # Call OpenAI
    answer = call_openai(prompt)

    failure = {"error": "Mitigation generation failed or did not return valid JSON.", "raw_response": answer}

    # Parse the reply. TypeError covers a non-string reply (e.g. None), which
    # json.loads rejects before it ever gets to raise JSONDecodeError.
    try:
        json_output = json.loads(answer)
    except (json.JSONDecodeError, TypeError) as e:
        logger.error("Failed to parse JSON response: %s. Raw response: %s", e, answer)
        return failure

    # Callers test membership with `"error" in result`, so only a JSON object is acceptable.
    if not isinstance(json_output, dict):
        logger.error("JSON response is not an object. Raw response: %s", answer)
        return failure

    return json_output

In [19]:
print("\n✅ Mitigation Agent using OpenAI is ready!")
print("Type your question (e.g., 'How to mitigate SQL injection in product X?') or 'exit' to quit.\n")

# Read questions until the user types "exit" or "quit". Blank input re-prompts.
while True:
    question = input("Question> ").strip()
    if question.lower() in ("exit", "quit"):
        print("Goodbye — Mitigation Agent stopped.")
        break
    if question == "":
        continue

    try:
        # `plan` is the dict returned by the agent (parsed JSON or an error dict).
        plan = mitigation_agent(question, top_k=TOP_K)

        # Only successful plans are persisted; the file name carries a
        # second-resolution timestamp.
        if "error" not in plan:
            stamp = time.strftime("%Y%m%d-%H%M%S")
            plan_path = f"mitigation_plan_{stamp}.json"
            with open(plan_path, 'w', encoding='utf-8') as handle:
                # indent=4 keeps the saved file human-readable
                json.dump(plan, handle, ensure_ascii=False, indent=4)
            print(f"\n✅ Mitigation strategy successfully saved to: {plan_path}")

        # Echo the result (plan or error dict) as formatted JSON.
        rendered = json.dumps(plan, indent=4)
        print("\n--- Mitigation Agent Response ---\n")
        print(rendered)
        print("\n-------------------------------\n")
    except Exception as err:
        # Keep the prompt loop alive; the traceback goes to the log.
        logger.exception("Error while generating mitigation: %s", err)
        print("An error occurred. See logs for details.")


✅ Mitigation Agent using OpenAI is ready!
Type your question (e.g., 'How to mitigate SQL injection in product X?') or 'exit' to quit.

Question> Explain how to reduce the risk of vulnerabilities in a Microsoft Exchange product

✅ Mitigation strategy successfully saved to: mitigation_plan_20251105-152009.json

--- Mitigation Agent Response ---

{
    "Summary": "Multiple high-severity vulnerabilities in Microsoft Exchange Server can lead to unauthorized access and privilege escalation.",
    "MitigationSteps": [
        "Apply the latest security patches and updates from Microsoft for Exchange Server.",
        "Implement strict input validation mechanisms to prevent spoofing attacks.",
        "Review and enhance authentication algorithms to ensure they are correctly implemented.",
        "Enforce strong authentication methods to mitigate weak authentication risks.",
        "Conduct regular security audits and vulnerability assessments on the Exchange Server environment."
    ],
   