In [1]:
# Cell 1: Install and import libraries create output folder
!pip install -q pandas requests

import os, json, time, requests
import pandas as pd
from datetime import datetime, timezone

# create output folder
os.makedirs("/content/outbox", exist_ok=True)

In [2]:
# Cell 2: Helper utilities used by the pipeline
def utc_now():
    """Current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ`` (second precision)."""
    moment = datetime.now(timezone.utc)
    return moment.strftime("%Y-%m-%dT%H:%M:%SZ")

def ensure_list(x):
    """Coerce ``x`` to a list.

    A list is returned as-is (same object), ``None`` becomes ``[]`` and any
    other value is wrapped in a one-element list.
    """
    if isinstance(x, list):
        return x
    return [] if x is None else [x]

def uniq(seq):
    """De-duplicate ``seq`` while keeping first-seen order; ``None`` items are dropped.

    Items must be hashable. Falsy values other than ``None`` (``""``, ``0``) are kept.
    """
    # dict keys preserve insertion order, so this keeps the first occurrence of each item
    return list(dict.fromkeys(item for item in seq if item is not None))

In [3]:
# Cell 3: Fetch CIRCL API (primary source). Adds headers, retries, and basic debugging.
CIRCL_URL = "https://cve.circl.lu/api/last"

def fetch_circl(max_retries=3, timeout=30):
    """Fetch the JSON list served at ``CIRCL_URL``, retrying on failure.

    Args:
        max_retries: Maximum number of GET attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        The decoded JSON list on success. ``[]`` when the response is not
        JSON, the JSON root is not a list, or every attempt failed.
    """
    headers = {
        "User-Agent": "CyberMind-ThreatCollector/1.0 (+https://example.org)",
        "Accept": "application/json"
    }
    for i in range(max_retries):
        try:
            r = requests.get(CIRCL_URL, timeout=timeout, headers=headers)
            r.raise_for_status()
            ctype = (r.headers.get("Content-Type") or "").lower()
            if "json" not in ctype:
                print(f"[CIRCL] Unexpected content-type: {ctype}")
                return []
            data = r.json()
        except Exception as e:
            # Deliberately best-effort: any request/decoding failure is logged and retried.
            print(f"[CIRCL] attempt {i+1} failed: {e}")
            # Linear back-off, but do not stall after the final attempt.
            if i + 1 < max_retries:
                time.sleep(2 * (i+1))
            continue

        # From here on the request succeeded; shape problems are not retried.
        if not isinstance(data, list):
            print(f"[CIRCL] Unexpected JSON root type: {type(data)}")
            return []
        print(f"[CIRCL] raw items: {len(data)}")
        # Debug aid: show the keys of the first item (only when it is a dict,
        # so an unexpected item type cannot trigger a pointless re-fetch).
        if data and isinstance(data[0], dict):
            print("[CIRCL] first item keys:", list(data[0].keys()))
        return data
    return []

In [4]:
# Cell 4 : normalize OSV/CVE5 records and always produce an identifier
import re, hashlib

# When True, normalize_osv_item replaces a missing/blank description with
# FALLBACK_DESC_TEXT and tags the record with quality="fallback_desc".
ALLOW_EMPTY_DESCRIPTION = True
# Text emitted into output records; the dash is a real em dash (the previous
# literal contained the mis-decoded byte sequence instead).
FALLBACK_DESC_TEXT = "No description provided by CNA yet — details pending. (Auto-filled by CyberMind)"
# CVE identifier: "CVE-", a 4-digit year, then 4 to 7 digits; matched case-insensitively.
CVE_RE = re.compile(r"\bCVE-\d{4}-\d{4,7}\b", re.IGNORECASE)

def _pick_cve_from_aliases(aliases):
    """Return the first CVE id found in a list of alias strings, upper-cased.

    Non-list input and non-string entries are ignored; returns ``None`` when
    no alias matches ``CVE_RE``.
    """
    if not isinstance(aliases, list):
        return None
    matches = (CVE_RE.search(alias) for alias in aliases if isinstance(alias, str))
    first_hit = next((hit for hit in matches if hit), None)
    return first_hit.group(0).upper() if first_hit else None

def _pick_cve_from_refs(refs):
    """Return the first CVE id found in a list of references, upper-cased.

    Each reference may be a dict (its ``"url"`` value is searched) or a plain
    string. Other entry types are skipped. Returns ``None`` when ``refs`` is
    not a list or nothing matches ``CVE_RE``.
    """
    if not isinstance(refs, list):
        return None
    for ref in refs:
        if isinstance(ref, dict):
            text = ref.get("url")
        elif isinstance(ref, str):
            text = ref
        else:
            continue
        if not text:
            # missing/empty URL (an empty string could not match anyway)
            continue
        hit = CVE_RE.search(text)
        if hit:
            return hit.group(0).upper()
    return None

def _pick_description_any_lang(cna):
    descs = cna.get("descriptions") or []
    if isinstance(descs, list):
        for d in descs:
            if isinstance(d, dict) and d.get("lang") == "en" and d.get("value"):
                return d["value"]
        vals = [d.get("value") for d in descs if isinstance(d, dict) and d.get("value")]
        if vals:
            return " ".join(vals)
    return ""

def _extract_cvss_from_metrics(cna):
    metrics = cna.get("metrics") or []
    if not isinstance(metrics, list) or not metrics:
        return None
    m0 = metrics[0] or {}
    for key in ("cvssV3_1", "cvssV3"):
        obj = m0.get(key)
        if isinstance(obj, dict):
            v = obj.get("vectorString") or obj.get("baseScore")
            if v is not None:
                return str(v)
    for v in m0.values():
        if isinstance(v, dict):
            vs = v.get("vectorString") or v.get("baseScore")
            if vs is not None:
                return str(vs)
    return None

def _extract_refs_generic(obj):
    refs = []
    for r in (obj.get("references") or []):
        if isinstance(r, dict) and r.get("url"):
            refs.append(r["url"])
        elif isinstance(r, str):
            refs.append(r)
    seen, out = set(), []
    for x in refs:
        if x and x not in seen:
            seen.add(x); out.append(x)
    return out

def _stable_temp_id(item):
    # generate a stable temp id from a hash of the item (last resort)
    s = json.dumps(item, sort_keys=True, ensure_ascii=False)
    h = hashlib.sha1(s.encode("utf-8")).hexdigest()[:12]
    return f"TEMP-{h}"

def normalize_osv_item(it):
    """Normalize one raw record into the pipeline's flat row format.

    Two input shapes are recognized:

    * Case A: records with both ``cveMetadata`` and ``containers`` keys;
      fields are read from the ``cna`` container.
    * Case B: anything else; fields are read from top-level ``id``/``cve``,
      ``aliases``, ``details``/``summary``/``description``, ``severity``,
      ``cvss`` and ``references``.

    Args:
        it: Raw record (dict).

    Returns:
        Dict with keys ``cve_id``, ``title``, ``description``, ``cvss_v3``,
        ``references``, ``collected_at``, ``source`` (always ``"CIRCL"``) and
        ``quality`` (``"ok"`` or ``"fallback_desc"``). ``cve_id`` is always
        set: a ``TEMP-...`` id is generated when no identifier is found.
    """
    cve_id, title, description, cvss_v3, refs = None, None, "", None, []
    quality = "ok"

    # Case A: CVE 5.x shape
    if "cveMetadata" in it and "containers" in it:
        meta = it.get("cveMetadata", {}) or {}
        cna = (it.get("containers", {}) or {}).get("cna", {}) or {}
        cve_id = meta.get("cveId") or meta.get("ID")
        title = cna.get("title")
        # Fall back to the title when no description text is available.
        description = _pick_description_any_lang(cna) or title or ""
        cvss_v3 = _extract_cvss_from_metrics(cna)
        refs = _extract_refs_generic(cna)
        # Try to recover a CVE id from aliases/refs if the metadata has none.
        if not cve_id:
            cve_id = (
                _pick_cve_from_aliases(cna.get("aliases") or [])
                or _pick_cve_from_refs(cna.get("references") or [])
            )

    # Case B: OSV/legacy shape
    else:
        base_id = it.get("id") or it.get("cve")  # often GHSA-...
        aliases = it.get("aliases")
        # NOTE(review): the title is the first alias (an identifier), as before.
        title = aliases[0] if isinstance(aliases, list) and aliases else None
        description = it.get("details") or it.get("summary") or it.get("description") or ""

        sev = it.get("severity") or []
        if isinstance(sev, list):
            # Skip malformed (non-dict) entries instead of crashing on ``.get``.
            entries = [s for s in sev if isinstance(s, dict)]
            # Prefer an entry explicitly typed CVSS_V3; otherwise keep the
            # historical behaviour of using the first entry.
            v3_entry = next(
                (
                    s for s in entries
                    if str(s.get("type") or "").upper().startswith("CVSS_V3") and s.get("score")
                ),
                None,
            )
            chosen = v3_entry or (entries[0] if entries else None)
            if chosen:
                cvss_v3 = chosen.get("score") or chosen.get("type")
        if not cvss_v3 and it.get("cvss") is not None:
            cvss_v3 = str(it.get("cvss"))

        refs = _extract_refs_generic(it)
        # Prefer a CVE if available in aliases/refs; else keep base_id (GHSA etc.)
        cve_from_alias = _pick_cve_from_aliases(it.get("aliases") or [])
        cve_from_refs = _pick_cve_from_refs(it.get("references") or [])
        cve_id = cve_from_alias or cve_from_refs or base_id

    # Fallbacks
    if (description is None or str(description).strip() == "") and ALLOW_EMPTY_DESCRIPTION:
        description = FALLBACK_DESC_TEXT
        quality = "fallback_desc"
    if not cve_id:
        # last resort: stable temp id so the record is not dropped
        cve_id = _stable_temp_id(it)

    return {
        "cve_id": cve_id,
        "title": title,
        "description": description,
        "cvss_v3": cvss_v3,
        "references": refs,
        "collected_at": utc_now(),
        "source": "CIRCL",
        "quality": quality
    }

def normalize_osv_list(items):
    """Normalize every raw item and keep the records that carry a ``cve_id``."""
    normalized = (normalize_osv_item(raw) for raw in items)
    return [record for record in normalized if record.get("cve_id")]

In [5]:
# Cell 5: Universal NVD loader; supports v2.0 ("vulnerabilities"), v1.1 ("CVE_Items"), and a CVE 5.x list
import os, json, gzip, time

NVD_PATH = "/content/nvdcve-2.0-2025.json"

def _open_json_any(path):
    if not os.path.exists(path):
        print("[NVD local] not found:", path)
        return None
    try:
        if path.endswith(".gz"):
            with gzip.open(path, "rt", encoding="utf-8") as f:
                return json.load(f)
        else:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
    except Exception as e:
        print("[NVD local] read error:", e)
        return None

def _utc_now():
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

def _uniq(seq):
    seen, out = set(), []
    for x in seq:
        if x and x not in seen:
            seen.add(x); out.append(x)
    return out

def _nvd_v20_to_rows(obj):
    """Convert an NVD 2.0 feed object (``{"vulnerabilities": [...]}``) to normalized rows.

    Args:
        obj: Parsed feed dict; each entry of ``obj["vulnerabilities"]`` is
            expected to hold its record under the ``"cve"`` key.

    Returns:
        List of dicts with keys ``cve_id``, ``title``, ``description``,
        ``cvss_v3``, ``references``, ``collected_at`` and ``source`` (``"NVD"``).
        Missing fields are filled with placeholder text. Records without an id
        get a ``TEMP-<12 hex>`` id derived from a SHA-1 of the record, which is
        reproducible across runs (the built-in ``hash()`` of a string is not).
    """
    import hashlib  # local import: only needed for the fallback id

    collected_at = _utc_now()  # one timestamp for the whole batch
    rows = []
    for v in obj.get("vulnerabilities", []):
        c = v.get("cve", {}) or {}
        cve_id = c.get("id")

        # descriptions: first English value, else all values joined
        descs = [d for d in (c.get("descriptions") or []) if isinstance(d, dict) and d.get("value")]
        desc = next((d["value"] for d in descs if d.get("lang") == "en"), "")
        if not desc and descs:
            desc = " ".join(d["value"] for d in descs)

        # references
        refs = [r["url"] for r in (c.get("references") or []) if isinstance(r, dict) and r.get("url")]

        # CVSS: try v3.1, then v3.0, then v2; the first metric of each list is used
        cvss_v3 = None
        metrics = c.get("metrics") or {}
        for key in ("cvssMetricV31", "cvssMetricV30", "cvssMetricV2"):
            arr = metrics.get(key)
            if isinstance(arr, list) and arr and isinstance(arr[0], dict):
                data = arr[0].get("cvssData", {}) or {}
                cvss_v3 = data.get("vectorString") or str(data.get("baseScore") or "")
                if cvss_v3:
                    break

        # placeholders
        title = "No title available yet"
        if not desc:
            desc = "Description not yet published — placeholder."
        if not cvss_v3:
            cvss_v3 = "Pending CVSS score"
        if not refs:
            refs = ["No references published"]

        if not cve_id:
            payload = json.dumps(v, sort_keys=True, ensure_ascii=False, default=str)
            cve_id = "TEMP-" + hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]

        rows.append({
            "cve_id": cve_id,
            "title": title,
            "description": desc,
            "cvss_v3": cvss_v3,
            "references": _uniq(refs),
            "collected_at": collected_at,
            "source": "NVD"
        })
    return rows

def _nvd_v11_to_rows(obj):
    """Convert an NVD 1.1 feed object (``{"CVE_Items": [...]}``) to normalized rows.

    Args:
        obj: Parsed feed dict; each item is read through its ``"cve"`` and
            ``"impact"`` sections.

    Returns:
        List of dicts with keys ``cve_id``, ``title``, ``description``,
        ``cvss_v3``, ``references``, ``collected_at`` and ``source`` (``"NVD"``).
        Missing fields are filled with placeholder text. Records without an id
        get a ``TEMP-<12 hex>`` id derived from a SHA-1 of the record, which is
        reproducible across runs (the built-in ``hash()`` of a string is not).
    """
    import hashlib  # local import: only needed for the fallback id

    collected_at = _utc_now()  # one timestamp for the whole batch
    rows = []
    for it in obj.get("CVE_Items", []):
        cve = it.get("cve", {}) or {}
        meta = cve.get("CVE_data_meta", {}) or {}
        cve_id = meta.get("ID")

        # description: first English value, else the first entry's value
        desc_list = [
            d for d in ((cve.get("description", {}) or {}).get("description_data", []) or [])
            if isinstance(d, dict)
        ]
        desc = next((d["value"] for d in desc_list if d.get("lang") == "en" and d.get("value")), "")
        if not desc and desc_list:
            desc = desc_list[0].get("value", "")

        ref_data = (cve.get("references", {}) or {}).get("reference_data", []) or []
        refs = [r.get("url") for r in ref_data if isinstance(r, dict) and r.get("url")]

        impact = it.get("impact", {}) or {}
        cvss_v3 = None
        if isinstance(impact.get("baseMetricV3"), dict):
            cv = impact["baseMetricV3"].get("cvssV3", {}) or {}
            cvss_v3 = cv.get("vectorString") or str(cv.get("baseScore") or "")

        # placeholders
        title = meta.get("TITLE") or "No title available yet"
        if not desc:
            desc = "Description not yet published — placeholder."
        if not cvss_v3:
            cvss_v3 = "Pending CVSS score"
        if not refs:
            refs = ["No references published"]

        if not cve_id:
            payload = json.dumps(it, sort_keys=True, ensure_ascii=False, default=str)
            cve_id = "TEMP-" + hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]

        rows.append({
            "cve_id": cve_id,
            "title": title,
            "description": desc,
            "cvss_v3": cvss_v3,
            "references": _uniq(refs),
            "collected_at": collected_at,
            "source": "NVD"
        })
    return rows

def _cve5_list_to_rows(lst):
    """Convert a list of CVE 5.x records to normalized rows.

    Args:
        lst: List of records; fields are read from ``cveMetadata`` and
            ``containers.cna``.

    Returns:
        List of dicts with keys ``cve_id``, ``title``, ``description``,
        ``cvss_v3``, ``references``, ``collected_at`` and ``source`` (``"NVD"``).
        ``cvss_v3`` is taken from the first ``cvssV3_1``/``cvssV3_0`` object
        found in ``cna["metrics"]`` (previously it was always the placeholder).
        Missing fields are filled with placeholder text. Records without an id
        get a reproducible ``TEMP-<12 hex>`` id derived from a SHA-1 of the record.
    """
    import hashlib  # local import: only needed for the fallback id

    collected_at = _utc_now()  # one timestamp for the whole batch
    rows = []
    for rec in lst:
        meta = rec.get("cveMetadata", {}) or {}
        cna = (rec.get("containers", {}) or {}).get("cna", {}) or {}

        cve_id = meta.get("cveId")
        title = cna.get("title") or "No title available yet"

        # descriptions: first English value, else all values joined
        descs = [d for d in (cna.get("descriptions") or []) if isinstance(d, dict) and d.get("value")]
        desc = next((d["value"] for d in descs if d.get("lang") == "en"), "")
        if not desc and descs:
            desc = " ".join(d["value"] for d in descs)

        refs = [r["url"] for r in (cna.get("references") or []) if isinstance(r, dict) and r.get("url")]

        # CVSS v3: scan every metrics entry, v3.1 before v3.0 within an entry
        cvss_v3 = None
        for metric in (cna.get("metrics") or []):
            if not isinstance(metric, dict):
                continue
            for key in ("cvssV3_1", "cvssV3_0"):
                data = metric.get(key)
                if isinstance(data, dict):
                    cvss_v3 = data.get("vectorString") or str(data.get("baseScore") or "")
                    if cvss_v3:
                        break
            if cvss_v3:
                break

        # placeholders
        if not desc:
            desc = "Description not yet published — placeholder."
        if not cvss_v3:
            cvss_v3 = "Pending CVSS score"
        if not refs:
            refs = ["No references published"]

        if not cve_id:
            payload = json.dumps(rec, sort_keys=True, ensure_ascii=False, default=str)
            cve_id = "TEMP-" + hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]

        rows.append({
            "cve_id": cve_id,
            "title": title,
            "description": desc,
            "cvss_v3": cvss_v3,
            "references": _uniq(refs),
            "collected_at": collected_at,
            "source": "NVD"
        })
    return rows

# Load the local NVD file and dispatch on its shape:
#   list                          -> CVE 5.x record list
#   dict with "vulnerabilities"   -> NVD 2.0 feed
#   dict with "CVE_Items"         -> NVD 1.1 feed
# Anything else leaves nvd_norm empty and prints a diagnostic.
nvd_norm = []
obj = _open_json_any(NVD_PATH)
if isinstance(obj, list):
    nvd_norm = _cve5_list_to_rows(obj)
elif isinstance(obj, dict):
    if "vulnerabilities" in obj:
        nvd_norm = _nvd_v20_to_rows(obj)
    elif "CVE_Items" in obj:
        nvd_norm = _nvd_v11_to_rows(obj)
    else:
        print("[NVD local] unknown format keys:", list(obj.keys())[:10])
elif obj is None:
    print("[NVD local] file not found or unreadable:", NVD_PATH)
else:
    print("[NVD local] unknown format keys:", type(obj))

print("[NVD local] normalized:", len(nvd_norm))

[NVD local] normalized: 31902


In [6]:
# Cell 6: Use nvd_norm if available (otherwise load it from NVD_PATH), then merge with the API rows

import os, json, time, gzip

# Reuse api_norm if it already exists in the kernel namespace.
# If it is missing, reuse api_raw when present (otherwise fetch from CIRCL)
# and normalize it. The len() calls only probe whether the names are defined;
# their result is discarded into "_".
try:
    _ = len(api_norm)
except NameError:
    try:
        _ = len(api_raw)
    except NameError:
        api_raw = fetch_circl()
    api_norm = normalize_osv_list(api_raw)

# Record how many NVD rows are already loaded (0 when nvd_norm is undefined).
# A zero length triggers the reload from NVD_PATH further down in this cell.
try:
    current_nvd_len = len(nvd_norm)
except NameError:
    current_nvd_len = 0

def _open_json_any(path):
    if not os.path.exists(path):
        return None
    try:
        if path.endswith(".gz"):
            with gzip.open(path, "rt", encoding="utf-8") as f:
                return json.load(f)
        else:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
    except Exception:
        return None

def _utc_now():
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

def _uniq(seq):
    seen, out = set(), []
    for x in seq:
        if x and x not in seen:
            seen.add(x); out.append(x)
    return out

def _nvd_v20_to_rows(obj):
    """Convert an NVD 2.0 feed object (``{"vulnerabilities": [...]}``) to normalized rows.

    Args:
        obj: Parsed feed dict; each entry of ``obj["vulnerabilities"]`` is
            expected to hold its record under the ``"cve"`` key.

    Returns:
        List of dicts with keys ``cve_id``, ``title``, ``description``,
        ``cvss_v3``, ``references``, ``collected_at`` and ``source`` (``"NVD"``).
        Missing fields are filled with placeholder text. Records without an id
        get a ``TEMP-<12 hex>`` id derived from a SHA-1 of the record, which is
        reproducible across runs (the built-in ``hash()`` of a string is not).
    """
    import hashlib  # local import: only needed for the fallback id

    collected_at = _utc_now()  # one timestamp for the whole batch
    rows = []
    for v in obj.get("vulnerabilities", []):
        c = v.get("cve", {}) or {}
        cve_id = c.get("id")

        # descriptions: first English value, else all values joined
        descs = [d for d in (c.get("descriptions") or []) if isinstance(d, dict) and d.get("value")]
        desc = next((d["value"] for d in descs if d.get("lang") == "en"), "")
        if not desc and descs:
            desc = " ".join(d["value"] for d in descs)

        # references
        refs = [r["url"] for r in (c.get("references") or []) if isinstance(r, dict) and r.get("url")]

        # CVSS: try v3.1, then v3.0, then v2; the first metric of each list is used
        cvss_v3 = None
        metrics = c.get("metrics") or {}
        for key in ("cvssMetricV31", "cvssMetricV30", "cvssMetricV2"):
            arr = metrics.get(key)
            if isinstance(arr, list) and arr and isinstance(arr[0], dict):
                data = arr[0].get("cvssData", {}) or {}
                cvss_v3 = data.get("vectorString") or str(data.get("baseScore") or "")
                if cvss_v3:
                    break

        # placeholders
        title = "No title available yet"
        if not desc:
            desc = "Description not yet published — placeholder."
        if not cvss_v3:
            cvss_v3 = "Pending CVSS score"
        if not refs:
            refs = ["No references published"]

        if not cve_id:
            payload = json.dumps(v, sort_keys=True, ensure_ascii=False, default=str)
            cve_id = "TEMP-" + hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]

        rows.append({
            "cve_id": cve_id,
            "title": title,
            "description": desc,
            "cvss_v3": cvss_v3,
            "references": _uniq(refs),
            "collected_at": collected_at,
            "source": "NVD"
        })
    return rows

def _nvd_v11_to_rows(obj):
    """Convert an NVD 1.1 feed object (``{"CVE_Items": [...]}``) to normalized rows.

    Args:
        obj: Parsed feed dict; each item is read through its ``"cve"`` and
            ``"impact"`` sections.

    Returns:
        List of dicts with keys ``cve_id``, ``title``, ``description``,
        ``cvss_v3``, ``references``, ``collected_at`` and ``source`` (``"NVD"``).
        Missing fields are filled with placeholder text. Records without an id
        get a ``TEMP-<12 hex>`` id derived from a SHA-1 of the record, which is
        reproducible across runs (the built-in ``hash()`` of a string is not).
    """
    import hashlib  # local import: only needed for the fallback id

    collected_at = _utc_now()  # one timestamp for the whole batch
    rows = []
    for it in obj.get("CVE_Items", []):
        cve = it.get("cve", {}) or {}
        meta = cve.get("CVE_data_meta", {}) or {}
        cve_id = meta.get("ID")

        # description: first English value, else the first entry's value
        desc_list = [
            d for d in ((cve.get("description", {}) or {}).get("description_data", []) or [])
            if isinstance(d, dict)
        ]
        desc = next((d["value"] for d in desc_list if d.get("lang") == "en" and d.get("value")), "")
        if not desc and desc_list:
            desc = desc_list[0].get("value", "")

        ref_data = (cve.get("references", {}) or {}).get("reference_data", []) or []
        refs = [r.get("url") for r in ref_data if isinstance(r, dict) and r.get("url")]

        impact = it.get("impact", {}) or {}
        cvss_v3 = None
        if isinstance(impact.get("baseMetricV3"), dict):
            cv = impact["baseMetricV3"].get("cvssV3", {}) or {}
            cvss_v3 = cv.get("vectorString") or str(cv.get("baseScore") or "")

        # placeholders
        title = meta.get("TITLE") or "No title available yet"
        if not desc:
            desc = "Description not yet published — placeholder."
        if not cvss_v3:
            cvss_v3 = "Pending CVSS score"
        if not refs:
            refs = ["No references published"]

        if not cve_id:
            payload = json.dumps(it, sort_keys=True, ensure_ascii=False, default=str)
            cve_id = "TEMP-" + hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]

        rows.append({
            "cve_id": cve_id,
            "title": title,
            "description": desc,
            "cvss_v3": cvss_v3,
            "references": _uniq(refs),
            "collected_at": collected_at,
            "source": "NVD"
        })
    return rows

def _cve5_list_to_rows(lst):
    """Convert a list of CVE 5.x records to normalized rows.

    Args:
        lst: List of records; fields are read from ``cveMetadata`` and
            ``containers.cna``.

    Returns:
        List of dicts with keys ``cve_id``, ``title``, ``description``,
        ``cvss_v3``, ``references``, ``collected_at`` and ``source`` (``"NVD"``).
        ``cvss_v3`` is taken from the first ``cvssV3_1``/``cvssV3_0`` object
        found in ``cna["metrics"]`` (previously it was always the placeholder).
        Missing fields are filled with placeholder text. Records without an id
        get a reproducible ``TEMP-<12 hex>`` id derived from a SHA-1 of the record.
    """
    import hashlib  # local import: only needed for the fallback id

    collected_at = _utc_now()  # one timestamp for the whole batch
    rows = []
    for rec in lst:
        meta = rec.get("cveMetadata", {}) or {}
        cna = (rec.get("containers", {}) or {}).get("cna", {}) or {}

        cve_id = meta.get("cveId")
        title = cna.get("title") or "No title available yet"

        # descriptions: first English value, else all values joined
        descs = [d for d in (cna.get("descriptions") or []) if isinstance(d, dict) and d.get("value")]
        desc = next((d["value"] for d in descs if d.get("lang") == "en"), "")
        if not desc and descs:
            desc = " ".join(d["value"] for d in descs)

        refs = [r["url"] for r in (cna.get("references") or []) if isinstance(r, dict) and r.get("url")]

        # CVSS v3: scan every metrics entry, v3.1 before v3.0 within an entry
        cvss_v3 = None
        for metric in (cna.get("metrics") or []):
            if not isinstance(metric, dict):
                continue
            for key in ("cvssV3_1", "cvssV3_0"):
                data = metric.get(key)
                if isinstance(data, dict):
                    cvss_v3 = data.get("vectorString") or str(data.get("baseScore") or "")
                    if cvss_v3:
                        break
            if cvss_v3:
                break

        # placeholders
        if not desc:
            desc = "Description not yet published — placeholder."
        if not cvss_v3:
            cvss_v3 = "Pending CVSS score"
        if not refs:
            refs = ["No references published"]

        if not cve_id:
            payload = json.dumps(rec, sort_keys=True, ensure_ascii=False, default=str)
            cve_id = "TEMP-" + hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]

        rows.append({
            "cve_id": cve_id,
            "title": title,
            "description": desc,
            "cvss_v3": cvss_v3,
            "references": _uniq(refs),
            "collected_at": collected_at,
            "source": "NVD"
        })
    return rows

# If nvd_norm is missing or empty (current_nvd_len is 0), reload it from NVD_PATH.
# NVD_PATH is normally defined in Cell 5; the default below is only used when
# that name does not exist in the kernel namespace (adjust if needed).
if not current_nvd_len:
    try:
        _ = NVD_PATH
    except NameError:
        NVD_PATH = "/content/nvdcve-2.0-2025.json"
    obj = _open_json_any(NVD_PATH)
    # Dispatch on the shape of the loaded object; anything unreadable or
    # unrecognized results in an empty nvd_norm.
    if obj is None:
        nvd_norm = []
    elif isinstance(obj, dict) and "vulnerabilities" in obj:
        # NVD 2.0 feed
        nvd_norm = _nvd_v20_to_rows(obj)
    elif isinstance(obj, dict) and "CVE_Items" in obj:
        # NVD 1.1 feed
        nvd_norm = _nvd_v11_to_rows(obj)
    elif isinstance(obj, list):
        # list of CVE 5.x records
        nvd_norm = _cve5_list_to_rows(obj)
    else:
        nvd_norm = []

# merge
def merge_prefer_api(api_rows, nvd_rows):
    """Merge API rows into NVD rows by ``cve_id``, preferring API values.

    For an id present in both inputs, the NVD row is the base and non-empty
    API values for ``title``, ``description`` and ``cvss_v3`` replace it, with
    one exception: an API description that is only the auto-filled fallback
    (``quality == "fallback_desc"``) never replaces an existing NVD
    description. References are unioned in order (NVD first); the
    ``"No references published"`` placeholder is dropped as soon as at least
    one other reference exists. Merged rows get ``source = "CIRCL"`` and the
    API ``collected_at`` when available. Ids present in only one input are
    kept unchanged.

    Args:
        api_rows: Normalized rows from the API source.
        nvd_rows: Normalized rows from the NVD file.

    Returns:
        List of merged rows: NVD rows in their original order, followed by
        API-only rows.
    """
    refs_placeholder = "No references published"

    def ensure_list(x):
        return x if isinstance(x, list) else ([] if x is None else [x])

    def uniq(seq):
        seen, out = set(), []
        for x in seq:
            if x and x not in seen:
                seen.add(x)
                out.append(x)
        return out

    def pick_key(r, idx):
        # Rows without an id get a synthetic key so they are kept, never merged.
        # The key is internal to this function and is not written to the rows.
        k = r.get("cve_id")
        if not k or str(k).strip() == "":
            return f"NOID-{idx}-{hash(json.dumps(r, sort_keys=True, ensure_ascii=False)) & 0xfffffff}"
        return k

    by_key = {}
    for i, r in enumerate(nvd_rows):
        by_key[pick_key(r, i)] = r

    base = len(by_key)
    for j, a in enumerate(api_rows):
        key = pick_key(a, base + j)
        if key not in by_key:
            by_key[key] = a
            continue

        merged = dict(by_key[key])
        for k in ("title", "description", "cvss_v3"):
            v = a.get(k)
            if not v:
                continue
            # Do not let the auto-filled fallback text clobber a real description.
            if k == "description" and a.get("quality") == "fallback_desc" and merged.get("description"):
                continue
            merged[k] = v

        refs = uniq(ensure_list(by_key[key].get("references")) + ensure_list(a.get("references")))
        real_refs = [r for r in refs if r != refs_placeholder]
        # Keep the placeholder only when there is nothing else to show.
        merged["references"] = real_refs or refs
        merged["source"] = "CIRCL"
        merged["collected_at"] = a.get("collected_at") or merged.get("collected_at")
        by_key[key] = merged
    return list(by_key.values())

# Report the input sizes, merge (API values win on conflicts), and report the result.
print(f"[CIRCL] normalized: {len(api_norm)}")
print(f"[NVD local] normalized (now): {len(nvd_norm)}")
final_rows = merge_prefer_api(api_norm, nvd_norm)
# The check mark is the real character (the previous literal was mis-decoded UTF-8).
print(f"[Merge] merged total: {len(final_rows)} ✅")

[CIRCL] raw items: 30
[CIRCL] first item keys: ['schema_version', 'id', 'modified', 'published', 'aliases', 'details', 'severity', 'affected', 'references', 'database_specific']
[CIRCL] normalized: 30
[NVD local] normalized (now): 31902
[Merge] merged total: 31927 ✅


In [7]:
# Cell 7: Save final results to JSON + CSV and print summary
OUT_JSON = "/content/outbox/cybermind_threats.json"
OUT_CSV  = "/content/outbox/cybermind_threats.csv"

# Make sure the destination folder exists even if the setup cell was skipped.
os.makedirs(os.path.dirname(OUT_JSON), exist_ok=True)

# Save JSON (full records, non-ASCII characters written as-is)
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(final_rows, f, indent=2, ensure_ascii=False)

# Prepare CSV rows: description truncated to 200 characters plus an ellipsis,
# references joined into a single "; "-separated string.
rows_csv = []
for r in final_rows:
    desc = r.get("description","") or ""
    # Real ellipsis character (the previous literal was mis-decoded UTF-8).
    desc_short = (desc[:200] + "…") if len(desc) > 200 else desc
    # str() guards against a non-string reference breaking the join.
    refs = "; ".join(str(x) for x in ensure_list(r.get("references")))
    rows_csv.append({
        "cve_id": r.get("cve_id",""),
        "title": r.get("title") or "",
        "description_short": desc_short,
        "cvss_v3": r.get("cvss_v3") or "",
        "references": refs,
        "source": r.get("source",""),
        "collected_at": r.get("collected_at",""),
    })

pd.DataFrame(rows_csv).to_csv(OUT_CSV, index=False, encoding="utf-8")

print("✅ Saved JSON:", OUT_JSON)
print("✅ Saved CSV: ", OUT_CSV)
print("📊 Counts → API:", len(api_norm), "| NVD local:", len(nvd_norm), "| Final:", len(final_rows))

✅ Saved JSON: /content/outbox/cybermind_threats.json
✅ Saved CSV:  /content/outbox/cybermind_threats.csv
📊 Counts → API: 30 | NVD local: 31902 | Final: 31927
