# Reproducibility
- Notebook: tab_rq2_osv_summary.ipynb
- Data sources:
  - replication/data/platform-counts.json
  - replication/data/platforms/<platform>/top100.json
  - replication/data/sbom + osv/osv-scans/*.analysis.json
- Expected output:
  - Table: SBOM coverage and OSV vulnerability severity by platform (tab:rq2_osv_summary)


In [1]:
import json
import math
import re
from pathlib import Path

import numpy as np
import pandas as pd


# All inputs live in the sibling "data" directory of the notebook's working directory.
DATA_DIR = (Path.cwd() / ".." / "data").resolve()
PLATFORM_DIR = DATA_DIR / "platforms"

# platform-counts.json keeps its per-platform entries under the "platforms" key.
platform_counts = json.loads(
    (DATA_DIR / "platform-counts.json").read_text(encoding="utf-8")
)["platforms"]

# Platform identifiers, in the order they appear in platform-counts.json.
PLATFORMS = [item["platform"] for item in platform_counts]


def load_top100(platform):
    """Return the value stored under "top100" in ``<PLATFORM_DIR>/<platform>/top100.json``.

    Raises whatever reading or decoding the file raises (e.g. a missing
    file or invalid JSON), and ``KeyError`` if the "top100" key is absent.
    """
    top100_path = PLATFORM_DIR / platform / "top100.json"
    payload = json.loads(top100_path.read_text(encoding="utf-8"))
    return payload["top100"]


# Directory that is searched for per-repository OSV scan results (*.analysis.json).
OSV_DIR = DATA_DIR / "sbom + osv" / "osv-scans"


Metric definitions:
- SBOM coverage = available SBOMs (sbomStatus "ok" or "cached") / top-100 entries
- OSV coverage = analyzed unique repos / unique repos
- Affected % = affected repos (at least one reported vulnerability) / analyzed repos
- High-Risk % = repos with at least one critical- or high-severity vulnerability / analyzed repos


In [2]:
# Matches an optional scheme ("https://", "git+https://", "ssh://", ...), an
# optional "user@" part, an optional "www." and the GitHub host, anchored at the
# start of the string so that repository names such as "user/user.github.com"
# are never mistaken for a host prefix.
_GITHUB_PREFIX_RE = re.compile(
    r"^(?:[a-z][a-z0-9+.\-]*://)?(?:[^@/\s]+@)?(?:www\.)?github\.com[/:]",
    re.IGNORECASE,
)


def normalize_repo(raw):
    """Reduce a GitHub repository reference to a case-preserving "owner/repo" key.

    Accepted forms include plain ``owner/repo``, ``github.com/owner/repo``,
    ``http(s)://github.com/owner/repo[.git][/sub/path][?query][#fragment]``
    and ``git@github.com:owner/repo.git``.

    Args:
        raw: The repository reference; anything that is not a non-empty
            string yields ``None``.

    Returns:
        ``"owner/repo"``, or ``None`` when the input is not a string, points
        at a non-GitHub host, or does not contain both an owner and a name.
    """
    if not raw or not isinstance(raw, str):
        return None
    repo = raw.strip()

    prefix = _GITHUB_PREFIX_RE.match(repo)
    if prefix:
        repo = repo[prefix.end():].lstrip("/")
    elif "://" in repo or repo.lower().startswith("git@"):
        # A URL / SSH remote that does not point at github.com.
        return None

    # Drop fragment and query string, then any trailing slash.
    repo = repo.split("#")[0].split("?")[0].rstrip("/")
    parts = repo.split("/")
    if len(parts) < 2:
        return None

    owner, name = parts[0], parts[1]
    # Only a trailing ".git" is a clone-URL artifact; ".git" inside the name
    # (e.g. "user.github.io") is part of the repository name and must be kept.
    if name.endswith(".git"):
        name = name[: -len(".git")]
    if not owner or not name:
        return None
    return f"{owner}/{name}"


def load_osv_summary():
    """Collect per-repository vulnerability counts from the OSV scan files.

    Every ``*.analysis.json`` file in ``OSV_DIR`` is read; the repository is
    derived from the file name (suffix removed, ``__`` turned into ``/``) and
    passed through ``normalize_repo``. Files whose name does not normalize to
    a key are skipped.

    Returns:
        dict mapping the normalized repo key to a dict with the integer
        counts ``total``, ``critical``, ``high``, ``medium`` and ``low``.
        Missing or null values count as 0. The dict is empty when
        ``OSV_DIR`` does not exist.
    """
    osv_map = {}
    if not OSV_DIR.exists():
        return osv_map

    suffix = ".analysis.json"
    # glob() yields files in arbitrary order; sorting keeps the result
    # reproducible if two files ever normalize to the same key (last one wins).
    for path in sorted(OSV_DIR.glob("*" + suffix)):
        data = json.loads(path.read_text(encoding="utf-8"))
        # Strip only the trailing suffix so a repository name that itself
        # contains ".analysis" is left intact.
        encoded_name = path.name[: -len(suffix)]
        # NOTE(review): a repository name containing "__" would be split here
        # as well - confirm against the naming scheme used by the scanner.
        repo = encoded_name.replace("__", "/")
        key = normalize_repo(repo)
        if not key:
            continue
        summary = data.get("summary") or {}
        severity = summary.get("severityBreakdown") or {}
        osv_map[key] = {
            "total": summary.get("totalVulnerabilities", 0) or 0,
            "critical": severity.get("CRITICAL", 0) or 0,
            "high": severity.get("HIGH", 0) or 0,
            "medium": severity.get("MEDIUM", 0) or 0,
            "low": severity.get("LOW", 0) or 0,
        }
    return osv_map

# Load the scan summaries once; keys are the "owner/repo" strings produced by normalize_repo.
osv_map = load_osv_summary()


In [3]:
# Columns rendered with one decimal place / as thousands-separated integers.
PERCENT_COLUMNS = ["SBOM Cov. (%)", "OSV Cov. (%)", "Affected (%)", "High-Risk (%)"]
COUNT_COLUMNS = [
    "OSV Analyzed", "Affected Repos", "High-Risk Repos",
    "Total Vulns", "Critical", "High", "Medium", "Low",
]
# sbomStatus values (compared case-insensitively) that count as an available SBOM.
SBOM_AVAILABLE_STATUSES = {"ok", "cached"}


def _percent(numerator, denominator):
    """Return ``numerator / denominator`` as a percentage, or 0 for a zero denominator."""
    return (numerator / denominator * 100) if denominator else 0


def _unique_repos(plugins):
    """Return the set of normalized (case-sensitive) repo keys referenced by ``plugins``."""
    keys = (normalize_repo(plugin.get("repo")) for plugin in plugins)
    return {key for key in keys if key}


def _summary_row(label, plugins, repos):
    """Build one table row for a group of top-100 entries.

    Args:
        label: Text for the "Platform" column.
        plugins: Top-100 entries; SBOM coverage is computed per entry.
        repos: Unique normalized repo keys of those entries; every OSV
            metric is computed per unique repo that has a scan in ``osv_map``.

    Returns:
        dict with the table columns in display order (unformatted numbers).
    """
    sbom_available = sum(
        1
        for plugin in plugins
        if str(plugin.get("sbomStatus") or "").lower() in SBOM_AVAILABLE_STATUSES
    )
    # Only repos with an OSV scan result count as "analyzed".
    scans = [scan for scan in map(osv_map.get, repos) if scan]
    analyzed = len(scans)
    affected = sum(1 for scan in scans if scan["total"] > 0)
    high_risk = sum(1 for scan in scans if scan["critical"] > 0 or scan["high"] > 0)

    return {
        "Platform": label,
        "SBOM Cov. (%)": _percent(sbom_available, len(plugins)),
        "OSV Analyzed": analyzed,
        "OSV Cov. (%)": _percent(analyzed, len(repos)),
        "Affected Repos": affected,
        "Affected (%)": _percent(affected, analyzed),
        "High-Risk Repos": high_risk,
        "High-Risk (%)": _percent(high_risk, analyzed),
        "Total Vulns": sum(scan["total"] for scan in scans),
        "Critical": sum(scan["critical"] for scan in scans),
        "High": sum(scan["high"] for scan in scans),
        "Medium": sum(scan["medium"] for scan in scans),
        "Low": sum(scan["low"] for scan in scans),
    }


rows = []

all_top100 = []
all_unique_repos = set()

# One row per platform, in the order given by platform-counts.json.
for entry in platform_counts:
    top100 = load_top100(entry["platform"])
    repo_set = _unique_repos(top100)
    all_top100.extend(top100)
    all_unique_repos.update(repo_set)
    rows.append(_summary_row(entry["display"], top100, repo_set))

# Overall row: entries are pooled across platforms, and a repo that appears on
# several platforms is counted once, so this row is not the sum of the rows above.
rows.append(_summary_row("Overall", all_top100, all_unique_repos))

df = pd.DataFrame(rows)

# Convert the numeric columns to display strings (the frame is for presentation only).
formatters = {col: (lambda x: f"{x:.1f}") for col in PERCENT_COLUMNS}

for col, fmt in formatters.items():
    df[col] = df[col].map(fmt)

for col in COUNT_COLUMNS:
    df[col] = df[col].map(lambda x: f"{int(x):,}")

df


Unnamed: 0,Platform,SBOM Cov. (%),OSV Analyzed,OSV Cov. (%),Affected Repos,Affected (%),High-Risk Repos,High-Risk (%),Total Vulns,Critical,High,Medium,Low
0,Chrome,88.0,74,77.9,45,60.8,5,6.8,1476,1,21,143,1311
1,Firefox,75.0,68,68.7,32,47.1,2,2.9,1019,0,3,86,930
2,JetBrains,88.0,48,71.6,7,14.6,0,0.0,101,0,0,2,99
3,VS Code,98.0,78,86.7,59,75.6,5,6.4,1043,0,7,74,962
4,Sublime,68.0,60,60.0,4,6.7,2,3.3,155,0,8,3,144
5,WordPress,78.0,62,68.1,36,58.1,6,9.7,1094,0,15,127,952
6,Minecraft,63.0,54,54.0,3,5.6,0,0.0,157,0,0,25,132
7,Obsidian,88.0,78,78.0,61,78.2,1,1.3,708,0,1,66,641
8,Home Assistant,87.0,74,74.0,49,66.2,1,1.4,811,1,0,43,767
9,Overall,81.4,582,70.8,285,49.0,22,3.8,6277,2,55,537,5683
