# Reproducibility
- Notebook: tab_rq3_generic_summary.ipynb
- Data sources:
  - replication/data/platform-counts.json
  - replication/data/platforms/<platform>/top100.json
  - replication/data/classification/classifications_groq.json
- Expected output:
  - Table: Classification coverage and top eight generic categories by platform (tab:rq3_generic_summary)


In [1]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

# The data directory lives one level above the current working directory.
DATA_DIR = Path.cwd().joinpath("..", "data").resolve()
PLATFORM_DIR = DATA_DIR.joinpath("platforms")

# Platform registry: the "platforms" value of platform-counts.json.
with open(DATA_DIR.joinpath("platform-counts.json"), mode="r", encoding="utf-8") as handle:
    platform_counts = json.load(handle)["platforms"]

# Platform identifiers, in registry order.
PLATFORMS = [record["platform"] for record in platform_counts]


def load_top100(platform):
    """Return the ``"top100"`` value stored in ``<platform>/top100.json``.

    The file is looked up under ``PLATFORM_DIR`` and decoded as UTF-8 JSON.
    """
    top100_file = PLATFORM_DIR.joinpath(platform, "top100.json")
    payload = json.loads(top100_file.read_text(encoding="utf-8"))
    return payload["top100"]


CLASSIFICATIONS_PATH = DATA_DIR.joinpath("classification", "classifications_groq.json")

# Parsed classification records; later cells iterate this as a sequence of
# dict-like entries.
with open(CLASSIFICATIONS_PATH, mode="r", encoding="utf-8") as handle:
    classifications = json.load(handle)


In [2]:
from collections import Counter

# Maps generic-category keys (the strings counted from each entry's
# "generic_categories" list) to the abbreviated column labels of the summary
# table. Insertion order is the order in which the category columns are added
# to each table row.
CATEGORY_MAP = {
    "productivity_workflow": "Prod.",
    "ui_customization": "UI Cust.",
    "developer_tools": "Dev Tools",
    "utilities_misc": "Utilities",
    "language_support": "Language",
    "integrations_connectors": "Integrations",
    "privacy_security": "Privacy/Sec.",
    "code_quality_linting": "Code Qual.",
}


def normalize_repo(raw):
    """Reduce a GitHub repository reference to its ``owner/name`` slug.

    Accepts bare ``owner/name`` strings as well as GitHub URLs in HTTP(S) or
    SSH form (``https://github.com/owner/name.git``,
    ``git@github.com:owner/name``). Fragments, query strings, extra path
    segments and a trailing ``.git`` on the repository name are discarded.

    Args:
        raw: Repository reference. Anything that is not a non-empty string
            yields ``None``.

    Returns:
        ``"owner/name"`` in the original letter case, or ``None`` when the
        value is a URL that does not mention github.com or when it does not
        contain both a non-empty owner and a non-empty repository name.
    """
    if not raw or not isinstance(raw, str):
        return None
    repo = raw.strip()
    lower = repo.lower()
    if lower.startswith(("http://", "https://", "git@")) and "github.com" not in lower:
        return None

    # Cut everything up to and including the GitHub host. Searching the
    # lower-cased copy makes the match case-insensitive and also covers
    # prefixes such as "https://www.github.com/" or "ssh://git@github.com/".
    for host in ("git@github.com:", "github.com/"):
        start = lower.find(host)
        if start != -1:
            repo = repo[start + len(host):]
            break

    repo = repo.split("#")[0].split("?")[0].strip("/")
    parts = repo.split("/")
    if len(parts) < 2:
        return None
    owner, name = parts[0], parts[1]
    # Strip ".git" only as a suffix of the repository name: removing it
    # anywhere in the string corrupts names such as "user.github.io".
    if name.endswith(".git"):
        name = name[:-len(".git")]
    if not owner or not name:
        return None
    return f"{owner}/{name}"


def dedupe_by_repo(entries):
    """Return the entries with duplicate repositories removed.

    Each entry's ``"repo"`` value is normalized with ``normalize_repo``; the
    first entry seen for a given normalized repo is kept, later ones are
    dropped, and entries whose repo cannot be normalized are skipped.
    Input order is preserved.
    """
    first_by_repo = {}
    for item in entries:
        slug = normalize_repo(item.get("repo"))
        if slug and slug not in first_by_repo:
            # dicts keep insertion order, so values() is in first-seen order.
            first_by_repo[slug] = item
    return list(first_by_repo.values())


def category_counts(entries):
    """Count how many entries mention each generic category.

    A category is counted at most once per entry, even if it is repeated in
    that entry's ``"generic_categories"`` value. Non-string and empty
    category values are ignored.

    Args:
        entries: Iterable of dict-like records exposing ``.get``.

    Returns:
        A ``Counter`` mapping category name to the number of entries that
        list it.
    """
    counter = Counter()
    for entry in entries:
        cats = entry.get("generic_categories") or []
        if isinstance(cats, str):
            # A bare string is one category; iterating it would count each
            # character as a separate category.
            cats = [cats]
        elif not isinstance(cats, (list, tuple, set, frozenset)):
            # Any other malformed value contributes nothing instead of
            # raising or being iterated in an unintended way.
            cats = []
        counter.update({c for c in cats if isinstance(c, str) and c})
    return counter


In [3]:
# One dict per table row is appended here by the code below.
rows = []

# Bucket classification entries by platform; entries with a missing or empty
# "platform" value are collected under "unknown".
by_platform = {}
for entry in classifications:
    platform = entry.get("platform") or "unknown"
    if platform not in by_platform:
        by_platform[platform] = []
    by_platform[platform].append(entry)

# Category statistics over all entries, deduplicated by repo across platforms.
all_entries_unique = dedupe_by_repo(classifications)
all_unique_count = len(all_entries_unique)
all_counts = category_counts(all_entries_unique)

for entry in platform_counts:
    platform = entry["platform"]

    # Coverage denominator: number of items in the platform's top100 list.
    top100 = load_top100(platform)
    total_entries = len(top100)

    entries = by_platform.get(platform, [])
    classified = len(entries)
    if total_entries:
        coverage = classified / total_entries * 100
    else:
        coverage = 0

    # Mean confidence over entries that carry a numeric confidence value.
    confidences = [
        e.get("confidence")
        for e in entries
        if isinstance(e.get("confidence"), (int, float))
    ]
    avg_conf = float(np.mean(confidences)) if confidences else 0

    # Share of classified entries not flagged with readme_missing=True.
    missing_readme = len([e for e in entries if e.get("readme_missing") is True])
    if classified:
        readme_avail = (classified - missing_readme) / classified * 100
    else:
        readme_avail = 0

    # Category shares are computed over repo-deduplicated entries.
    unique_entries = dedupe_by_repo(entries)
    unique_count = len(unique_entries)
    counts = category_counts(unique_entries)

    row = {
        "Platform": entry["display"],
        "Classified": classified,
        "Coverage (%)": coverage,
        "Avg Conf.": avg_conf,
        "README Avail. (%)": readme_avail,
    }

    for key, label in CATEGORY_MAP.items():
        if unique_count:
            value = counts.get(key, 0) / unique_count * 100
        else:
            value = 0
        row[label] = value

    rows.append(row)

# Overall row: the same statistics computed over every classification entry.
classified_all = len(classifications)

# Coverage denominator: the actual number of top100 items across all
# platforms, consistent with the per-platform rows (which divide by
# len(top100)). A hard-coded 100 per platform is wrong whenever a platform's
# list does not contain exactly 100 items.
total_entries_all = sum(len(load_top100(p["platform"])) for p in platform_counts)
coverage_all = (classified_all / total_entries_all * 100) if total_entries_all else 0

# Mean confidence over entries that carry a numeric confidence value.
confidences_all = [e.get("confidence") for e in classifications if isinstance(e.get("confidence"), (int, float))]
avg_conf_all = float(np.mean(confidences_all)) if confidences_all else 0

# Share of entries not flagged with readme_missing=True.
missing_readme_all = sum(1 for e in classifications if e.get("readme_missing") is True)
readme_avail_all = ((classified_all - missing_readme_all) / classified_all * 100) if classified_all else 0

overall_row = {
    "Platform": "Overall",
    "Classified": classified_all,
    "Coverage (%)": coverage_all,
    "Avg Conf.": avg_conf_all,
    "README Avail. (%)": readme_avail_all,
}

# Category shares use the globally repo-deduplicated entries as denominator.
for key, label in CATEGORY_MAP.items():
    value = (all_counts.get(key, 0) / all_unique_count * 100) if all_unique_count else 0
    overall_row[label] = value

rows.append(overall_row)

df = pd.DataFrame(rows)

# Formatting: thousands separator for the count, two decimals for the mean
# confidence, one decimal for every percentage column.
df["Classified"] = df["Classified"].map(lambda x: f"{int(x):,}")
df["Avg Conf."] = df["Avg Conf."].map(lambda x: f"{x:.2f}")

for label in ["Coverage (%)", "README Avail. (%)", *CATEGORY_MAP.values()]:
    df[label] = df[label].map(lambda x: f"{x:.1f}")

# Last expression of the cell: displays the formatted table.
df


Unnamed: 0,Platform,Classified,Coverage (%),Avg Conf.,README Avail. (%),Prod.,UI Cust.,Dev Tools,Utilities,Language,Integrations,Privacy/Sec.,Code Qual.
0,Chrome,95,95.0,0.87,97.9,49.5,23.2,40.0,25.3,3.2,7.4,28.4,1.1
1,Firefox,99,99.0,0.88,94.9,38.4,27.3,17.2,22.2,3.0,3.0,36.4,0.0
2,JetBrains,67,67.0,0.89,95.5,56.7,35.8,61.2,3.0,19.4,6.0,1.5,14.9
3,VS Code,90,90.0,0.91,100.0,67.8,14.4,63.3,4.4,27.8,3.3,0.0,12.2
4,Sublime,100,100.0,0.9,97.0,66.0,31.0,51.0,9.0,23.0,2.0,0.0,16.0
5,WordPress,91,91.0,0.86,86.8,45.1,40.7,48.4,12.1,4.4,19.8,6.6,1.1
6,Minecraft,100,100.0,0.88,94.0,23.0,46.0,28.0,35.0,2.0,1.0,1.0,0.0
7,Obsidian,100,100.0,0.87,100.0,86.0,53.0,21.0,11.0,3.0,11.0,0.0,2.0
8,Home Assistant,100,100.0,0.9,100.0,10.0,78.0,4.0,17.0,0.0,27.0,0.0,0.0
9,Overall,842,93.6,0.88,96.3,49.0,39.8,35.9,16.1,9.2,9.2,7.8,5.0
