# Reproducibility
- Notebook: tab_rq1_scale.ipynb
- Data sources:
  - replication/data/platform-counts.json
  - replication/data/platform-star-concentration-size.json
  - replication/data/platforms/<platform>/top100.json
  - replication/data/platforms/analytics-summary.json
- Expected output:
  - Table: Platform scale and engagement (top-100 GitHub subset) (tab:rq1_scale)


In [1]:
from pathlib import Path
import json
import math
import pandas as pd
import numpy as np


# Locate the data directory relative to the current working directory:
# it is expected at ``../data`` from wherever the notebook is run.
DATA_DIR = Path.cwd().joinpath("..", "data").resolve()
PLATFORM_DIR = DATA_DIR.joinpath("platforms")

# platform-counts.json keeps its per-platform records under the "platforms" key.
platform_counts = json.loads(
    DATA_DIR.joinpath("platform-counts.json").read_text(encoding="utf-8")
)["platforms"]

# Platform identifiers, in the order they appear in platform-counts.json.
PLATFORMS = [record["platform"] for record in platform_counts]


def load_top100(platform):
    """Read and parse ``<PLATFORM_DIR>/<platform>/top100.json``.

    Args:
        platform: Name of the platform sub-directory under ``PLATFORM_DIR``.

    Returns:
        The decoded JSON document, exactly as stored in the file.

    Raises:
        OSError: If the file cannot be read (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    top100_path = PLATFORM_DIR.joinpath(platform, "top100.json")
    return json.loads(top100_path.read_text(encoding="utf-8"))


from datetime import datetime, timezone

# Per-platform sizes; used later as the input to log10 in the Star Concentration Index.
with (DATA_DIR / "platform-star-concentration-size.json").open("r", encoding="utf-8") as handle:
    star_size = json.load(handle)["platform_sizes"]

# analytics-summary.json provides the snapshot time ("generatedAt") that the
# abandonment calculation measures repository age against.
analytics_summary = json.loads((PLATFORM_DIR / "analytics-summary.json").read_text(encoding="utf-8"))
reference_time = analytics_summary.get("generatedAt")
if reference_time:
    # fromisoformat() does not accept a trailing "Z" on older Python versions,
    # so spell the UTC offset out explicitly.
    reference_now = datetime.fromisoformat(reference_time.replace("Z", "+00:00"))
    if reference_now.tzinfo is None:
        # A timestamp without an offset parses as a naive datetime. Plugin
        # timestamps are normalised to tz-aware UTC before being subtracted
        # from reference_now, and mixing naive and aware values raises
        # TypeError, so treat an offset-less snapshot time as UTC too.
        reference_now = reference_now.replace(tzinfo=timezone.utc)
else:
    # No snapshot time recorded: fall back to the wall clock. Results that
    # depend on reference_now then vary with the time the notebook is run.
    reference_now = datetime.now(timezone.utc)

# Age threshold, in days, beyond which a repository counts as abandoned.
ONE_YEAR_DAYS = 365


Metric definitions:
- Avg Stars = total stars / top-100 count
- Avg Downloads = total downloads / top-100 count
- Issue Density = total open issues / total stars
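- Abandonment Rate = fraction of the top-100 repos whose last update is more than 365 whole days older than the analytics-summary.json `generatedAt` timestamp; repos with no usable last-update timestamp stay in the denominator and count as not abandoned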
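- Star Concentration Index = Avg Stars / log10(platformSize), where platformSize is read from platform-star-concentration-size.json (falling back to the platform's OSS GitHub count when the platform is missing there); the index is reported as 0 when platformSize is 0 or 1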


In [2]:
# Aggregate raw totals per platform from its top-100 file. Missing or null
# "githubStats", "stars", "downloads" and "openIssues" values all count as 0.
base_metrics = {}
for entry in platform_counts:
    platform = entry["platform"]
    plugins = load_top100(platform).get("top100") or []
    github_stats = [(plugin.get("githubStats") or {}) for plugin in plugins]

    base_metrics[platform] = {
        "display": entry["display"],
        "all_plugins": entry["all_plugins"],
        "oss_github": entry["oss_github"],
        "top100_count": len(plugins),
        "total_stars": sum(stats.get("stars", 0) or 0 for stats in github_stats),
        "total_downloads": sum(plugin.get("downloads", 0) or 0 for plugin in plugins),
        "total_issues": sum(stats.get("openIssues", 0) or 0 for stats in github_stats),
    }


In [3]:
# Per-plugin averages over the top-100 sample; an empty sample yields 0 for both.
for metrics in base_metrics.values():
    sample_size = metrics["top100_count"]
    if sample_size:
        averages = {
            "avg_stars": metrics["total_stars"] / sample_size,
            "avg_downloads": metrics["total_downloads"] / sample_size,
        }
    else:
        averages = {"avg_stars": 0, "avg_downloads": 0}
    metrics.update(averages)


In [4]:
# Issue density = open issues per star; defined as 0 when a platform has no stars.
for metrics in base_metrics.values():
    star_total = metrics["total_stars"]
    issue_total = metrics["total_issues"]
    if star_total:
        metrics["issue_density"] = issue_total / star_total
    else:
        metrics["issue_density"] = 0


In [5]:
# Abandonment rate: share of the top-100 sample whose last update lies more
# than ONE_YEAR_DAYS whole days before reference_now. Plugins with a missing
# or unparseable timestamp are never counted as abandoned, but they remain in
# the denominator (top100_count).
for platform, metrics in base_metrics.items():
    plugins = load_top100(platform).get("top100") or []
    stale_count = 0
    for plugin in plugins:
        github_stats = plugin.get("githubStats") or {}
        # Prefer the GitHub timestamp; fall back to the plugin-level one.
        timestamp = github_stats.get("lastUpdated") or plugin.get("lastUpdated")
        if timestamp:
            try:
                updated_at = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
            except ValueError:
                # Unparseable timestamps are skipped, same as missing ones.
                pass
            else:
                if updated_at.tzinfo is None:
                    # Offset-less timestamps are interpreted as UTC.
                    updated_at = updated_at.replace(tzinfo=timezone.utc)
                age = reference_now - updated_at
                # timedelta.days drops the partial day, so the age must reach
                # ONE_YEAR_DAYS + 1 full days before the plugin is counted.
                if age.days > ONE_YEAR_DAYS:
                    stale_count += 1
    sample_size = metrics["top100_count"]
    metrics["abandonment"] = stale_count / sample_size if sample_size else 0


In [6]:
# Star Concentration Index = average stars divided by log10 of the platform size.
for platform, metrics in base_metrics.items():
    # Use the size recorded in star_size; platforms absent from it fall back
    # to their OSS GitHub count.
    fallback_size = metrics["oss_github"]
    size = star_size.get(platform, fallback_size)
    log_size = math.log10(size) if size else 0
    if log_size:
        metrics["star_concentration_index"] = metrics["avg_stars"] / log_size
    else:
        # A size of 0 (or 1, whose log10 is 0) leaves the index undefined; report 0.
        metrics["star_concentration_index"] = 0


In [7]:
# Table column label -> key in each platform's base_metrics record.
column_sources = {
    "Platform": "display",
    "All": "all_plugins",
    "OSS": "oss_github",
    "Avg Stars": "avg_stars",
    "Avg Downloads": "avg_downloads",
    "Issue Density": "issue_density",
    "Abandon.": "abandonment",
    "Star Conc. Idx": "star_concentration_index",
}

# One row per platform, in platform-counts.json order.
rows = [
    {label: base_metrics[entry["platform"]][key] for label, key in column_sources.items()}
    for entry in platform_counts
]

df = pd.DataFrame(rows)

# Display formats: thousands separators for counts and averages, fixed
# decimals for the ratios. "Platform" is left as-is.
formatters = {
    "All": lambda x: format(int(x), ","),
    "OSS": lambda x: format(int(x), ","),
    "Avg Stars": "{:,.2f}".format,
    "Avg Downloads": "{:,.2f}".format,
    "Issue Density": "{:.5f}".format,
    "Abandon.": "{:.2f}".format,
    "Star Conc. Idx": "{:,.2f}".format,
}

# Replace each numeric column with its formatted string representation.
for col, fmt in formatters.items():
    df[col] = df[col].map(fmt)

df


Unnamed: 0,Platform,All,OSS,Avg Stars,Avg Downloads,Issue Density,Abandon.,Star Conc. Idx
0,Chrome,246379,7459,7917.51,2336700.0,0.04259,0.26,3287.67
1,Firefox,110320,7862,4798.09,172425.77,0.02534,0.43,1231.69
2,JetBrains,10003,5849,5015.45,5873044.66,0.01837,0.18,1331.39
3,VS Code,86145,25136,1844.66,29508137.3,0.17646,0.24,419.21
4,Sublime,5581,4694,594.52,494755.44,0.03004,0.79,161.93
5,WordPress,59000,3986,317.18,71800.0,0.355,0.34,88.09
6,Minecraft,98600,26089,303.85,22176204.03,0.17548,0.03,68.8
7,Obsidian,2656,2656,915.75,387343.37,0.09797,0.47,267.43
8,Home Assistant,5187,2389,710.25,43566.95,0.07762,0.34,210.24
