# Reproducibility
- Notebook: tab_rq1_dist.ipynb
- Data sources:
  - replication/data/platforms/<platform>/top100.json
- Expected output:
  - Table: Top languages and licenses across the 900-plugin top subset (tab:rq1_dist)


In [1]:
from pathlib import Path
import json
import pandas as pd
from collections import Counter

# The data directory is resolved relative to the current working directory,
# so the notebook must be started from a folder that sits next to ``data``.
DATA_DIR = (Path.cwd() / ".." / "data").resolve()
PLATFORM_DIR = DATA_DIR / "platforms"

# One sub-directory per platform. ``iterdir`` yields entries in arbitrary,
# filesystem-dependent order, and ``Counter.most_common`` breaks ties by
# first-seen order, so the names are sorted to keep the ranking of tied
# languages/licenses identical across machines and runs.
platforms = sorted(p.name for p in PLATFORM_DIR.iterdir() if p.is_dir())


def load_top100(platform):
    """Return the value stored under ``"top100"`` in a platform's JSON file.

    The file is read from ``PLATFORM_DIR / platform / "top100.json"`` as
    UTF-8 text. A missing file, invalid JSON, or a missing ``"top100"`` key
    propagates as the corresponding exception.
    """
    source = PLATFORM_DIR / platform / "top100.json"
    payload = json.loads(source.read_text(encoding="utf-8"))
    return payload["top100"]


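Metric definitions:
- Language count: primary repo language per entry, read from `githubStats.language` with a fallback to the entry's own `language` field (missing -> Unknown)
- License count: repo license, read from `githubStats.license` with a fallback to the entry's own `license` field and normalized to a short label (missing -> No License)
- Percentage = count / total number of top-100 entries loaded * 100 (the total is 900 for the published table)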


In [2]:
# Running tallies keyed by language name and by normalized license label.
language_counts, license_counts = Counter(), Counter()

# Long license names as they appear in the data -> short labels used in the
# table. Names that are not listed here are kept unchanged by the lookup in
# ``normalize_license``.
LICENSE_MAP = {
    # Permissive / copyleft families with an abbreviated label.
    "MIT License": "MIT",
    "GNU General Public License v3.0": "GPLv3",
    "Apache License 2.0": "Apache 2.0",
    "GNU General Public License v2.0": "GPLv2",
    "Mozilla Public License 2.0": "MPL 2.0",
    "GNU Lesser General Public License v3.0": "LGPL v3",
    "GNU Affero General Public License v3.0": "AGPL v3",
    'BSD 3-Clause "New" or "Revised" License': "BSD-3-Clause",
    # Identity entries: these labels are already in their final form.
    "No License": "No License",
    "Other": "Other",
}


def normalize_license(name):
    """Map a raw license name to the short label used in the table.

    Falsy values (``None``, ``""``) and names that are empty after
    stripping surrounding whitespace are reported as ``"No License"``.
    Any other value is converted to ``str``, stripped, and looked up in
    ``LICENSE_MAP``; names without a mapping are returned stripped but
    otherwise unchanged.
    """
    if not name:
        return "No License"
    cleaned = str(name).strip()
    # A whitespace-only name carries no license information; without this
    # guard it would be counted under an empty-string label.
    if not cleaned:
        return "No License"
    return LICENSE_MAP.get(cleaned, cleaned)


# --- Tally languages and licenses over every platform's top-100 entries ---
top100_total = 0
for platform in platforms:
    entries = load_top100(platform)
    top100_total += len(entries)
    for entry in entries:
        # Prefer the GitHub statistics; fall back to the entry's own fields,
        # then to the placeholder label.
        gh_stats = entry.get("githubStats") or {}
        lang_label = gh_stats.get("language") or entry.get("language") or "Unknown"
        raw_license = gh_stats.get("license") or entry.get("license") or "No License"
        lic_label = normalize_license(raw_license)
        language_counts[lang_label] += 1
        license_counts[lic_label] += 1

languages = language_counts.most_common(10)
licenses = license_counts.most_common(10)


def _share_of_total(count):
    """Return ``count`` as a percentage of all loaded entries (0 if none)."""
    return (count / top100_total) * 100 if top100_total else 0


# --- Build exactly ten rows, padding a short ranking with empty slots ---
_empty_slots = [("", 0)] * 10
_lang_slots = (languages + _empty_slots)[:10]
_lic_slots = (licenses + _empty_slots)[:10]

rows = [
    [
        lang_name,
        lang_count,
        _share_of_total(lang_count),
        lic_name,
        lic_count,
        _share_of_total(lic_count),
    ]
    for (lang_name, lang_count), (lic_name, lic_count) in zip(_lang_slots, _lic_slots)
]

df = pd.DataFrame(
    rows,
    columns=["Language", "Lang Count", "Lang %", "License", "Lic Count", "Lic %"],
)

# Render counts as integer strings and percentages with one decimal place.
for count_col in ("Lang Count", "Lic Count"):
    df[count_col] = df[count_col].map(lambda value: f"{int(value)}")
for pct_col in ("Lang %", "Lic %"):
    df[pct_col] = df[pct_col].map(lambda value: f"{value:.1f}")

# The target table repeats the "Count" and "%" headers for both halves, so
# the duplicate column names produced here are intentional.
df = df.rename(
    columns={
        "Lang Count": "Count",
        "Lang %": "%",
        "Lic Count": "Count",
        "Lic %": "%",
    }
)

df


Unnamed: 0,Language,Count,%,License,Count.1,%.1
0,TypeScript,258,28.7,MIT,341,37.9
1,JavaScript,186,20.7,No License,145,16.1
2,Java,152,16.9,Other,98,10.9
3,Python,89,9.9,GPLv3,90,10.0
4,PHP,89,9.9,Apache 2.0,72,8.0
5,Unknown,51,5.7,GPLv2,45,5.0
6,Kotlin,34,3.8,MPL 2.0,30,3.3
7,CSS,7,0.8,LGPL v3,27,3.0
8,Dockerfile,6,0.7,AGPL v3,13,1.4
9,HTML,5,0.6,BSD-3-Clause,12,1.3
