In [31]:
# === Bootstrap cell: shared config, paths, JSON I/O, and GitHub REST/GraphQL helpers ===

import os, json, time, re, random, pathlib
from typing import Dict, Any, Optional, Tuple, List
from datetime import datetime, timezone

import requests
import yaml
from dotenv import load_dotenv

# -------------------------------------------------------------------
# Locate repo root reliably (works no matter where you open the .ipynb)
# -------------------------------------------------------------------
def _find_repo_root(start: pathlib.Path) -> pathlib.Path:
    """Return the nearest directory at or above ``start`` that holds ``config/config.yaml``.

    ``start`` is resolved first.  Only ``start`` itself and its first seven
    ancestors are inspected (eight candidates in total), so the search is bounded.

    Raises:
        FileNotFoundError: if none of the inspected directories contains the
            config file.
    """
    origin = start.resolve()
    # ``parents`` does not include ``origin`` itself, so prepend it before capping.
    candidates = [origin, *origin.parents][:8]
    for candidate in candidates:
        marker = candidate / "config" / "config.yaml"
        if marker.exists():
            return candidate
    raise FileNotFoundError("Couldn't locate repo root (no config/config.yaml found upward).")

# The search starts from the current working directory, so the notebook can be
# opened from any sub-folder of the repository.
REPO_ROOT = _find_repo_root(pathlib.Path.cwd())
CONFIG_PATH = REPO_ROOT / "config" / "config.yaml"

# -------------------------------------------------------------------
# Load config + token
# -------------------------------------------------------------------
# An empty YAML document makes safe_load return None; `or {}` keeps CFG a dict.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    CFG: Dict[str, Any] = yaml.safe_load(f) or {}

load_dotenv(REPO_ROOT / ".env")  # local only; do not commit .env
# Name of the environment variable that holds the token (missing/null → "GITHUB_TOKEN").
TOKEN_ENV = (CFG.get("github_token_env") or "GITHUB_TOKEN").strip()
GITHUB_TOKEN = os.getenv(TOKEN_ENV, "").strip()
# Fail fast: every helper below sends this token in the Authorization header.
if not GITHUB_TOKEN:
    raise RuntimeError(f"Missing token in environment variable {TOKEN_ENV}. "
                       f"Create .env from .env.example and set {TOKEN_ENV}=...")

# Repo + output root
# "repo" is mandatory: a missing key raises KeyError, and a value without "/"
# makes the two-name unpacking below raise ValueError.
REPO = CFG["repo"]  # "owner/name"
OWNER, NAME = REPO.split("/", 1)
OUT_ROOT = pathlib.Path(CFG.get("out_root") or NAME)  # "<name>" if null

# Behavior
# NOTE(review): the default (True) applies only when the key is absent; a key
# that is present but null yields bool(None) == False — confirm that is intended.
OVERWRITE = bool(CFG.get("overwrite", True))
VERBOSE = bool(CFG.get("verbose_logs", True))

# Networking knobs
# NOTE(review): unlike the `or`-style fallbacks above, a key that is present but
# null reaches int(None) and raises TypeError here.
REQ_TIMEOUT = int(CFG.get("request_timeout_sec", 30))
MAX_RETRIES = int(CFG.get("max_retries", 4))
BACKOFF_BASE_MS = int(CFG.get("backoff_base_ms", 400))
BACKOFF_JITTER_MS = int(CFG.get("backoff_jitter_ms", 250))
RESPECT_RL = bool(CFG.get("respect_rate_limits", True))

# -------------------------------------------------------------------
# Tiny logging helpers
# -------------------------------------------------------------------
def log(msg: str) -> None:
    """Print ``msg`` (flushed immediately) only when ``VERBOSE`` is truthy."""
    if not VERBOSE:
        return
    print(msg, flush=True)

def warn(msg: str) -> None:
    """Print ``msg`` behind a warning-sign prefix and flush; always printed."""
    line = f"⚠️  {msg}"
    print(line, flush=True)

# Echo the resolved root with a plain print (not log), so it is shown even when VERBOSE is off.
print("Using repo root:", REPO_ROOT)

# -------------------------------------------------------------------
# Canonical output layout helpers
# -------------------------------------------------------------------
def ensure_dir(p: pathlib.Path) -> None:
    """Create directory ``p`` together with any missing parents.

    An already existing directory is not an error.
    """
    p.mkdir(exist_ok=True, parents=True)

def repo_root() -> pathlib.Path:
    """Return ``OUT_ROOT``, creating that directory first if it is missing."""
    ensure_dir(OUT_ROOT)
    return OUT_ROOT

def tags_all_json() -> pathlib.Path:
    """Return ``<output root>/tags/tags.all.json``.

    The ``tags`` directory is created if needed; the JSON file itself is not
    touched.
    """
    tags_dir = repo_root().joinpath("tags")
    ensure_dir(tags_dir)
    return tags_dir.joinpath("tags.all.json")

def series_dir(kind: str, series: str) -> pathlib.Path:
    """Return ``<output root>/<kind>/<series>``, creating the directory if needed."""
    target = repo_root().joinpath(kind, series)
    ensure_dir(target)
    return target

def pair_stem(base: str, compare: str) -> str:
    """Build the ``<base>...<compare>`` stem used in per-pair file names."""
    return "{}...{}".format(base, compare)

def pair_json(series: str, stem: str, kind: str) -> pathlib.Path:
    """Return ``<output root>/<kind>/<series>/<stem>.<kind>.json``.

    ``kind`` is expected to be one of "compare", "commits", "pulls" or "issues";
    it is used both as the folder name and as the file-name infix.  The folder
    is created if needed.
    """
    filename = f"{stem}.{kind}.json"
    return series_dir(kind, series) / filename

def capsule_json(series: str, stem: str) -> pathlib.Path:
    """Return the artifacts-file path for a pair under ``commits_pr_issue/<series>/``.

    The folder is created if needed.
    """
    # NOTE(review): "tarce_artifacts" looks like a typo for "trace_artifacts".
    # It is kept byte-for-byte because files may already exist under this name.
    target_dir = series_dir("commits_pr_issue", series)
    return target_dir / f"{stem}.tarce_artifacts.json"

# -------------------------------------------------------------------
# JSON I/O
# -------------------------------------------------------------------
def read_json(path: pathlib.Path) -> Optional[Dict[str, Any]]:
    """Return the decoded JSON document stored at ``path`` (read as UTF-8).

    Returns ``None`` when ``path`` does not exist.  Decoding errors from the
    ``json`` module are not caught.
    """
    if path.exists():
        with open(path, "r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    return None

def write_json(path: pathlib.Path, payload: Dict[str, Any]) -> None:
    """Serialize ``payload`` to ``path`` as indented UTF-8 JSON.

    The parent directory is created if needed.  When ``path`` already exists
    and ``OVERWRITE`` is false, the call only logs a "Skip" line and returns.

    The data is written to a sibling ``<name><suffix>.tmp`` file first and then
    moved over ``path``, so a failure while serializing never truncates an
    existing output.  If serializing or moving fails, the temporary file is
    removed before the error is re-raised, so no stale ``*.tmp`` is left behind.

    Raises:
        Whatever ``json.dump`` or the file operations raise (for example
        ``TypeError`` for a non-serializable payload, or ``OSError``).
    """
    ensure_dir(path.parent)
    if path.exists() and not OVERWRITE:
        log(f"Skip (exists): {path}")
        return
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        tmp.replace(path)
    except BaseException:
        # Clean up the partial temp file; the original error is what matters.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
    log(f"✓ Wrote {path}")

# -------------------------------------------------------------------
# GitHub HTTP session + rate-limit aware helpers
# -------------------------------------------------------------------
# One shared Session: every REST and GraphQL helper below sends its requests
# through it, so they all carry the same headers.
SESSION = requests.Session()
SESSION.headers.update({
    "Authorization": f"Bearer {GITHUB_TOKEN}",  # token loaded from the environment above
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",  # pins the REST API version
    "User-Agent": f"notebook-pipeline/{NAME}"
})

def _parse_reset_epoch(headers: Dict[str, Any]) -> Optional[int]:
    """Return the ``X-RateLimit-Reset`` header as an int, or ``None``.

    The canonical spelling is tried first and the all-lowercase spelling second.
    Any failure (header missing, value not an integer, ``headers`` not a
    mapping) yields ``None`` instead of raising.
    """
    try:
        primary = headers.get("X-RateLimit-Reset")
        raw = primary if primary else headers.get("x-ratelimit-reset")
        return int(raw)
    except Exception:
        return None

def _maybe_sleep_for_reset(resp: requests.Response) -> None:
    """Sleep until the rate-limit window resets when ``resp`` reports no quota left.

    Does nothing unless ``RESPECT_RL`` is truthy, the remaining-requests header
    is a non-negative integer equal to zero, and a usable reset epoch is present.
    The sleep lasts until the reset time plus one second (at least one second).
    """
    if not RESPECT_RL:
        return
    left = resp.headers.get("X-RateLimit-Remaining") or resp.headers.get("x-ratelimit-remaining")
    if left is None or not str(left).isdigit() or int(left) > 0:
        return
    reset_at = _parse_reset_epoch(resp.headers)
    if not reset_at:
        return
    current = int(time.time())
    wait_s = max(0, reset_at - current) + 1
    warn(f"Rate limit reached. Sleeping ~{wait_s}s until reset …")
    time.sleep(wait_s)

def _backoff_sleep(i: int) -> None:
    """Sleep for an exponentially growing delay plus random jitter.

    The delay is ``2**i * BACKOFF_BASE_MS`` milliseconds plus a uniformly random
    amount between 0 and ``BACKOFF_JITTER_MS`` milliseconds; ``i`` is the
    zero-based attempt index.
    """
    base_s = BACKOFF_BASE_MS / 1000.0
    jitter_s = random.uniform(0, BACKOFF_JITTER_MS / 1000.0)
    delay_s = (2 ** i) * base_s + jitter_s
    time.sleep(delay_s)

def rest_get_json(url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    GET ``url`` through the shared ``SESSION`` and return the decoded JSON body.

    At most ``MAX_RETRIES + 1`` attempts are made:
      * transport errors, 5xx responses and undecodable bodies are retried
        after an exponential backoff;
      * 403/429 responses that carry a numeric ``Retry-After`` header, or that
        report an exhausted quota (``X-RateLimit-Remaining: 0``), count as rate
        limiting and are retried after waiting; a bare 429 is retried after a
        backoff;
      * every other 4xx response is permanent (unknown ref, missing repo, bad
        token, ...) and raises immediately instead of using up the retries.
    No sleep is performed after the last attempt.

    Raises:
        RuntimeError: on a non-retryable 4xx response, or once all attempts
            have failed (the message carries the last error seen).
    """
    last_err = None
    attempts = MAX_RETRIES + 1
    for i in range(attempts):
        final = i == attempts - 1  # nothing follows, so sleeping would be wasted
        try:
            r = SESSION.get(url, params=params, timeout=REQ_TIMEOUT)
        except requests.RequestException as e:
            last_err = str(e)
            if not final:
                _backoff_sleep(i)
            continue

        status = r.status_code

        # GitHub signals rate limiting with either 403 or 429.
        if status in (403, 429):
            retry_after = r.headers.get("Retry-After")
            remaining = r.headers.get("X-RateLimit-Remaining")
            quota_exhausted = (
                remaining is not None
                and str(remaining).isdigit()
                and int(remaining) == 0
            )
            if retry_after and retry_after.isdigit():
                last_err = f"HTTP {status} (rate limited, Retry-After={retry_after}s)"
                if not final:
                    warn(f"{status} received. Sleeping {retry_after}s per Retry-After …")
                    time.sleep(int(retry_after))
                continue
            if quota_exhausted or status == 429:
                last_err = f"HTTP {status} (rate limited): {r.text[:200]}"
                if not final:
                    # Waits for the quota reset when RESPECT_RL allows it; the
                    # backoff keeps retries spaced out when it does not.
                    _maybe_sleep_for_reset(r)
                    _backoff_sleep(i)
                continue
            # A 403 without rate-limit markers is a real "forbidden": fall through.

        if status >= 500:
            last_err = f"{status} {r.text[:200]}"
            if not final:
                _backoff_sleep(i)
            continue

        if status >= 400:
            # Raised outside any try block on purpose: retrying cannot fix it.
            raise RuntimeError(f"GET failed (HTTP {status}, not retried): {url} :: {r.text[:500]}")

        _maybe_sleep_for_reset(r)
        try:
            return r.json()
        except ValueError as e:  # body was not valid JSON
            last_err = f"Invalid JSON body: {e}"
            if not final:
                _backoff_sleep(i)
    raise RuntimeError(f"GET failed after retries: {url} :: {last_err}")

def gh_graphql(query: str, variables: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    POST a GraphQL ``query`` (with optional ``variables``) and return the decoded response.
    Endpoint: https://api.github.com/graphql

    At most ``MAX_RETRIES + 1`` attempts are made:
      * transport errors, 5xx responses, undecodable bodies and responses that
        contain a top-level ``errors`` key are retried after an exponential
        backoff;
      * 403/429 responses that carry a numeric ``Retry-After`` header, or that
        report an exhausted quota (``X-RateLimit-Remaining: 0``), count as rate
        limiting and are retried after waiting; a bare 429 is retried after a
        backoff;
      * every other 4xx response is permanent and raises immediately instead
        of using up the retries.
    No sleep is performed after the last attempt.

    Raises:
        RuntimeError: on a non-retryable 4xx response, or once all attempts
            have failed (the message carries the last error seen).
    """
    url = "https://api.github.com/graphql"
    payload = {"query": query, "variables": variables or {}}
    last_err = None
    attempts = MAX_RETRIES + 1
    for i in range(attempts):
        final = i == attempts - 1  # nothing follows, so sleeping would be wasted
        try:
            r = SESSION.post(url, json=payload, timeout=REQ_TIMEOUT)
        except requests.RequestException as e:
            last_err = str(e)
            if not final:
                _backoff_sleep(i)
            continue

        status = r.status_code

        # GitHub signals rate limiting with either 403 or 429.
        if status in (403, 429):
            retry_after = r.headers.get("Retry-After")
            remaining = r.headers.get("X-RateLimit-Remaining")
            quota_exhausted = (
                remaining is not None
                and str(remaining).isdigit()
                and int(remaining) == 0
            )
            if retry_after and retry_after.isdigit():
                last_err = f"GraphQL HTTP {status} (rate limited, Retry-After={retry_after}s)"
                if not final:
                    warn(f"{status} received. Sleeping {retry_after}s per Retry-After …")
                    time.sleep(int(retry_after))
                continue
            if quota_exhausted or status == 429:
                last_err = f"GraphQL HTTP {status} (rate limited): {r.text[:200]}"
                if not final:
                    # Waits for the quota reset when RESPECT_RL allows it; the
                    # backoff keeps retries spaced out when it does not.
                    _maybe_sleep_for_reset(r)
                    _backoff_sleep(i)
                continue
            # A 403 without rate-limit markers is a real "forbidden": fall through.

        if status >= 500:
            last_err = f"{status} {r.text[:200]}"
            if not final:
                _backoff_sleep(i)
            continue

        if status >= 400:
            # Raised outside any try block on purpose: retrying cannot fix it.
            raise RuntimeError(f"GraphQL HTTP {status} (not retried): {r.text[:500]}")

        _maybe_sleep_for_reset(r)
        try:
            data = r.json()
        except ValueError as e:  # body was not valid JSON
            last_err = f"Invalid JSON body: {e}"
            if not final:
                _backoff_sleep(i)
            continue

        # A response containing "errors" is never returned to the caller.
        if "errors" in data:
            last_err = f"GraphQL errors: {data['errors']}"
            if not final:
                _backoff_sleep(i)
            continue
        return data
    raise RuntimeError(f"GraphQL failed after retries: {last_err}")

def rate_limit_snapshot() -> Dict[str, Any]:
    """
    Fetch the REST "core" rate-limit bucket, log it, and return it as a dict
    with keys ``limit``, ``remaining``, ``reset_epoch`` and ``reset_iso``
    (the reset time as a UTC ISO-8601 string, or None when no reset is given).

    Best effort: on any failure a warning is printed and ``{}`` is returned.
    """
    try:
        response = rest_get_json("https://api.github.com/rate_limit")
        core = response.get("resources", {}).get("core", {})
        reset = core.get("reset")
        if reset:
            when = datetime.fromtimestamp(reset, tz=timezone.utc).isoformat()
        else:
            when = None
        limit, remaining = core.get("limit"), core.get("remaining")
        log(f"Rate limit: {remaining}/{limit}, resets at {when}")
        return {
            "limit": limit,
            "remaining": remaining,
            "reset_epoch": reset,
            "reset_iso": when,
        }
    except Exception as e:
        warn(f"Rate limit snapshot failed: {e}")
        return {}

# -------------------------------------------------------------------
# Light helpers for series + ordering
# -------------------------------------------------------------------
def semver_series(tag_name: str) -> str:
    """
    Map a tag name to its 'vMAJOR.MINOR' series label.

    The first 'digits.digits' pair found anywhere in the tag is used (a leading
    'v' is optional) and leading zeros are dropped, so 'v4.2.3' and
    '4.2.0-rc.1' both give 'v4.2'.  Empty, None or unparseable input gives
    'v0.0'.
    """
    found = re.search(r'v?(\d+)\.(\d+)', tag_name or "")
    if found is None:
        return "v0.0"
    major, minor = (int(part) for part in found.groups())
    return f"v{major}.{minor}"

def sorted_pairs_by_tag_time(pairs: List[Tuple[str, str]], tag_index: Dict[str, Dict[str, Any]]) -> List[Tuple[str, str]]:
    """
    Return ``pairs`` ordered by (base timestamp, compare timestamp), ascending.

    Timestamps come from ``tag_index[name]["tag_timestamp"]`` (ISO-8601, a
    trailing "Z" is accepted).  A ref that is not in the index, has no
    timestamp, or has an unparseable one is treated as +infinity and therefore
    sorts last.  The input list is not modified.
    """
    unknown = float("inf")

    def epoch_of(ref: str) -> float:
        record = tag_index.get(ref)
        raw = record.get("tag_timestamp") if record else None
        if not raw:
            return unknown
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00")).timestamp()
        except Exception:
            return unknown

    def order_key(pair):
        return (epoch_of(pair[0]), epoch_of(pair[1]))

    return sorted(pairs, key=order_key)

# Completion marker for this cell; goes through log(), so it is shown only when VERBOSE is on.
log("Bootstrap ready ✓")


Using repo root: /Users/rambodparsi/Desktop/OSS Repository Selection/web_scrapper/commit_pr_issue_analysis
Bootstrap ready ✓


### Config echo

In [32]:
# Echo the effective configuration before any work is done.
print("Repo:", REPO)
print("Output root:", repo_root())
for label, key in (
    ("Pairs mode:", "pairs_mode"),
    ("Series whitelist:", "series_whitelist"),
    ("Explicit pairs:", "explicit_pairs"),
    ("Optional branch:", "optional_branch"),
):
    print(label, CFG.get(key))
# Kept as the final bare expression so the notebook also displays the returned dict.
rate_limit_snapshot()


Repo: mastodon/mastodon
Output root: mastodon
Pairs mode: series
Series whitelist: ['v4.0']
Explicit pairs: []
Optional branch: None
Rate limit: 5000/5000, resets at 2025-10-26T14:16:07+00:00


{'limit': 5000,
 'remaining': 5000,
 'reset_epoch': 1761488167,
 'reset_iso': '2025-10-26T14:16:07+00:00'}

### Load tags and build a tag index

In [33]:
from collections import defaultdict

# Load tags
# Load the tag catalogue; it must exist and contain a non-empty "tags" list.
tags_payload = read_json(tags_all_json())
if not (tags_payload and tags_payload.get("tags")):
    raise RuntimeError("tags.all.json not found or empty. Run m1_fetch_tags.ipynb first.")

all_tags = tags_payload["tags"]

# Lookup table: tag name → tag record
tag_by_name = dict((tag["name"], tag) for tag in all_tags)

# Buckets: series label → tag records (ordered by timestamp further down)
series_tags = defaultdict(list)
for tag in all_tags:
    series_tags[tag["series"]].append(tag)

def ts_key(x):
    """Sort key for a tag record: its ``tag_timestamp`` as epoch seconds.

    A missing, empty or unparseable timestamp maps to +infinity so that such
    tags sort last.  A trailing "Z" in the ISO-8601 string is accepted.
    """
    from datetime import datetime

    never = float("inf")
    raw = x.get("tag_timestamp")
    if not raw:
        return never
    try:
        iso = raw.replace("Z", "+00:00")
        return datetime.fromisoformat(iso).timestamp()
    except Exception:
        return never

# Order every series bucket in place, oldest tag first.
for bucket in series_tags.values():
    bucket.sort(key=ts_key)

print("Series discovered:", sorted(series_tags))


Series discovered: ['v0.1', 'v0.6', 'v0.7', 'v0.8', 'v0.9', 'v1.0', 'v1.1', 'v1.2', 'v1.3', 'v1.4', 'v1.5', 'v1.6', 'v2.0', 'v2.1', 'v2.2', 'v2.3', 'v2.4', 'v2.5', 'v2.6', 'v2.7', 'v2.8', 'v2.9', 'v3.0', 'v3.1', 'v3.2', 'v3.3', 'v3.4', 'v3.5', 'v4.0', 'v4.1', 'v4.2', 'v4.3', 'v4.4', 'v4.5']


### Build pairs according to config

In [34]:
# Pair-planning settings read from CFG.  A missing or null value falls back to
# "series" for the mode and to an empty list for the whitelist / explicit pairs.
PAIRS_MODE = (CFG.get("pairs_mode") or "series").lower()  # compared below against "all" / "series" / "explicit"
SERIES_WHITELIST = CFG.get("series_whitelist") or []
EXPLICIT_PAIRS = CFG.get("explicit_pairs") or []  # items are indexed with ["base"] and ["compare"] below
OPTIONAL_BRANCH = CFG.get("optional_branch")  # e.g., "stable-4.2"

def adjacent_pairs_for_series(series_name: str):
    """Return the consecutive (older → newer) tag pairs of one series.

    Each pair is a dict with keys ``base``, ``compare`` and ``series``; the
    tags are taken in the order stored in ``series_tags``.  When
    ``OPTIONAL_BRANCH`` is set and the series has at least one tag, a final
    pair from the last tag to that branch is appended.  An unknown series
    yields an empty list.
    """
    names = [tag["name"] for tag in series_tags.get(series_name, [])]
    pairs = [
        {"base": older, "compare": newer, "series": series_name}
        for older, newer in zip(names, names[1:])
    ]
    # lastTag → OPTIONAL_BRANCH, only when there is a tag to start from
    if OPTIONAL_BRANCH and names:
        pairs.append({"base": names[-1], "compare": OPTIONAL_BRANCH, "series": series_name})
    return pairs

pairs_planned = []

if PAIRS_MODE in ("all", "series"):
    # "series" narrows the run to the whitelist when one is configured;
    # otherwise both modes walk every discovered series in sorted order.
    if PAIRS_MODE == "series" and SERIES_WHITELIST:
        wanted_series = SERIES_WHITELIST
    else:
        wanted_series = sorted(series_tags.keys())
    for series_name in wanted_series:
        pairs_planned += adjacent_pairs_for_series(series_name)

elif PAIRS_MODE == "explicit":
    # The series only decides the output folder: take it from the base tag when
    # that is a known tag, else from the compare tag, else fall back to 'v0.0'.
    # OPTIONAL_BRANCH is deliberately not applied in this mode.
    for entry in EXPLICIT_PAIRS:
        base_ref, head_ref = entry["base"], entry["compare"]
        series_guess = next(
            (tag_by_name[ref]["series"] for ref in (base_ref, head_ref) if ref in tag_by_name),
            "v0.0",
        )
        pairs_planned.append({"base": base_ref, "compare": head_ref, "series": series_guess})

else:
    raise RuntimeError(f"Unsupported pairs_mode: {PAIRS_MODE}")

# Preview: the total, then at most the first ten planned pairs.
print(f"Planned pairs: {len(pairs_planned)}")
preview_limit = 10
for planned in pairs_planned[:preview_limit]:
    print(planned)
if len(pairs_planned) > preview_limit:
    print("…")


Planned pairs: 19
{'base': 'v4.0.0rc1', 'compare': 'v4.0.0rc2', 'series': 'v4.0'}
{'base': 'v4.0.0rc2', 'compare': 'v4.0.0rc3', 'series': 'v4.0'}
{'base': 'v4.0.0rc3', 'compare': 'v4.0.0rc4', 'series': 'v4.0'}
{'base': 'v4.0.0rc4', 'compare': 'v4.0.0', 'series': 'v4.0'}
{'base': 'v4.0.0', 'compare': 'v4.0.1', 'series': 'v4.0'}
{'base': 'v4.0.1', 'compare': 'v4.0.2', 'series': 'v4.0'}
{'base': 'v4.0.2', 'compare': 'v4.0.3', 'series': 'v4.0'}
{'base': 'v4.0.3', 'compare': 'v4.0.4', 'series': 'v4.0'}
{'base': 'v4.0.4', 'compare': 'v4.0.5', 'series': 'v4.0'}
{'base': 'v4.0.5', 'compare': 'v4.0.6', 'series': 'v4.0'}
…


### Compare helper

In [35]:
def ref_for_compare(name_or_ref: str) -> str:
    """
    Return the ref to put into a compare request.

    A name found in ``tag_by_name`` gets a 'tags/' prefix so that it is read as
    a tag rather than as a branch of the same name; anything else (branch or
    SHA) is returned unchanged.
    """
    is_known_tag = name_or_ref in tag_by_name
    return f"tags/{name_or_ref}" if is_known_tag else name_or_ref

def compare_url(owner: str, repo: str, base_ref: str, head_ref: str) -> str:
    """Build the REST compare URL for ``base_ref...head_ref`` in ``owner/repo``.

    The refs are inserted as given; no URL escaping is applied.
    """
    template = "https://api.github.com/repos/{}/{}/compare/{}...{}"
    return template.format(owner, repo, base_ref, head_ref)

def to_commit_row(c: dict) -> dict:
    """Flatten one commit object from a compare response into a small row.

    Returned keys: ``sha``, ``html_url``, ``api_url`` (the object's ``url``),
    ``author_login`` (``author.login``) and ``commit_author_date``
    (``commit.author.date``).  Anything missing in the input becomes ``None``.

    ``html_url`` is taken from the object when present; otherwise it is built
    from ``OWNER``/``NAME`` and the SHA, and it is ``None`` only when neither a
    URL nor a SHA is available.
    """
    sha = c.get("sha")
    # The parentheses matter: without them the conditional expression swallows
    # the whole `or`, and an existing html_url is dropped whenever sha is missing.
    html_url = c.get("html_url") or (
        f"https://github.com/{OWNER}/{NAME}/commit/{sha}" if sha else None
    )
    api_url = c.get("url")
    author_login = (c.get("author") or {}).get("login")  # may be None if author not linked
    meta = c.get("commit") or {}
    author_meta = meta.get("author") or {}
    commit_author_date = author_meta.get("date")
    return {
        "sha": sha,
        "html_url": html_url,
        "api_url": api_url,
        "author_login": author_login,
        "commit_author_date": commit_author_date
    }


### Run compares and write <base...compare>.compare.json

In [36]:
written = 0
skipped = 0
errors = 0

# Page size for the compare endpoint (the GitHub REST API caps per_page at 100).
COMPARE_PAGE_SIZE = 100


def fetch_compare_commit_rows(url: str) -> list:
    """Return one flat row per commit of the comparison at ``url``.

    A single compare response only carries a limited number of commits, so the
    pages are requested one by one (``per_page``/``page``) until a short page
    arrives or the number of rows reaches the ``total_commits`` reported by
    the API.  A warning is printed when the two numbers still disagree.

    Raises:
        RuntimeError: propagated from ``rest_get_json`` when a page cannot be
            fetched.
    """
    rows = []
    total = None
    page = 1
    while True:
        data = rest_get_json(url, params={"per_page": COMPARE_PAGE_SIZE, "page": page})
        batch = data.get("commits") or []
        rows.extend(to_commit_row(c) for c in batch)
        if isinstance(data.get("total_commits"), int):
            total = data["total_commits"]
        if len(batch) < COMPARE_PAGE_SIZE:
            break  # short (or empty) page: nothing further to fetch
        if total is not None and len(rows) >= total:
            break
        page += 1
    if total is not None and len(rows) != total:
        warn(f"Fetched {len(rows)} commits but the API reports total_commits={total}: {url}")
    return rows


for pair in pairs_planned:
    base_raw = pair["base"]
    head_raw = pair["compare"]
    series = pair["series"]

    # Known tags get a "tags/" prefix for the API call only.
    base = ref_for_compare(base_raw)
    head = ref_for_compare(head_raw)

    stem = pair_stem(base_raw, head_raw)  # filename uses raw names
    out_path = pair_json(series, stem, "compare")

    # Skip if exists and not overwriting
    if out_path.exists() and not OVERWRITE:
        log(f"Skip (exists): {out_path}")
        skipped += 1
        continue

    url = compare_url(OWNER, NAME, base, head)
    try:
        commits_rows = fetch_compare_commit_rows(url)

        payload = {
            "repo": REPO,
            "base": base_raw,
            "compare": head_raw,
            "commit_count": len(commits_rows),
            "commits": commits_rows
        }
        write_json(out_path, payload)
        written += 1

    except Exception as e:
        warn(f"Compare failed for {base_raw}...{head_raw}: {e}")
        # Still emit a minimal file to keep the pipeline consistent.
        # NOTE(review): with OVERWRITE on, this replaces an earlier good file
        # with an empty one, and the result looks like a genuinely empty
        # comparison — confirm that this is the intended trade-off.
        payload = {
            "repo": REPO,
            "base": base_raw,
            "compare": head_raw,
            "commit_count": 0,
            "commits": []
        }
        write_json(out_path, payload)
        errors += 1

print(f"Done. Written: {written} | Skipped: {skipped} | Errors (minimal outputs created): {errors}")
# Kept as the final bare expression so the notebook also displays the returned dict.
rate_limit_snapshot()


✓ Wrote mastodon/compare/v4.0/v4.0.0rc1...v4.0.0rc2.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.0rc2...v4.0.0rc3.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.0rc3...v4.0.0rc4.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.0rc4...v4.0.0.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.0...v4.0.1.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.1...v4.0.2.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.2...v4.0.3.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.3...v4.0.4.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.4...v4.0.5.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.5...v4.0.6.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.6...v4.0.7.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.7...v4.0.8.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.8...v4.0.9.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.9...v4.0.10.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.10...v4.0.11.compare.json
✓ Wrote mastodon/compare/v4.0/v4.0.11...v4.0.12.compare.json
✓ Wrote mastodon/compare/v4.0/

{'limit': 5000,
 'remaining': 4981,
 'reset_epoch': 1761488167,
 'reset_iso': '2025-10-26T14:16:07+00:00'}