In [21]:
# === Bootstrap cell: shared config, paths, JSON I/O, and GitHub REST/GraphQL helpers ===

import os, json, time, re, random, pathlib
from typing import Dict, Any, Optional, Tuple, List
from datetime import datetime, timezone

import requests
import yaml
from dotenv import load_dotenv

# -------------------------------------------------------------------
# Locate repo root reliably (works no matter where you open the .ipynb)
# -------------------------------------------------------------------
def _find_repo_root(start: pathlib.Path) -> pathlib.Path:
    cur = start.resolve()
    for _ in range(8):  # walk up to 8 levels
        if (cur / "config" / "config.yaml").exists():
            return cur
        cur = cur.parent
    raise FileNotFoundError("Couldn't locate repo root (no config/config.yaml found upward).")

# Resolve the repo root by searching upward from the current working directory.
# _find_repo_root raises FileNotFoundError when config/config.yaml is not found.
REPO_ROOT = _find_repo_root(pathlib.Path.cwd())
CONFIG_PATH = REPO_ROOT / "config" / "config.yaml"

# -------------------------------------------------------------------
# Load config + token
# -------------------------------------------------------------------
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    # An empty YAML document loads as None, hence the `or {}` fallback.
    CFG: Dict[str, Any] = yaml.safe_load(f) or {}

load_dotenv(REPO_ROOT / ".env")  # local only; do not commit .env
# The config names the environment variable holding the token; default is GITHUB_TOKEN.
TOKEN_ENV = (CFG.get("github_token_env") or "GITHUB_TOKEN").strip()
GITHUB_TOKEN = os.getenv(TOKEN_ENV, "").strip()
# Fail fast: every helper below sends this token in the Authorization header.
if not GITHUB_TOKEN:
    raise RuntimeError(f"Missing token in environment variable {TOKEN_ENV}. "
                       f"Create .env from .env.example and set {TOKEN_ENV}=...")

# Repo + output root
# "repo" is the only mandatory key: a missing key raises KeyError here, and a
# value without "/" raises ValueError on the unpacking below.
REPO = CFG["repo"]  # "owner/name"
OWNER, NAME = REPO.split("/", 1)
# A relative out_root stays relative, i.e. it is resolved against the process
# working directory when used, not against REPO_ROOT.
OUT_ROOT = pathlib.Path(CFG.get("out_root") or NAME)  # "<name>" if null

# Behavior
# NOTE(review): bool() of any non-empty string is True, so a quoted "false" in
# the YAML would still enable these flags — confirm the config uses real booleans.
OVERWRITE = bool(CFG.get("overwrite", True))
VERBOSE = bool(CFG.get("verbose_logs", True))

# Networking knobs
REQ_TIMEOUT = int(CFG.get("request_timeout_sec", 30))        # passed as `timeout=` to requests
MAX_RETRIES = int(CFG.get("max_retries", 4))                 # retry loops run MAX_RETRIES + 1 attempts
BACKOFF_BASE_MS = int(CFG.get("backoff_base_ms", 400))       # base of the exponential backoff
BACKOFF_JITTER_MS = int(CFG.get("backoff_jitter_ms", 250))   # upper bound of the random jitter
RESPECT_RL = bool(CFG.get("respect_rate_limits", True))      # gates _maybe_sleep_for_reset

# -------------------------------------------------------------------
# Tiny logging helpers
# -------------------------------------------------------------------
def log(msg: str) -> None:
    """Print *msg* (flushed immediately) when VERBOSE is on; otherwise do nothing."""
    if not VERBOSE:
        return
    print(msg, flush=True)

def warn(msg: str) -> None:
    """Print *msg* behind a warning-sign prefix and flush stdout right away.

    The message is always printed; no verbosity flag is consulted here.
    """
    line = "⚠️  {}".format(msg)
    print(line, flush=True)

# Always echo the resolved root (not gated by VERBOSE) so it is obvious which
# checkout's config/config.yaml this notebook picked up.
print("Using repo root:", REPO_ROOT)

# -------------------------------------------------------------------
# Canonical output layout helpers
# -------------------------------------------------------------------
def ensure_dir(p: pathlib.Path) -> None:
    """Create directory *p* and any missing parents; an existing directory is fine."""
    p.mkdir(exist_ok=True, parents=True)

def repo_root() -> pathlib.Path:
    """Return OUT_ROOT (the output root), creating the directory first if missing."""
    ensure_dir(OUT_ROOT)
    return OUT_ROOT

def tags_all_json() -> pathlib.Path:
    """Return ``<output root>/tags/tags.all.json``, creating ``tags/`` when absent.

    Only the directory is created; the JSON file itself is not touched.
    """
    tags_dir = repo_root().joinpath("tags")
    ensure_dir(tags_dir)
    return tags_dir.joinpath("tags.all.json")

def series_dir(kind: str, series: str) -> pathlib.Path:
    """Return ``<output root>/<kind>/<series>``, creating it (and parents) if needed."""
    target = repo_root().joinpath(kind, series)
    ensure_dir(target)
    return target

def pair_stem(base: str, compare: str) -> str:
    """Build the ``<base>...<compare>`` stem used in per-pair file names."""
    return "{}...{}".format(base, compare)

def pair_json(series: str, stem: str, kind: str) -> pathlib.Path:
    """Return ``<output root>/<kind>/<series>/<stem>.<kind>.json``.

    The containing directory is created as a side effect (via series_dir).
    """
    # kind ∈ {"compare","commits","pulls","issues"}
    filename = f"{stem}.{kind}.json"
    return series_dir(kind, series) / filename

def capsule_json(series: str, stem: str) -> pathlib.Path:
    """Return the capsule path under ``<output root>/commits_pr_issue/<series>/``.

    The containing directory is created as a side effect (via series_dir).
    """
    # NOTE(review): "tarce" looks like a typo for "trace". Left as-is because
    # files already written to disk would carry this name — confirm before renaming.
    capsule_dir = series_dir("commits_pr_issue", series)
    return capsule_dir / f"{stem}.tarce_artifacts.json"

# -------------------------------------------------------------------
# JSON I/O
# -------------------------------------------------------------------
def read_json(path: pathlib.Path) -> Optional[Dict[str, Any]]:
    """Load and return the JSON document at *path*, or None if the path does not exist.

    The file is read as UTF-8. Malformed JSON propagates as the decoder's error.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    return None

def write_json(path: pathlib.Path, payload: Dict[str, Any]) -> None:
    """Write *payload* to *path* as pretty-printed UTF-8 JSON via a temp file.

    The parent directory is created if needed. When *path* already exists and
    OVERWRITE is false, nothing is written and a skip message is logged.

    The document is first dumped to ``<path>.tmp`` and then renamed over
    *path*, so readers never observe a half-written target. If dumping or
    renaming fails (for instance because *payload* is not JSON-serializable),
    the temp file is removed before the error propagates.
    """
    ensure_dir(path.parent)
    if path.exists() and not OVERWRITE:
        log(f"Skip (exists): {path}")
        return
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        tmp.replace(path)
    except BaseException:
        # Don't leave a partial "<name>.tmp" behind; cleanup is best-effort so
        # the original error is the one the caller sees.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
    log(f"✓ Wrote {path}")

# -------------------------------------------------------------------
# GitHub HTTP session + rate-limit aware helpers
# -------------------------------------------------------------------
# One shared Session for every REST/GraphQL call in this notebook; the headers
# below are therefore sent on each request made through SESSION.
SESSION = requests.Session()
SESSION.headers.update({
    "Authorization": f"Bearer {GITHUB_TOKEN}",   # token loaded from the environment above
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",        # pins the REST API version
    "User-Agent": f"notebook-pipeline/{NAME}"
})

def _parse_reset_epoch(headers: Dict[str, Any]) -> Optional[int]:
    try:
        return int(headers.get("X-RateLimit-Reset") or headers.get("x-ratelimit-reset"))
    except Exception:
        return None

def _maybe_sleep_for_reset(resp: requests.Response) -> None:
    """Block until the rate-limit window resets when *resp* reports zero calls left.

    Does nothing when RESPECT_RL is off, when the remaining-count header is
    missing or not a plain digit string, when calls remain, or when no reset
    time can be parsed. Otherwise sleeps for the time left until the reset
    epoch plus one second.
    """
    if not RESPECT_RL:
        return
    hdrs = resp.headers
    remaining = hdrs.get("X-RateLimit-Remaining") or hdrs.get("x-ratelimit-remaining")
    if remaining is None:
        return
    if not str(remaining).isdigit() or int(remaining) > 0:
        return
    reset_epoch = _parse_reset_epoch(hdrs)
    if not reset_epoch:
        return
    # +1s so we wake up just after the window rolls over, never just before.
    delta = max(0, reset_epoch - int(time.time())) + 1
    warn(f"Rate limit reached. Sleeping ~{delta}s until reset …")
    time.sleep(delta)

def _backoff_sleep(i: int) -> None:
    """Sleep for attempt *i*: ``2**i`` times the base delay plus random jitter.

    The base delay and the jitter ceiling come from BACKOFF_BASE_MS and
    BACKOFF_JITTER_MS (milliseconds, converted to seconds here).
    """
    base_s = BACKOFF_BASE_MS / 1000.0
    jitter_s = random.uniform(0, BACKOFF_JITTER_MS / 1000.0)
    delay_s = (2 ** i) * base_s + jitter_s
    time.sleep(delay_s)

def rest_get_json(url: str, params: Optional[Dict[str, Any]] = None) -> Any:
    """
    GET *url* through the shared SESSION and return the decoded JSON body.

    The body may be a dict or a list depending on the endpoint (several callers
    in this notebook fetch array endpoints), hence the ``Any`` return type.

    Retry policy (at most MAX_RETRIES + 1 attempts):
      * transport failures, 5xx responses and undecodable bodies are retried
        with exponential backoff;
      * 403/429 carrying a numeric Retry-After header sleep for that long and retry;
      * 429, or 403 with X-RateLimit-Remaining of 0, wait for the reset
        (when RESPECT_RL is on) plus a backoff step, then retry;
      * any other 4xx is deterministic, so it raises immediately instead of
        burning the retry budget on backoff sleeps.

    Raises:
        RuntimeError: on a non-retryable 4xx, or once the retries are exhausted.
    """
    last_err = None
    for i in range(MAX_RETRIES + 1):
        try:
            r = SESSION.get(url, params=params, timeout=REQ_TIMEOUT)
        except Exception as e:
            # Broad on purpose: any failure to obtain a response is retryable.
            # Status handling lives outside this try so that the RuntimeError
            # raised for client errors below is never swallowed and retried.
            last_err = str(e)
            _backoff_sleep(i)
            continue

        status = r.status_code

        if status in (403, 429):
            retry_after = r.headers.get("Retry-After")
            remaining = r.headers.get("X-RateLimit-Remaining")
            if retry_after and retry_after.isdigit():
                wait_s = int(retry_after)
                last_err = f"HTTP {status} (Retry-After {wait_s}s)"
                warn(f"{status} received. Sleeping {wait_s}s per Retry-After …")
                time.sleep(wait_s)
                continue
            if status == 429 or str(remaining) == "0":
                last_err = f"HTTP {status} rate limited: {r.text[:200]}"
                _maybe_sleep_for_reset(r)  # no-op unless RESPECT_RL and a reset time is known
                _backoff_sleep(i)          # always pause a little so we never spin
                continue
            # A plain 403 (e.g. insufficient permissions) falls through to the
            # non-retryable branch below.

        if status >= 500:
            last_err = f"{status} {r.text[:200]}"
            _backoff_sleep(i)
            continue
        if status >= 400:
            raise RuntimeError(f"GET failed: {url} :: HTTP {status}: {r.text[:500]}")

        _maybe_sleep_for_reset(r)
        try:
            return r.json()
        except ValueError as e:
            # Truncated or non-JSON body: treat it like a transient failure.
            last_err = f"invalid JSON body: {e}"
            _backoff_sleep(i)
    raise RuntimeError(f"GET failed after retries: {url} :: {last_err}")

def gh_graphql(query: str, variables: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    GitHub GraphQL POST with backoff + rate-limit handling.
    Endpoint: https://api.github.com/graphql

    Returns the decoded response document once it contains no top-level
    "errors" key.

    Retry policy (at most MAX_RETRIES + 1 attempts):
      * transport failures, 5xx responses, undecodable bodies and responses
        containing "errors" are retried with exponential backoff;
      * 403/429 carrying a numeric Retry-After header sleep for that long and retry;
      * 429, or 403 with X-RateLimit-Remaining of 0, wait for the reset
        (when RESPECT_RL is on) plus a backoff step, then retry;
      * any other 4xx (bad token, malformed request, ...) raises immediately
        rather than being retried.

    Raises:
        RuntimeError: on a non-retryable 4xx, or once the retries are exhausted.
    """
    url = "https://api.github.com/graphql"
    payload = {"query": query, "variables": variables or {}}
    last_err = None
    for i in range(MAX_RETRIES + 1):
        try:
            r = SESSION.post(url, json=payload, timeout=REQ_TIMEOUT)
        except Exception as e:
            # Broad on purpose: any failure to obtain a response is retryable.
            # Status handling lives outside this try so that the RuntimeError
            # raised for client errors below is never swallowed and retried.
            last_err = str(e)
            _backoff_sleep(i)
            continue

        status = r.status_code

        if status in (403, 429):
            retry_after = r.headers.get("Retry-After")
            remaining = r.headers.get("X-RateLimit-Remaining")
            if retry_after and retry_after.isdigit():
                wait_s = int(retry_after)
                last_err = f"GraphQL HTTP {status} (Retry-After {wait_s}s)"
                warn(f"{status} received. Sleeping {wait_s}s per Retry-After …")
                time.sleep(wait_s)
                continue
            if status == 429 or str(remaining) == "0":
                last_err = f"GraphQL HTTP {status} rate limited: {r.text[:200]}"
                _maybe_sleep_for_reset(r)  # no-op unless RESPECT_RL and a reset time is known
                _backoff_sleep(i)          # always pause a little so we never spin
                continue
            # A plain 403 falls through to the non-retryable branch below.

        if status >= 500:
            last_err = f"{status} {r.text[:200]}"
            _backoff_sleep(i)
            continue
        if status >= 400:
            raise RuntimeError(f"GraphQL HTTP {status}: {r.text[:500]}")

        _maybe_sleep_for_reset(r)
        try:
            data = r.json()
        except ValueError as e:
            last_err = f"invalid JSON body: {e}"
            _backoff_sleep(i)
            continue
        if not isinstance(data, dict):
            last_err = f"unexpected GraphQL response type: {type(data).__name__}"
            _backoff_sleep(i)
            continue
        if "errors" in data:
            last_err = f"GraphQL errors: {data['errors']}"
            _backoff_sleep(i)
            continue
        return data
    raise RuntimeError(f"GraphQL failed after retries: {last_err}")

def rate_limit_snapshot() -> Dict[str, Any]:
    """
    Fetch /rate_limit and return the "core" bucket as a small dict.

    Keys: limit, remaining, reset_epoch, reset_iso (UTC ISO-8601, or None when
    no reset value is present). The summary is also logged. Any failure is
    reported through warn() and an empty dict is returned instead.
    """
    try:
        info = rest_get_json("https://api.github.com/rate_limit")
        core = info.get("resources", {}).get("core", {})
        remaining, limit, reset = core.get("remaining"), core.get("limit"), core.get("reset")
        if reset:
            when = datetime.fromtimestamp(reset, tz=timezone.utc).isoformat()
        else:
            when = None
        snap = {
            "limit": limit,
            "remaining": remaining,
            "reset_epoch": reset,
            "reset_iso": when,
        }
        log(f"Rate limit: {remaining}/{limit}, resets at {when}")
        return snap
    except Exception as e:
        warn(f"Rate limit snapshot failed: {e}")
        return {}

# -------------------------------------------------------------------
# Light helpers for series + ordering
# -------------------------------------------------------------------
def semver_series(tag_name: str) -> str:
    """
    Map a tag name to its 'vMAJOR.MINOR' series.

    The first "<digits>.<digits>" occurrence anywhere in the tag is used
    (an optional leading "v" is accepted), and leading zeros are dropped.
    Empty, None or unparseable tags map to 'v0.0'.
    """
    match = re.search(r'v?(\d+)\.(\d+)', tag_name or "")
    if match is None:
        return "v0.0"
    major, minor = (int(part) for part in match.groups())
    return f"v{major}.{minor}"

def sorted_pairs_by_tag_time(pairs: List[Tuple[str, str]], tag_index: Dict[str, Dict[str, Any]]) -> List[Tuple[str, str]]:
    """
    Return *pairs* ordered by (base timestamp, compare timestamp), ascending.

    Timestamps are read from ``tag_index[tag]["tag_timestamp"]`` as ISO-8601
    strings (a trailing "Z" is accepted). Tags that are missing from the
    index, have no timestamp, or have an unparseable one sort last.
    """
    unknown = float("inf")

    def _epoch(tag: str) -> float:
        rec = tag_index.get(tag)
        raw = rec.get("tag_timestamp") if rec else None
        if not raw:
            return unknown
        try:
            # fromisoformat() may not accept a bare "Z", so spell out the offset.
            return datetime.fromisoformat(raw.replace("Z", "+00:00")).timestamp()
        except Exception:
            return unknown

    return sorted(pairs, key=lambda pair: (_epoch(pair[0]), _epoch(pair[1])))

# Reaching this line means config, token and helpers all loaded without raising.
# Printed only when VERBOSE is on, since it goes through log().
log("Bootstrap ready ✓")


Using repo root: /Users/rambodparsi/Desktop/OSS Repository Selection/web_scrapper/commit_pr_issue_analysis
Bootstrap ready ✓


### Config echo

In [22]:
# Echo the effective configuration before doing any work.
print("Repo:", REPO)
# repo_root() also creates the output directory as a side effect.
print("Output root:", repo_root())
print("Overwrite:", OVERWRITE, "| Verbose:", VERBOSE)
# Last expression of the cell, so the notebook also displays the returned dict.
rate_limit_snapshot()


Repo: mastodon/mastodon
Output root: mastodon
Overwrite: True | Verbose: True
Rate limit: 4981/5000, resets at 2025-10-26T14:16:07+00:00


{'limit': 5000,
 'remaining': 4981,
 'reset_epoch': 1761488167,
 'reset_iso': '2025-10-26T14:16:07+00:00'}

### Helpers for PR lookup

In [23]:
# --- helpers (existing + new verifier bits) ---

import re
from typing import Dict, Any, List, Tuple, Optional

# Matches a parenthesised PR reference such as "(#12345)" and captures the number.
# Bare "#123" mentions without the surrounding parentheses are not matched.
PR_REGEX = re.compile(r"\(#(\d+)\)")

def get_commit_message(owner: str, name: str, sha: str) -> Tuple[str, str]:
    """Fetch commit *sha* and return ``(headline, full_message)``.

    The headline is the first line of the message. Both values are empty
    strings when the response carries no message.
    """
    commit = rest_get_json(f"https://api.github.com/repos/{owner}/{name}/commits/{sha}")
    message = (commit.get("commit") or {}).get("message") or ""
    if not message:
        return "", message
    return message.splitlines()[0], message
# Docs: Get a commit (REST).

def prs_via_rest_associated(owner: str, name: str, sha: str) -> List[Dict[str, Any]]:
    """Return PR refs that the REST ``commits/{sha}/pulls`` endpoint lists for *sha*.

    Each ref has the keys number, pr_link, api_url and sources (always
    ``["rest_associated"]``). Entries without a PR number are dropped. Only
    the single response returned by rest_get_json is examined.
    """
    listing = rest_get_json(f"https://api.github.com/repos/{owner}/{name}/commits/{sha}/pulls")
    return [
        {
            "number": int(pr.get("number")),
            "pr_link": pr.get("html_url"),
            "api_url": pr.get("url"),
            "sources": ["rest_associated"],
        }
        for pr in (listing or [])
        if pr.get("number")
    ]
# Docs: List pull requests associated with a commit (REST).

def prs_via_graphql_associated(owner: str, name: str, sha: str) -> List[Dict[str, Any]]:
    """Return PR refs from GraphQL ``Commit.associatedPullRequests`` for *sha*.

    Pages of up to 100 nodes are followed via the endCursor until hasNextPage
    is false. Each ref has the keys number, pr_link, api_url (built from the
    PR number) and sources (always ``["gql_associated"]``). Nodes without a
    number are dropped.
    """
    query = """
    query($owner:String!, $name:String!, $oid:GitObjectID!, $after:String) {
      repository(owner:$owner, name:$name) {
        object(oid:$oid) {
          ... on Commit {
            associatedPullRequests(first: 100, after: $after) {
              pageInfo { hasNextPage endCursor }
              nodes { number url }
            }
          }
        }
      }
    }"""
    variables = {"owner": owner, "name": name, "oid": sha, "after": None}
    found = []
    has_more = True
    while has_more:
        data = gh_graphql(query, variables)
        # Each level may be null in the response, so default to {} at every hop.
        repository = (data.get("data") or {}).get("repository") or {}
        commit_obj = repository.get("object") or {}
        connection = commit_obj.get("associatedPullRequests") or {}
        for node in connection.get("nodes") or []:
            number = node.get("number")
            if not number:
                continue
            found.append({
                "number": int(number),
                "pr_link": node.get("url"),
                "api_url": f"https://api.github.com/repos/{owner}/{name}/pulls/{number}",
                "sources": ["gql_associated"]
            })
        page_info = connection.get("pageInfo") or {}
        has_more = bool(page_info.get("hasNextPage"))
        if has_more:
            variables["after"] = page_info.get("endCursor")
    return found
# Docs: GraphQL Commit.associatedPullRequests.

def pr_refs_from_message(message: str) -> List[Dict[str, Any]]:
    """Extract "(#123)"-style PR references from a commit message.

    Returns one ref per match, in order of appearance (duplicates are not
    collapsed here). Links are built for the configured OWNER/NAME repo and
    sources is always ``["message_ref"]``. A match whose number cannot be
    converted is skipped.
    """
    refs = []
    # PR_REGEX has a single capture group, so findall yields the digit strings.
    for digits in PR_REGEX.findall(message or ""):
        try:
            n = int(digits)
        except Exception:
            continue
        refs.append({
            "number": n,
            "pr_link": f"https://github.com/{OWNER}/{NAME}/pull/{n}",
            "api_url": f"https://api.github.com/repos/{OWNER}/{NAME}/pulls/{n}",
            "sources": ["message_ref"]
        })
    return refs

def merge_pr_refs(*lists: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Merge several PR-ref lists into one list with a single entry per PR number.

    The first ref seen for a number supplies the entry's fields; later refs
    for the same number only contribute sources not already on the entry.
    Output order is first-appearance order. Input refs and their sources
    lists are copied, never mutated. None or empty lists are skipped.
    """
    by_number = {}
    merged = []
    for refs in lists:
        for ref in refs or []:
            number = ref.get("number")
            incoming = ref.get("sources") or []
            if number in by_number:
                entry = by_number[number]
                known = set(entry.get("sources") or [])
                entry["sources"].extend(s for s in incoming if s not in known)
            else:
                entry = {**ref, "sources": list(incoming)}
                by_number[number] = entry
                merged.append(entry)
    return merged

def is_bump_like_title(title: str) -> bool:
    """Return True when *title* looks like a version bump / release commit.

    That requires both one of the words "bump", "release" or "version"
    (case-insensitive, substring match) and a dotted number such as
    "4.0", "v4.0" or "4.0.1" somewhere in the title.
    """
    if not title:
        return False
    lowered = title.lower()
    has_keyword = any(word in lowered for word in ("bump", "release", "version"))
    if not has_keyword:
        return False
    return re.search(r'v?\d+\.\d+(\.\d+)?', lowered) is not None

# ---------- NEW: verification helpers ----------
def pr_commits_contains_sha(owner: str, name: str, pr_number: int, sha: str) -> bool:
    """
    GET /repos/{owner}/{repo}/pulls/{number}/commits (caps at ~250). If 'sha' is in the list → verified.

    The endpoint is paginated and returns only 30 commits per page by default,
    so the listing is requested 100 at a time and up to three pages are read.
    That covers the documented 250-commit cap; a single unpaginated request
    would silently miss commits beyond the first page.

    Returns False (after a warning) when the lookup fails for any reason.
    """
    url = f"https://api.github.com/repos/{owner}/{name}/pulls/{pr_number}/commits"
    per_page = 100
    max_pages = 3  # 3 × 100 ≥ the ~250-commit cap of this endpoint
    try:
        for page in range(1, max_pages + 1):
            commits = rest_get_json(url, params={"per_page": per_page, "page": page}) or []
            if any((c.get("sha") == sha) for c in commits):
                return True
            if len(commits) < per_page:
                break  # short page → nothing further to fetch
        return False
    except Exception as e:
        warn(f"Verifier pr_commits failed for PR #{pr_number}: {e}")
        return False
# Docs: List commits on a pull request (REST); note the 250-item cap.

def pr_merge_commit_in_compare(owner: str, name: str, pr_number: int, compare_shas: set) -> Tuple[bool, Optional[str]]:
    """
    Check whether a PR's merge_commit_sha is among *compare_shas*.

    Fetches the PR and returns ``(verified, merge_sha)``: verified is True
    only when merge_commit_sha is set and contained in *compare_shas*;
    merge_sha is whatever the API reported (possibly None). On any failure a
    warning is emitted and ``(False, None)`` is returned.
    """
    url = f"https://api.github.com/repos/{owner}/{name}/pulls/{pr_number}"
    try:
        merge_sha = rest_get_json(url).get("merge_commit_sha")
        verified = bool(merge_sha and (merge_sha in compare_shas))
        return verified, merge_sha
    except Exception as e:
        warn(f"Verifier merge_commit_sha failed for PR #{pr_number}: {e}")
        return False, None
# Docs: Get a pull request (REST); merge_commit_sha semantics before/after merge.


### Load planned pairs

In [24]:
from pathlib import Path

# Discover the *.compare.json files produced earlier, one per (base, compare)
# pair, laid out as <output root>/compare/<series>/<stem>.compare.json.
print("REPO:", REPO)
print("repo_root():", repo_root())

pairs_base = Path(repo_root()) / "compare"
print("looking in:", pairs_base)

pairs_files = sorted(pairs_base.rglob("*.compare.json")) if pairs_base.exists() else []

# Broad fallback if someone ran with a different out_root previously
if not pairs_files:
    warn("No files under <name>/compare. Falling back to a broad search for */compare/**/*.compare.json …")
    pairs_files = sorted(Path(REPO_ROOT).rglob("compare/**/*.compare.json"))

print("found BEFORE whitelist:", len(pairs_files))
# The series is the name of the directory that directly contains each file.
found_series = sorted({p.parent.name for p in pairs_files})
print("series present on disk:", found_series)

# Optional filter: keep only the series listed under "series_whitelist" in the config.
SERIES_WHITELIST = CFG.get("series_whitelist") or []
if SERIES_WHITELIST:
    before = len(pairs_files)
    _allowed_series = set(SERIES_WHITELIST)  # built once, not once per file
    pairs_files = [p for p in pairs_files if p.parent.name in _allowed_series]
    after = len(pairs_files)
    print(f"applied whitelist {SERIES_WHITELIST} → {after}/{before} remain")

print("sample:")
for p in pairs_files[:10]:
    print(" -", p)


REPO: mastodon/mastodon
repo_root(): mastodon
looking in: mastodon/compare
found BEFORE whitelist: 107
series present on disk: ['v4.0', 'v4.1', 'v4.2', 'v4.3', 'v4.4']
applied whitelist ['v4.0'] → 19/107 remain
sample:
 - mastodon/compare/v4.0/v4.0.0...v4.0.1.compare.json
 - mastodon/compare/v4.0/v4.0.0rc1...v4.0.0rc2.compare.json
 - mastodon/compare/v4.0/v4.0.0rc2...v4.0.0rc3.compare.json
 - mastodon/compare/v4.0/v4.0.0rc3...v4.0.0rc4.compare.json
 - mastodon/compare/v4.0/v4.0.0rc4...v4.0.0.compare.json
 - mastodon/compare/v4.0/v4.0.1...v4.0.2.compare.json
 - mastodon/compare/v4.0/v4.0.10...v4.0.11.compare.json
 - mastodon/compare/v4.0/v4.0.11...v4.0.12.compare.json
 - mastodon/compare/v4.0/v4.0.12...v4.0.13.compare.json
 - mastodon/compare/v4.0/v4.0.13...v4.0.14.compare.json


### Build the *.commits.json files

In [25]:
# For every planned (base, compare) pair, write <stem>.commits.json listing each
# commit in the range with the PRs it refers to, plus a verification verdict per PR.
written = 0
skipped = 0

# A PR is "release-like" for a pair when it is referenced by at least
# max(RELEASE_LIKE_MIN_HITS, RELEASE_LIKE_FRACTION * #commits) commits.
# Named once so the threshold and the rule recorded in the output cannot drift.
RELEASE_LIKE_MIN_HITS = 5
RELEASE_LIKE_FRACTION = 0.2

for cmp_path in pairs_files:
    pair_payload = read_json(cmp_path)
    if not pair_payload:
        continue

    series = cmp_path.parent.name
    base = pair_payload.get("base")
    comp = pair_payload.get("compare")
    stem = pair_stem(base, comp)
    out_path = pair_json(series, stem, "commits")

    # Check before doing any network work; write_json would skip as well, but
    # only after all the API calls for this pair had already been spent.
    if out_path.exists() and not OVERWRITE:
        log(f"Skip (exists): {out_path}")
        skipped += 1
        continue

    compare_commits = pair_payload.get("commits") or []
    compare_sha_set = {c.get("sha") for c in compare_commits if c.get("sha")}

    items = []

    # Pass 1 — collect raw refs and titles; build PR frequency
    pr_freq = {}
    titles = {}
    raw_refs = {}

    for c in compare_commits:
        sha = c.get("sha")
        if not sha:
            continue

        title, full_msg = get_commit_message(OWNER, NAME, sha)
        titles[sha] = title

        msg_refs = pr_refs_from_message(full_msg)
        rest_refs = prs_via_rest_associated(OWNER, NAME, sha)
        # GraphQL is only a fallback for commits the REST endpoint knows nothing about.
        gql_refs = [] if rest_refs else prs_via_graphql_associated(OWNER, NAME, sha)

        merged = merge_pr_refs(rest_refs, gql_refs, msg_refs)
        raw_refs[sha] = merged

        for r in merged:
            n = r.get("number")
            if n is not None:
                pr_freq[n] = pr_freq.get(n, 0) + 1

    # Detect "release-like" PRs that show up on many commits
    many_threshold = max(
        RELEASE_LIKE_MIN_HITS,
        int(RELEASE_LIKE_FRACTION * max(1, len(compare_commits))),
    )
    release_like_prs = {n for (n, f) in pr_freq.items() if f >= many_threshold}

    # The merge-commit check depends only on the PR number and this pair's SHA
    # set, so its result is reused across all commits of the pair instead of
    # re-fetching the same PR once per referencing commit. A failed lookup is
    # cached as (False, None) too, so it is not retried within the pair.
    merge_check_cache = {}

    # Pass 2 — verify and suppress as configured
    for c in compare_commits:
        sha = c.get("sha")
        title = titles.get(sha, "")
        refs = raw_refs.get(sha, [])

        # First, optional verification for each PR ref
        verified_refs = []
        for r in refs:
            n = r.get("number")
            meta = {"verified": False, "verification_method": "none"}
            if isinstance(n, int):
                # 1) check pr_commits
                if pr_commits_contains_sha(OWNER, NAME, n, sha):
                    meta = {"verified": True, "verification_method": "pr_commits"}
                else:
                    # 2) check merge_commit_sha in compare range (squash/merge)
                    if n not in merge_check_cache:
                        merge_check_cache[n] = pr_merge_commit_in_compare(OWNER, NAME, n, compare_sha_set)
                    ok, _merge_sha = merge_check_cache[n]
                    if ok:
                        meta = {"verified": True, "verification_method": "merge_commit_in_range"}
            verified_refs.append({**r, "meta": meta})

        # Apply the release-like suppression: release-like PRs are kept only on
        # commits whose own title looks like a bump/release.
        if is_bump_like_title(title):
            keep = verified_refs
        else:
            keep = [r for r in verified_refs if r.get("number") not in release_like_prs]

        items.append({
            "sha": sha,
            "commit_title": title,
            "pr_refs": keep
        })

    payload = {
        "repo": REPO,
        "base": base,
        "compare": comp,
        "series": series,
        "items": items,
        "pair_annotations": {
            "release_like_prs": sorted(release_like_prs),
            "release_like_rule": {"min_hits": many_threshold, "percent_of_commits": RELEASE_LIKE_FRACTION}
        }
    }
    write_json(out_path, payload)

    written += 1

print(f"Done. Written: {written} | Skipped: {skipped}")
rate_limit_snapshot()


✓ Wrote mastodon/commits/v4.0/v4.0.0...v4.0.1.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.0rc1...v4.0.0rc2.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.0rc2...v4.0.0rc3.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.0rc3...v4.0.0rc4.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.0rc4...v4.0.0.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.1...v4.0.2.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.10...v4.0.11.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.11...v4.0.12.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.12...v4.0.13.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.13...v4.0.14.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.14...v4.0.15.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.2...v4.0.3.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.3...v4.0.4.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.4...v4.0.5.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.5...v4.0.6.commits.json
✓ Wrote mastodon/commits/v4.0/v4.0.6...v4.0.7.commits.json
✓ Wrote mastodon/commits/

{'limit': 5000,
 'remaining': 3791,
 'reset_epoch': 1761488167,
 'reset_iso': '2025-10-26T14:16:07+00:00'}