In [39]:
# %pip install -q -U google-genai

In [40]:
# === Bootstrap cell: shared config, paths, JSON I/O, and GitHub REST/GraphQL helpers ===

import os, json, time, re, random, pathlib
from typing import Dict, Any, Optional, Tuple, List
from datetime import datetime, timezone

import requests
import yaml
from dotenv import load_dotenv

# -------------------------------------------------------------------
# Locate repo root reliably (works no matter where you open the .ipynb)
# -------------------------------------------------------------------
def _find_repo_root(start: pathlib.Path) -> pathlib.Path:
    cur = start.resolve()
    for _ in range(8):  # walk up to 8 levels
        if (cur / "config" / "config.yaml").exists():
            return cur
        cur = cur.parent
    raise FileNotFoundError("Couldn't locate repo root (no config/config.yaml found upward).")

# Repo root is discovered from the notebook's current working directory.
REPO_ROOT = _find_repo_root(pathlib.Path.cwd())
CONFIG_PATH = REPO_ROOT / "config" / "config.yaml"

# -------------------------------------------------------------------
# Load config + token
# -------------------------------------------------------------------
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    # An empty YAML document parses to None; fall back to an empty dict.
    CFG: Dict[str, Any] = yaml.safe_load(f) or {}

load_dotenv(REPO_ROOT / ".env")  # local only; do not commit .env
# Name of the environment variable that holds the token (default: GITHUB_TOKEN).
TOKEN_ENV = (CFG.get("github_token_env") or "GITHUB_TOKEN").strip()
GITHUB_TOKEN = os.getenv(TOKEN_ENV, "").strip()
if not GITHUB_TOKEN:
    raise RuntimeError(f"Missing token in environment variable {TOKEN_ENV}. "
                       f"Create .env from .env.example and set {TOKEN_ENV}=...")

# Repo + output root
# "repo" is mandatory (KeyError when absent) and must contain a "/":
# the two-name unpacking below raises ValueError otherwise.
REPO = CFG["repo"]  # "owner/name"
OWNER, NAME = REPO.split("/", 1)
# A relative out_root (including the NAME default) is resolved against the
# process working directory wherever it is later used.
OUT_ROOT = pathlib.Path(CFG.get("out_root") or NAME)  # "<name>" if null

# Behavior
# The defaults below apply only when the key is missing; a key that is present
# but null yields bool(None) == False here.
OVERWRITE = bool(CFG.get("overwrite", True))
VERBOSE = bool(CFG.get("verbose_logs", True))

# Networking knobs
# int() raises TypeError if a key is present but null in the YAML.
REQ_TIMEOUT = int(CFG.get("request_timeout_sec", 30))
MAX_RETRIES = int(CFG.get("max_retries", 4))
BACKOFF_BASE_MS = int(CFG.get("backoff_base_ms", 400))
BACKOFF_JITTER_MS = int(CFG.get("backoff_jitter_ms", 250))
RESPECT_RL = bool(CFG.get("respect_rate_limits", True))

# -------------------------------------------------------------------
# Tiny logging helpers
# -------------------------------------------------------------------
def log(msg: str) -> None:
    """Print `msg` (flushed immediately) only when VERBOSE is truthy."""
    if VERBOSE:
        print(msg, flush=True)

def warn(msg: str) -> None:
    """Print `msg` with a warning prefix; shown regardless of VERBOSE."""
    print(f"⚠️  {msg}", flush=True)

# Printed unconditionally so the resolved root is visible even with verbose logs off.
print("Using repo root:", REPO_ROOT)

# -------------------------------------------------------------------
# Canonical output layout helpers
# -------------------------------------------------------------------
def ensure_dir(p: pathlib.Path) -> None:
    """Create directory `p` and any missing parents; do nothing if it already exists."""
    p.mkdir(parents=True, exist_ok=True)

def repo_root() -> pathlib.Path:
    """Make sure the output root (OUT_ROOT) exists on disk, then return it."""
    ensure_dir(OUT_ROOT)
    return OUT_ROOT

def tags_all_json() -> pathlib.Path:
    """Return <output root>/tags/tags.all.json, creating the tags directory first."""
    target = repo_root().joinpath("tags", "tags.all.json")
    ensure_dir(target.parent)
    return target

def series_dir(kind: str, series: str) -> pathlib.Path:
    """Return <output root>/<kind>/<series>, creating the directory when missing."""
    folder = repo_root().joinpath(kind, series)
    ensure_dir(folder)
    return folder

def pair_stem(base: str, compare: str) -> str:
    """Build the "<base>...<compare>" stem used in per-pair file names."""
    return "{}...{}".format(base, compare)

def pair_json(series: str, stem: str, kind: str) -> pathlib.Path:
    """
    Return <output root>/<kind>/<series>/<stem>.<kind>.json.

    The containing directory is created as a side effect of series_dir().
    """
    # kind ∈ {"compare","commits","pulls","issues"}
    filename = f"{stem}.{kind}.json"
    return series_dir(kind, series).joinpath(filename)

def capsule_json(series: str, stem: str) -> pathlib.Path:
    """
    Return <output root>/commits_pr_issue/<series>/<stem>.tarce_artifacts.json.

    The containing directory is created as a side effect of series_dir().
    """
    # NOTE(review): "tarce" looks like a typo for "trace". It is part of the
    # on-disk file name, so renaming it would orphan files that were already
    # written — confirm with the other pipeline stages before changing it.
    return series_dir("commits_pr_issue", series) / f"{stem}.tarce_artifacts.json"

# -------------------------------------------------------------------
# JSON I/O
# -------------------------------------------------------------------
def read_json(path: pathlib.Path) -> Optional[Dict[str, Any]]:
    """Return the parsed JSON document stored at `path`, or None if the path does not exist."""
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    return None

def write_json(path: pathlib.Path, payload: Dict[str, Any]) -> None:
    """
    Serialize `payload` as indented UTF-8 JSON at `path`.

    The parent directory is created when missing. If the file already exists
    and OVERWRITE is off, nothing is written. Otherwise the data goes to a
    sibling "<name>.tmp" file first and is then moved over `path`, so readers
    never observe a half-written document.
    """
    ensure_dir(path.parent)
    if not OVERWRITE and path.exists():
        log(f"Skip (exists): {path}")
        return

    scratch = path.with_suffix(path.suffix + ".tmp")
    with scratch.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)
    scratch.replace(path)
    log(f"✓ Wrote {path}")

# -------------------------------------------------------------------
# GitHub HTTP session + rate-limit aware helpers
# -------------------------------------------------------------------
# One shared session so every GitHub call reuses the same default headers.
SESSION = requests.Session()
SESSION.headers.update({
    # Token read from the environment variable named by TOKEN_ENV.
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json",
    # Pins the REST API version requested from GitHub.
    "X-GitHub-Api-Version": "2022-11-28",
    "User-Agent": f"notebook-pipeline/{NAME}"
})

def _parse_reset_epoch(headers: Dict[str, Any]) -> Optional[int]:
    try:
        return int(headers.get("X-RateLimit-Reset") or headers.get("x-ratelimit-reset"))
    except Exception:
        return None

def _maybe_sleep_for_reset(resp: requests.Response) -> None:
    """
    Sleep until the rate-limit window resets when `resp` reports no remaining quota.

    Does nothing when RESPECT_RL is off, when the remaining-quota header is
    missing or non-numeric, when quota is still available, or when no usable
    reset time is present.
    """
    if not RESPECT_RL:
        return

    headers = resp.headers
    remaining = headers.get("X-RateLimit-Remaining") or headers.get("x-ratelimit-remaining")
    if remaining is None or not str(remaining).isdigit():
        return
    if int(remaining) > 0:
        return

    reset_epoch = _parse_reset_epoch(headers)
    if not reset_epoch:
        return

    # One extra second so we wake up after the reset rather than exactly on it.
    wait_s = max(0, reset_epoch - int(time.time())) + 1
    warn(f"Rate limit reached. Sleeping ~{wait_s}s until reset …")
    time.sleep(wait_s)

def _backoff_sleep(i: int) -> None:
    """
    Sleep for an exponentially growing delay.

    The delay is BACKOFF_BASE_MS * 2**i plus a random jitter drawn uniformly
    from [0, BACKOFF_JITTER_MS], both converted to seconds.
    """
    base_s = BACKOFF_BASE_MS / 1000.0
    jitter_s = random.uniform(0, BACKOFF_JITTER_MS / 1000.0)
    delay_s = (2 ** i) * base_s + jitter_s
    time.sleep(delay_s)

def rest_get_json(url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    GET `url` and return the decoded JSON body, with backoff + rate-limit handling.

    Retried (at most MAX_RETRIES additional attempts):
      - network-level failures raised by requests,
      - 5xx responses,
      - rate-limit responses: 429, or 403 carrying Retry-After or
        X-RateLimit-Remaining: 0,
      - bodies that are not valid JSON.

    Any other 4xx response (401, 404, 422, a plain 403, ...) raises
    immediately, because repeating the identical request cannot succeed.

    Raises:
        RuntimeError: on a non-retryable 4xx response, or once every attempt failed.
    """
    last_err = None
    for i in range(MAX_RETRIES + 1):
        # No point sleeping after the last attempt; we are about to give up.
        final_attempt = i == MAX_RETRIES

        try:
            r = SESSION.get(url, params=params, timeout=REQ_TIMEOUT)
        except requests.RequestException as e:
            last_err = str(e)
            if not final_attempt:
                _backoff_sleep(i)
            continue

        status = r.status_code
        retry_after = r.headers.get("Retry-After")
        quota_spent = str(r.headers.get("X-RateLimit-Remaining", "")).strip() == "0"

        # GitHub signals rate limiting with 429 *or* 403; a 403 without either
        # rate-limit marker is a genuine permission error and is not retried.
        if status == 429 or (status == 403 and (retry_after or quota_spent)):
            last_err = f"{status} {r.text[:200]}"
            if final_attempt:
                break
            if retry_after and retry_after.isdigit():
                wait_s = int(retry_after)
                warn(f"{status} received. Sleeping {wait_s}s per Retry-After …")
                time.sleep(wait_s)
            else:
                # Waits for the reset when the quota is spent (and RESPECT_RL
                # is on); the backoff guarantees we never retry in a tight loop.
                _maybe_sleep_for_reset(r)
                _backoff_sleep(i)
            continue

        if status >= 500:
            last_err = f"{status} {r.text[:200]}"
            if not final_attempt:
                _backoff_sleep(i)
            continue

        if status >= 400:
            # Deliberately raised outside any try block so it is not retried.
            raise RuntimeError(f"HTTP {status}: {r.text[:500]} (GET {url})")

        _maybe_sleep_for_reset(r)
        try:
            return r.json()
        except ValueError as e:
            # Truncated or non-JSON body; treat as transient.
            last_err = f"invalid JSON body: {e}"
            if not final_attempt:
                _backoff_sleep(i)

    raise RuntimeError(f"GET failed after retries: {url} :: {last_err}")

def gh_graphql(query: str, variables: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    """
    GitHub GraphQL POST with backoff + rate-limit handling.
    Endpoint: https://api.github.com/graphql

    Retried (at most MAX_RETRIES additional attempts):
      - network-level failures raised by requests,
      - 5xx responses,
      - rate-limit responses: 429, or 403 carrying Retry-After or
        X-RateLimit-Remaining: 0,
      - bodies that are not valid JSON,
      - responses whose JSON contains a top-level "errors" entry.

    Any other 4xx response raises immediately, because repeating the
    identical request cannot succeed.

    Returns:
        The full decoded response document (including its "data" member).

    Raises:
        RuntimeError: on a non-retryable 4xx response, or once every attempt failed.
    """
    url = "https://api.github.com/graphql"
    payload = {"query": query, "variables": variables or {}}
    last_err = None
    for i in range(MAX_RETRIES + 1):
        # No point sleeping after the last attempt; we are about to give up.
        final_attempt = i == MAX_RETRIES

        try:
            r = SESSION.post(url, json=payload, timeout=REQ_TIMEOUT)
        except requests.RequestException as e:
            last_err = str(e)
            if not final_attempt:
                _backoff_sleep(i)
            continue

        status = r.status_code
        retry_after = r.headers.get("Retry-After")
        quota_spent = str(r.headers.get("X-RateLimit-Remaining", "")).strip() == "0"

        # GitHub signals rate limiting with 429 *or* 403; a 403 without either
        # rate-limit marker is a genuine permission error and is not retried.
        if status == 429 or (status == 403 and (retry_after or quota_spent)):
            last_err = f"{status} {r.text[:200]}"
            if final_attempt:
                break
            if retry_after and retry_after.isdigit():
                wait_s = int(retry_after)
                warn(f"{status} received. Sleeping {wait_s}s per Retry-After …")
                time.sleep(wait_s)
            else:
                # Waits for the reset when the quota is spent (and RESPECT_RL
                # is on); the backoff guarantees we never retry in a tight loop.
                _maybe_sleep_for_reset(r)
                _backoff_sleep(i)
            continue

        if status >= 500:
            last_err = f"{status} {r.text[:200]}"
            if not final_attempt:
                _backoff_sleep(i)
            continue

        if status >= 400:
            # Deliberately raised outside any try block so it is not retried.
            raise RuntimeError(f"GraphQL HTTP {status}: {r.text[:500]}")

        _maybe_sleep_for_reset(r)
        try:
            data = r.json()
        except ValueError as e:
            # Truncated or non-JSON body; treat as transient.
            last_err = f"invalid JSON body: {e}"
            if not final_attempt:
                _backoff_sleep(i)
            continue

        if "errors" in data:
            last_err = f"GraphQL errors: {data['errors']}"
            if not final_attempt:
                _backoff_sleep(i)
            continue
        return data

    raise RuntimeError(f"GraphQL failed after retries: {last_err}")

def rate_limit_snapshot() -> Dict[str, Any]:
    """
    Fetch the REST "core" rate-limit bucket and log a one-line summary.

    Returns a dict with keys limit, remaining, reset_epoch and reset_iso
    (UTC ISO-8601, or None when no reset value was reported). Any failure is
    reported through warn() and results in an empty dict; nothing is raised.
    """
    try:
        payload = rest_get_json("https://api.github.com/rate_limit")
        core = payload.get("resources", {}).get("core", {})
        limit, remaining, reset = (core.get(k) for k in ("limit", "remaining", "reset"))

        if reset:
            reset_iso = datetime.fromtimestamp(reset, tz=timezone.utc).isoformat()
        else:
            reset_iso = None

        log(f"Rate limit: {remaining}/{limit}, resets at {reset_iso}")
        return {
            "limit": limit,
            "remaining": remaining,
            "reset_epoch": reset,
            "reset_iso": reset_iso,
        }
    except Exception as e:
        warn(f"Rate limit snapshot failed: {e}")
        return {}

# -------------------------------------------------------------------
# Light helpers for series + ordering
# -------------------------------------------------------------------
def semver_series(tag_name: str) -> str:
    """
    Map a tag such as 'v4.2.3' or '4.2.0-rc.1' to its 'vX.Y' series.

    The first "<digits>.<digits>" occurrence in the tag is used and leading
    zeros are dropped. Tags without such a pair (including None and "") map
    to 'v0.0'.
    """
    match = re.search(r'v?(\d+)\.(\d+)', tag_name or "")
    if match is None:
        return "v0.0"
    major, minor = (int(part) for part in match.groups())
    return f"v{major}.{minor}"

def sorted_pairs_by_tag_time(pairs: List[Tuple[str, str]], tag_index: Dict[str, Dict[str, Any]]) -> List[Tuple[str, str]]:
    """
    Return `pairs` ordered by (base timestamp, compare timestamp), ascending.

    Timestamps come from tag_index[tag]["tag_timestamp"] (ISO-8601, a trailing
    "Z" is accepted). Tags that are missing from the index, have no timestamp,
    or have an unparseable one sort after every known timestamp. The input
    list is not modified.
    """
    unknown = float("inf")

    def _epoch(tag: str) -> float:
        raw = (tag_index.get(tag) or {}).get("tag_timestamp")
        if not raw:
            return unknown
        try:
            return datetime.fromisoformat(raw.replace("Z", "+00:00")).timestamp()
        except Exception:
            return unknown

    ordered = list(pairs)
    ordered.sort(key=lambda pair: (_epoch(pair[0]), _epoch(pair[1])))
    return ordered

log("Bootstrap ready ✓")


Using repo root: /Users/rambodparsi/Desktop/OSS Repository Selection/web_scrapper/commit_pr_issue_analysis
Bootstrap ready ✓


### Message creation

In [41]:
from pathlib import Path
import os, textwrap, csv

print("Repo:", REPO)
print("repo_root():", repo_root())

# 1) Load the base prompt text (exact filename "prompt" at repo root)
prompt_path = Path(REPO_ROOT) / "prompt"
if not prompt_path.exists():
    raise FileNotFoundError(
        f"Couldn't find {prompt_path}. Create a text file named 'prompt' at the repo root."
    )

BASE_PROMPT = prompt_path.read_text(encoding="utf-8").strip()
print("Loaded prompt file:", prompt_path)
print("-" * 60)
print("Prompt preview (first 300 chars):")
print((BASE_PROMPT[:300] + ("…" if len(BASE_PROMPT) > 300 else "")))

# 2) Find per-series CSVs produced by M7
gi_dir = Path(repo_root()) / "gemini_input"
if not gi_dir.exists():
    raise FileNotFoundError(f"Not found: {gi_dir}. Run M7 first to create gemini_input JSON/CSV.")

series_csvs = sorted(gi_dir.glob("v*.csv"))
print("\nFound CSVs:", [p.name for p in series_csvs])

# Optional: respect series_whitelist from config
SERIES_WHITELIST = CFG.get("series_whitelist") or []
if isinstance(SERIES_WHITELIST, str):
    # A bare YAML scalar (series_whitelist: v4.3) would otherwise be turned
    # into a set of single characters and match no file at all.
    SERIES_WHITELIST = [SERIES_WHITELIST]
if SERIES_WHITELIST:
    before = len(series_csvs)
    allowed_series = set(SERIES_WHITELIST)  # built once instead of once per file
    series_csvs = [p for p in series_csvs if p.stem in allowed_series]
    print(f"Applied series_whitelist={SERIES_WHITELIST} → {len(series_csvs)}/{before} remain")

# Preview-only cell: an empty selection is reported, not treated as an error.
if not series_csvs:
    print("⚠️  No series CSVs found to preview.")


Repo: mastodon/mastodon
repo_root(): mastodon
Loaded prompt file: /Users/rambodparsi/Desktop/OSS Repository Selection/web_scrapper/commit_pr_issue_analysis/prompt
------------------------------------------------------------
Prompt preview (first 300 chars):
You are an expert GitHub developer and contributor for a web application. Your job is to analyze a GitHub issue (title + body) and classify it into one of exactly two classes: "UI" or "Other". Follow the class definitions below precisely. Then output a SINGLE JSON object in the required schema. Do n…

Found CSVs: ['v4.0.csv', 'v4.1.csv', 'v4.2.csv', 'v4.3.csv', 'v4.4.csv']
Applied series_whitelist=['v4.3'] → 1/5 remain


In [42]:
from typing import List, Dict

def _clean(s: str) -> str:
    if s is None:
        return ""
    # Collapse excessive whitespace/newlines while preserving paragraphs
    s = s.replace("\r", "")
    lines = [ln.strip() for ln in s.split("\n")]
    # Keep empty lines to preserve user formatting intent
    while lines and lines[0] == "":
        lines.pop(0)
    while lines and lines[-1] == "":
        lines.pop()
    return "\n".join(lines)

def build_messages_from_csv(csv_path: Path, base_prompt: str) -> List[Dict]:
    """
    Build one prompt message per CSV row that carries issue text.

    Each returned dict has the keys sha, issue_title, issue_body and message,
    where message is `base_prompt` followed by the cleaned title and body.
    Rows whose title and body are both empty after cleaning are skipped.
    Nothing is written to disk.
    """
    def _to_entry(record):
        # Columns expected from the M7 export: sha, issue_title, issue_body, issue_lable
        title = _clean(record.get("issue_title") or "")
        body = _clean(record.get("issue_body") or "")
        if not (title or body):
            return None
        return {
            "sha": (record.get("sha") or "").strip(),
            "issue_title": title,
            "issue_body": body,
            "message": f"{base_prompt}\n\nIssue title: {title}\nIssue body: {body}",
        }

    with open(csv_path, "r", encoding="utf-8", newline="") as fh:
        candidates = [_to_entry(record) for record in csv.DictReader(fh)]
    return [entry for entry in candidates if entry is not None]

# Build messages for each selected series (in memory)
# Keyed by the CSV file stem (e.g. "v4.3" for v4.3.csv); nothing is persisted here.
series_to_messages: Dict[str, List[Dict]] = {}
for csv_path in series_csvs:
    msgs = build_messages_from_csv(csv_path, BASE_PROMPT)
    series_to_messages[csv_path.stem] = msgs
    # The count excludes rows skipped for having neither a title nor a body.
    print(f"{csv_path.name}: prepared {len(msgs)} messages")


v4.3.csv: prepared 129 messages


In [43]:
from IPython.display import HTML, display

EXAMPLES_PER_SERIES = 3  # change freely

def _scroll_block(title, body, height=480):
    """
    Render `body` in a bordered, scrollable box under a bold `title`.

    Newlines are preserved, long lines wrap, and content taller than
    `height` pixels scrolls. Both `title` and `body` are HTML-escaped first:
    issue text routinely contains markup (<details>, <!-- comments -->, &),
    which would otherwise be interpreted by the browser instead of shown.
    """
    import html as _html  # local import; only this helper needs it

    safe_title = _html.escape(str(title))
    safe_body = _html.escape(str(body), quote=False)
    # preserves newlines; wraps long lines; scrolls if very long
    markup = f"""
    <div style="border:1px solid #ddd;margin:12px 0;padding:10px;">
      <div style="font-weight:600;margin-bottom:6px;">{safe_title}</div>
      <pre style="white-space:pre-wrap;overflow:auto;max-height:{height}px;margin:0;">
{safe_body}
      </pre>
    </div>
    """
    display(HTML(markup))

# Show the first EXAMPLES_PER_SERIES composed messages of every series.
for series, rows in series_to_messages.items():
    shown = min(EXAMPLES_PER_SERIES, len(rows))
    heading = f"<h3 style='margin-top:24px'>{series} — showing {shown} example messages</h3>"
    display(HTML(heading))
    for i, row in enumerate(rows[:EXAMPLES_PER_SERIES], start=1):
        _scroll_block(f"Example {i} (sha: {row['sha']})", row["message"])


### LLM calls

In [44]:
# --- M8: Gemini classification (resumable, throttled to RPM_LIMIT calls/min, progress bar) ---
import os, time, csv, json, re, unicodedata
from pathlib import Path
from collections import deque

# 0) Prompt: accept either 'prompt' or 'prompt.txt' at repo root
prompt_path = (Path(REPO_ROOT) / "prompt")
if not prompt_path.exists():
    alt = Path(REPO_ROOT) / "prompt.txt"
    if alt.exists():
        prompt_path = alt
if not prompt_path.exists():
    raise FileNotFoundError(f"Prompt file not found at {Path(REPO_ROOT) / 'prompt'} or 'prompt.txt'")
BASE_PROMPT = prompt_path.read_text(encoding="utf-8").strip()

# 1) Install / import Gemini client
# %pip is an IPython magic, so this fallback only works inside a notebook kernel.
try:
    import google.genai as genai
except Exception:
    %pip -q install "google-genai>=0.3.0"
    import google.genai as genai

# 1b) tqdm for progress
try:
    from tqdm.auto import tqdm
except Exception:
    %pip -q install "tqdm>=4.66.0"
    from tqdm.auto import tqdm

# 2) Read API key from the process environment
#    (the bootstrap cell loads .env into it via load_dotenv)
GEMINI_TOKEN = os.getenv("GEMINI_TOKEN", "").strip()
if not GEMINI_TOKEN:
    raise RuntimeError("GEMINI_TOKEN is missing. Add GEMINI_TOKEN=... to your .env and restart the kernel.")

# 3) Config
GEMINI_MODEL = "gemini-2.0-flash"   # or "gemini-2.0-pro"
TEMPERATURE = 0.0                   # deterministic label
# NOTE(review): this rebinds the MAX_RETRIES global set by the bootstrap cell,
# so rest_get_json / gh_graphql also use 3 retries once this cell has run.
MAX_RETRIES = 3
DRY_RUN_LIMIT_PER_SERIES = None     # e.g., 25 to test
RPM_LIMIT = 10                       # max requests per rolling 60-second window

# 4) Text sanitizer for LLM input
_ZERO_WIDTH_CODEPOINTS = [
    "\uFEFF", "\u00AD", "\u200B", "\u200C", "\u200D", "\u2060", "\u2063",
    "\u200E", "\u200F", "\u202A", "\u202B", "\u202D", "\u202E", "\u202C",
    "\u2066", "\u2067", "\u2068", "\u2069",
]
_ZW_PATTERN = re.compile("|".join(map(re.escape, _ZERO_WIDTH_CODEPOINTS)))
_SPACE_MAP = {"\u00A0": " ", "\u202F": " ", "\u2007": " "}

def clean_text_for_llm(text: str, collapse_spaces: bool = True, keep_newlines: bool = True) -> str:
    if text is None:
        return ""
    s = str(text).replace("\r\n", "\n").replace("\r", "\n")
    s = unicodedata.normalize("NFKC", s)
    for k, v in _SPACE_MAP.items():
        s = s.replace(k, v)
    s = _ZW_PATTERN.sub("", s)
    def _is_bad(ch: str) -> bool:
        cat = unicodedata.category(ch)
        if ch in ("\n", "\t"):
            return False
        return cat in ("Cc", "Cf")
    s = "".join(ch for ch in s if not _is_bad(ch))
    if collapse_spaces:
        s = s.replace("\t", " ")
        s = re.sub(r"[ ]{2,}", " ", s)
    s = "\n".join(line.rstrip(" ") for line in s.split("\n"))
    if keep_newlines:
        s = re.sub(r"\n{3,}", "\n\n", s)
    else:
        s = " ".join(s.split())
    s = s.encode("utf-8", "replace").decode("utf-8")
    return s

def compose_message(base_prompt: str, issue_title: str, issue_body: str) -> str:
    """
    Return the full prompt: `base_prompt`, a blank line, then the sanitized
    issue title and body on "Issue title:" / "Issue body:" lines.

    Sanitizing keeps the original spacing inside lines (collapse_spaces=False).
    """
    title, body = (
        clean_text_for_llm(field, collapse_spaces=False)
        for field in (issue_title, issue_body)
    )
    return f"{base_prompt}\n\nIssue title: {title}\nIssue body: {body}"

def call_gemini_classify(client: genai.Client, model: str, message: str) -> str:
    """
    Send `message` to `model` and return the stripped response text.

    Makes up to MAX_RETRIES attempts, pausing 2s, 4s, then 8s (capped) between
    consecutive attempts. There is no pause after the final failed attempt,
    since nothing is retried afterwards. When every attempt fails, a warning
    is printed and "" is returned; the caller stores that as an empty label,
    which the resume logic treats as "still to be classified".
    """
    last_err = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = client.models.generate_content(
                model=model,
                contents=message,
                config={"temperature": TEMPERATURE},
            )
            # resp.text may be missing or None (e.g. an empty candidate list).
            return (getattr(resp, "text", "") or "").strip()
        except Exception as e:
            last_err = e
            if attempt < MAX_RETRIES:
                time.sleep(min(2 ** attempt, 8))  # 2s, 4s, 8s (capped)
    print(f"⚠️  Gemini call failed after {MAX_RETRIES} retries: {last_err}")
    return ""

# 5) Sliding-window rate limiter (at most RPM_LIMIT calls per rolling minute)
class PerMinuteLimiter:
    """Allow at most `max_calls` in any rolling 60s window."""

    def __init__(self, max_calls: int):
        self.max_calls = int(max_calls)
        # time.time() stamps of the calls admitted so far, oldest first
        self._hits = deque()

    def wait_for_slot(self):
        """Block until another call is allowed, then record it."""
        window_s = 60.0
        hits = self._hits
        now = time.time()

        # Forget calls that have already left the rolling window.
        while hits and now - hits[0] >= window_s:
            hits.popleft()

        if len(hits) >= self.max_calls:
            # Window is full: wait until the oldest recorded call expires.
            pause = window_s - (now - hits[0]) + 0.01
            if pause > 0:
                time.sleep(pause)
            now = time.time()

        hits.append(now)

# Shared by every series so the per-minute budget spans the whole run.
limiter = PerMinuteLimiter(RPM_LIMIT)

# 6) Inputs & outputs
gi_dir = Path(repo_root()) / "gemini_input"
pred_dir = Path(repo_root()) / "gemini_pred"
pred_dir.mkdir(parents=True, exist_ok=True)

series_csvs = sorted(gi_dir.glob("v*.csv"))
SERIES_WHITELIST = CFG.get("series_whitelist") or []
if isinstance(SERIES_WHITELIST, str):
    # A bare YAML scalar (series_whitelist: v4.3) would otherwise be turned
    # into a set of single characters and match no file at all.
    SERIES_WHITELIST = [SERIES_WHITELIST]
if SERIES_WHITELIST:
    before = len(series_csvs)
    allowed_series = set(SERIES_WHITELIST)  # built once instead of once per file
    series_csvs = [p for p in series_csvs if p.stem in allowed_series]
    print(f"Applied series_whitelist={SERIES_WHITELIST} → {len(series_csvs)}/{before} remain")

# Unlike the preview cell, classification cannot proceed without input.
if not series_csvs:
    raise FileNotFoundError(f"No input CSVs found in {gi_dir}. Run M7 first.")

print("Input series:", [p.stem for p in series_csvs])

# 7) Run (resumable)
#
# Each series is written to "<series>.csv.tmp" and moved over "<series>.csv"
# only once every input row has been emitted. If the run is interrupted
# (Ctrl-C, quota error, ...), the rows not yet processed are carried over from
# the previous output before the move, so labels obtained in earlier runs are
# never lost and the next run resumes where this one stopped.
client = genai.Client(api_key=GEMINI_TOKEN)
written = 0


def _resume_key(row):
    """Identify a row by sha plus issue text, so several issues attached to the
    same commit do not overwrite each other's cached label."""
    return (
        (row.get("sha") or "").strip(),
        (row.get("issue_title") or "").strip(),
        (row.get("issue_body") or "").strip(),
    )


def _carry_over(prev, row, fieldnames):
    """Output row for `row` without calling the API: cached values when a
    previous row exists, otherwise the input row with an empty label."""
    if prev is None:
        out = dict(row)
        out["gemini_label"] = ""  # empty label = "classify me on the next run"
        return out
    return {k: prev.get(k, row.get(k, "")) for k in fieldnames}


for csv_path in series_csvs:
    series = csv_path.stem
    out_csv = pred_dir / f"{series}.csv"
    tmp_csv = out_csv.with_name(out_csv.name + ".tmp")

    # Read input rows
    with open(csv_path, "r", encoding="utf-8", newline="") as fh:
        in_rows = list(csv.DictReader(fh))

    if not in_rows:
        print(f"⚠️  Empty input: {csv_path.name}")
        continue

    # Load existing output (if any) to resume
    existing_by_key = {}
    if out_csv.exists():
        with open(out_csv, "r", encoding="utf-8", newline="") as fh_out_in:
            for prev in csv.DictReader(fh_out_in):
                key = _resume_key(prev)
                # On duplicate keys prefer the copy that already has a label.
                if key not in existing_by_key or (prev.get("gemini_label") or "").strip():
                    existing_by_key[key] = prev

    # Output columns = input columns (+ gemini_label). A None key appears when
    # a data row has more cells than the header; it is not a real column.
    fieldnames = [k for k in in_rows[0].keys() if k is not None]
    if "gemini_label" not in fieldnames:
        fieldnames += ["gemini_label"]

    total = len(in_rows)
    pbar = tqdm(total=total, desc=f"{series}", unit="row", leave=True)

    calls_made = 0
    wrote = 0
    reused = 0
    skipped_no_issue = 0
    newly_classified = 0
    aborted = None

    with open(tmp_csv, "w", encoding="utf-8", newline="") as fh_out:
        # extrasaction="ignore": surplus cells of ragged rows are dropped
        # instead of aborting the whole series with a ValueError.
        w = csv.DictWriter(fh_out, fieldnames=fieldnames, extrasaction="ignore")
        w.writeheader()

        try:
            for idx, r in enumerate(in_rows, start=1):
                ititle = (r.get("issue_title") or "").strip()
                ibody  = (r.get("issue_body")  or "").strip()

                # Reuse the cached row when it already has a label, or when
                # the row has no issue content (no API call is made for those).
                existing = existing_by_key.get(_resume_key(r))
                if existing:
                    existing_label = (existing.get("gemini_label") or "").strip()
                    if existing_label or (not ititle and not ibody):
                        # make sure all columns are present (in case input columns changed)
                        w.writerow(_carry_over(existing, r, fieldnames))
                        wrote += 1
                        reused += 1
                        pbar.update(1)
                        pbar.set_postfix_str(f"{total - idx} remain (reused {reused})")
                        continue

                # If no issue content → skip API, empty label
                if not ititle and not ibody:
                    r["gemini_label"] = ""
                    w.writerow(r)
                    wrote += 1
                    skipped_no_issue += 1
                    pbar.update(1)
                    pbar.set_postfix_str(f"{total - idx} remain (skipped {skipped_no_issue})")
                    continue

                # Optional dry-run limiter
                if DRY_RUN_LIMIT_PER_SERIES is not None and calls_made >= DRY_RUN_LIMIT_PER_SERIES:
                    r["gemini_label"] = ""
                    w.writerow(r)
                    wrote += 1
                    pbar.update(1)
                    pbar.set_postfix_str(f"{total - idx} remain (dry-run)")
                    continue

                # Compose & throttle
                msg = compose_message(BASE_PROMPT, ititle, ibody)
                limiter.wait_for_slot()

                # Call Gemini ("" on failure → retried by the next run)
                label = call_gemini_classify(client, GEMINI_MODEL, msg)
                r["gemini_label"] = label

                w.writerow(r)
                wrote += 1
                newly_classified += 1
                calls_made += 1

                # progress
                pbar.update(1)
                pbar.set_postfix_str(f"{total - idx} remain (new {newly_classified}, reused {reused})")
        except BaseException as exc:  # includes KeyboardInterrupt
            aborted = exc
            # Emit every row not written yet, keeping cached labels, so the
            # file that replaces the old output is complete.
            for pending in in_rows[wrote:]:
                w.writerow(_carry_over(existing_by_key.get(_resume_key(pending)), pending, fieldnames))

    # Atomic swap: the previous output stays intact until this point.
    tmp_csv.replace(out_csv)
    pbar.close()

    if aborted is not None:
        print(f"⚠️  {series}: interrupted after {wrote}/{total} rows; progress saved to {out_csv.name}")
        raise aborted

    print(f"✓ {series}: wrote {wrote} rows → {out_csv.name} | "
          f"API calls: {calls_made}, reused: {reused}, no-issue: {skipped_no_issue}, new: {newly_classified}")
    written += 1

print(f"Done. Series written: {written}  →  Output dir: {pred_dir}")


Applied series_whitelist=['v4.3'] → 1/5 remain
Input series: ['v4.3']


v4.3:  32%|███▏      | 213/656 [32:59<1:08:37,  9.29s/row, 443 remain (skipped 8)]
v4.3:   1%|▏         | 9/656 [00:14<22:25,  2.08s/row, 647 remain (skipped 8)]      

⚠️  Gemini call failed after 3 retries: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit.\n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 200\nPlease retry in 46.386059128s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash'}, 'quotaValue': '200'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.

KeyboardInterrupt: 

In [None]:
# Update the most recent CSV in gemini_pred with:
#   - gemini_pred_label: "No Issue" if no issue; else JSON["class"] from gemini_label ('' if missing)
#   - github_label: "No Issue" if no issue; else UI/Other/None based on labels
import csv, json, re
from pathlib import Path

pred_dir = Path(repo_root()) / "gemini_pred"
if not pred_dir.exists():
    raise FileNotFoundError(f"Not found: {pred_dir}. Run the Gemini prediction step first.")

# Pick the most recently modified v*.csv
# NOTE(review): selection is by modification time only and ignores
# series_whitelist, so the file updated here is not necessarily the series
# classified by the previous cell — check the printed target before relying on it.
csv_files = sorted(pred_dir.glob("v*.csv"), key=lambda p: p.stat().st_mtime, reverse=True)
if not csv_files:
    raise FileNotFoundError(f"No v*.csv files found in {pred_dir}")
target_csv = csv_files[0]
print(f"Target file: {target_csv.name}")

# --- helpers ---
# Body of the first ``` / ```json fenced block.
CODE_FENCE_RE = re.compile(r"```(?:json|JSON)?\s*(.*?)\s*```", re.DOTALL)
# Shortest {...} span. Kept for other cells that may reference it;
# extract_json_class no longer uses it because it cannot span nested objects.
FIRST_OBJECT_RE = re.compile(r"\{.*?\}", re.DOTALL)
# Literal "class": "<value>" pair, used as the last-resort fallback.
CLASS_PAIR_RE  = re.compile(r'"class"\s*:\s*"([^"]+)"')

def has_issue_text(title: str, body: str) -> bool:
    """Return True when the title or the body contains non-whitespace text."""
    return bool((title or "").strip() or (body or "").strip())

def extract_json_class(raw: str) -> str:
    """
    Return JSON['class'] from gemini_label (supports ```json ... ``` fences & noisy text).

    Strategy, first hit wins:
      1. strip a Markdown code fence if there is one;
      2. parse the whole text as JSON;
      3. decode a JSON object starting at each "{" in turn and take the first
         object that has a top-level "class" key (handles nested objects and
         braces inside strings, which a regex cannot);
      4. fall back to a plain "class": "..." regex match.

    Returns "" when nothing is found, when `raw` is None, or when the class
    value is JSON null.
    """
    if raw is None:
        return ""
    s = str(raw).strip()

    # 1) If fenced, strip the fence
    m = CODE_FENCE_RE.search(s)
    if m:
        s = m.group(1).strip()

    def _class_of(obj):
        # None means "this value carries no class key"; "" is a valid answer.
        if isinstance(obj, dict) and "class" in obj:
            val = obj.get("class")
            return "" if val is None else str(val)
        return None

    # 2) Try direct JSON parse
    try:
        found = _class_of(json.loads(s))
    except Exception:
        found = None
    if found is not None:
        return found

    # 3) Decode an object at each "{" until one exposes a top-level "class"
    decoder = json.JSONDecoder()
    start = s.find("{")
    while start != -1:
        try:
            obj, _end = decoder.raw_decode(s, start)
        except ValueError:
            obj = None
        found = _class_of(obj)
        if found is not None:
            return found
        start = s.find("{", start + 1)

    # 4) Last-resort regex
    m = CLASS_PAIR_RE.search(s)
    if m:
        return m.group(1)

    return ""

def derive_github_label_from_issue(labels_primary: str, labels_alt: str) -> str:
    """
    Labels → 'UI' / 'Other' / 'None'
    - 'UI' if any label equals 'ui' (case-insensitive)
    - 'Other' if labels exist but none are 'ui'
    - 'None' if no labels

    `labels_primary` is used when it holds non-blank text, else `labels_alt`.
    Accepted encodings: a JSON list of strings, a JSON list of label objects
    (the "name" member is used), a single JSON string or label object, or
    comma/semicolon-separated text.
    """
    data = labels_primary if (labels_primary and str(labels_primary).strip()) else labels_alt
    if not data or str(data).strip() == "":
        return "None"

    raw = str(data).strip()

    def _name(item):
        # Label objects carry their text under "name"; anything else is used as-is.
        if isinstance(item, dict) and "name" in item:
            return str(item.get("name") or "")
        return str(item)

    # Try JSON first
    try:
        obj = json.loads(raw)
    except Exception:
        # Fallback to splitting
        items = re.split(r"[;,]", raw)
    else:
        if isinstance(obj, list):
            items = [_name(x) for x in obj]
        elif isinstance(obj, (str, dict)):
            # A single label that happens to be valid JSON on its own.
            items = [_name(obj)]
        elif obj is None:
            items = []
        else:
            # Bare number / boolean: treat the cell as plain text.
            items = re.split(r"[;,]", raw)

    tokens = [t.strip().lower() for t in items if str(t).strip()]
    if not tokens:
        return "None"
    if any(t == "ui" for t in tokens):
        return "UI"
    return "Other"

# --- read, update in place ---
# --- read, update in place ---
# All rows are loaded first. The rewritten file goes to a sibling ".tmp" file
# and replaces the original only after every row was written, so a crash
# mid-write cannot leave a truncated CSV behind.
with open(target_csv, "r", encoding="utf-8", newline="") as fh:
    rdr = csv.DictReader(fh)
    rows = list(rdr)
    base_fields = list(rdr.fieldnames or [])

if not rows:
    print(f"⚠️  Empty file: {target_csv.name}")
else:
    out_fields = base_fields[:]
    for col in ("gemini_pred_label", "github_label"):
        if col not in out_fields:
            out_fields.append(col)

    total = len(rows)
    tmp_csv = target_csv.with_name(target_csv.name + ".tmp")
    with open(tmp_csv, "w", encoding="utf-8", newline="") as fh_out:
        w = csv.DictWriter(fh_out, fieldnames=out_fields)
        w.writeheader()

        for i, r in enumerate(rows, start=1):
            ititle = (r.get("issue_title") or "")
            ibody  = (r.get("issue_body")  or "")
            has_issue = has_issue_text(ititle, ibody)

            # gemini_pred_label
            if not has_issue:
                r["gemini_pred_label"] = "No Issue"
            else:
                r["gemini_pred_label"] = extract_json_class(r.get("gemini_label"))

            # github_label
            if not has_issue:
                r["github_label"] = "No Issue"
            else:
                r["github_label"] = derive_github_label_from_issue(
                    r.get("issue_lable"),  # column name as spelled by the M7 export
                    r.get("issue_label")   # fallback if present
                )

            w.writerow(r)

            # lightweight progress; padded so a shorter line fully overwrites
            # the previous one instead of leaving stray characters behind
            if i % 50 == 0 or i == total:
                status = f"{i}/{total} processed — {total - i} remain"
                print(f"{status:<60}", end="\r")

    tmp_csv.replace(target_csv)
    print(f"\n✓ Updated: {target_csv.name} — columns added/updated: gemini_pred_label, github_label (rows: {total})")


Target file: v4.2.csv
603/603 processed — 0 remainnn
✓ Updated: v4.2.csv — columns added/updated: gemini_pred_label, github_label (rows: 603)
