In [None]:
import os, time, json, argparse, datetime as dt
from typing import Dict, Optional, Set, Tuple, List
import requests

In [None]:
# --- Time window and output location ---------------------------------------
# YEARS_BACK counts calendar years and includes END_YEAR itself.
YEARS_BACK = 20
END_YEAR = dt.date.today().year
OUT_DIR = "/kaggle/working/inspire_jobs"

# --- Endpoint and request pacing -------------------------------------------
BASE_URL = "https://inspirehep.net/api/jobs"
PAGE_SIZE = 1000  # API max per page
SLEEP_S = 0.35  # pause between pages; be polite, helps with rate limits
TIMEOUT_S = 60  # per-request timeout, in seconds
RETRY_BACKOFF = [2, 4, 8]  # seconds to wait before each retry of a transient error

# --- Metadata fields requested for every record ----------------------------
# Email-bearing fields (e.g. contact_details, reference_letters) are left out
# on purpose for ToU compliance.
FIELDS = [
    "position",
    "ranks",
    "regions",
    "status",
    "deadline_date",
    "arxiv_categories",
    "accelerator_experiments",
    "institutions",
    "description",
    "urls",
    "public_notes",
    "external_job_identifier",
    "external_system_identifiers",
    "control_number",
    "legacy_creation_date",
    "legacy_version",
    "deleted",
    "deleted_records",
]

# Sent with every request so the service can identify this client.
HEADERS = {
    "User-Agent": "kaggle-inspire-jobs-downloader/1.0 (+https://www.kaggle.com/)",
}

In [None]:
# ---------------- Helper functions --------------
def month_ranges(year: int) -> List[Tuple[str, str]]:
    """Return the first and last day of every month in ``year``.

    Args:
        year: Calendar year to split.

    Returns:
        Twelve ``(first_day, last_day)`` tuples in January..December order,
        each date rendered as an ISO ``YYYY-MM-DD`` string.
    """
    one_day = dt.timedelta(days=1)
    spans: List[Tuple[str, str]] = []
    for month in range(1, 13):
        first_day = dt.date(year, month, 1)
        # The last day of a month is the day before the first of the next one;
        # December rolls over into January of the following year.
        if month == 12:
            next_first = dt.date(year + 1, 1, 1)
        else:
            next_first = dt.date(year, month + 1, 1)
        last_day = next_first - one_day
        spans.append((first_day.isoformat(), last_day.isoformat()))
    return spans

def build_params(q_str: str, first_page: bool = True) -> Dict[str, str]:
    """Build the query-string parameters for a jobs search request.

    Args:
        q_str: Search query, passed through as the ``q`` parameter.
        first_page: When false, an empty dict is returned so that no extra
            parameters are attached to the request.

    Returns:
        The parameter dict for a first-page request, or ``{}`` otherwise.
    """
    if first_page:
        return dict(
            q=q_str,
            size=str(PAGE_SIZE),
            fields=",".join(FIELDS),
            sort="mostrecent",  # jobs also supports 'deadline'
        )
    return {}

def get_json(url: str, params: Optional[Dict[str, str]] = None) -> Dict:
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    One initial attempt is made, followed by one retry per entry in
    ``RETRY_BACKOFF`` (sleeping that many seconds before the retry). An attempt
    is retried when it ends in:

    * a connection error or timeout raised by ``requests``,
    * HTTP 429 (an extra 6 second pause is added before the next attempt),
    * any HTTP 5xx status.

    Args:
        url: Absolute URL to fetch.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON document of the first successful response.

    Raises:
        requests.HTTPError: On a non-retryable 4xx status, or when the final
            attempt still answers 429/5xx.
        requests.ConnectionError, requests.Timeout: When the final attempt
            still fails at the network level.
    """
    delays = [0] + RETRY_BACKOFF
    final_attempt = len(delays) - 1
    resp = None
    for attempt, backoff in enumerate(delays):
        if backoff:
            time.sleep(backoff)
        try:
            resp = requests.get(url, headers=HEADERS, params=params, timeout=TIMEOUT_S)
        except (requests.ConnectionError, requests.Timeout):
            # Network hiccups are as transient as a 5xx: retry them, but let
            # the error surface once the retry budget is spent.
            if attempt == final_attempt:
                raise
            continue
        if resp.status_code == 429:
            # Wait out the rate-limit window, except when no attempt is left
            # and the sleep would only delay the error.
            if attempt < final_attempt:
                time.sleep(6)
            continue
        if 500 <= resp.status_code < 600:
            continue
        resp.raise_for_status()
        return resp.json()
    # Retries exhausted: the last response was a 429/5xx, so this raises.
    resp.raise_for_status()
    return {}

def count_only(query: str) -> int:
    """Return the number of records the search reports for ``query``.

    Only the ``hits.total`` value of the response is used, so the request asks
    for a single record carrying a single field rather than a full page of
    ``PAGE_SIZE`` complete records.

    Args:
        query: Search query string.

    Returns:
        The reported total; ``0`` when the response carries no total.
    """
    params = {
        **build_params(query, first_page=True),
        # Keep the payload tiny: the hits themselves are discarded.
        "size": "1",
        "fields": "control_number",
        "page": "1",
    }
    data = get_json(BASE_URL, params)
    total = data.get("hits", {}).get("total", 0)
    # The total may be a bare integer or an object of the form {"value": N}.
    if isinstance(total, dict):
        return int(total.get("value", 0) or 0)
    return int(total or 0)

def page_through(query: str, writer, ids_seen: Set[str]) -> int:
    """Stream every hit for ``query`` to ``writer`` as JSON Lines.

    The first request goes to ``BASE_URL`` with the parameters from
    ``build_params``; every following request uses the ``links.next`` URL of
    the previous response with no extra parameters. Paging stops when a
    response has no ``links.next``.

    Args:
        query: Search query string.
        writer: Object with a ``write(str)`` method; receives one JSON document
            per line.
        ids_seen: Ids already written. Hits whose id is in the set are skipped;
            ids of newly written hits are added. The set is mutated in place.

    Returns:
        Number of hits written by this call.
    """
    url = BASE_URL
    params = build_params(query, first_page=True)
    wrote = 0
    while url:
        data = get_json(url, params)
        params = None  # subsequent: follow absolute links.next
        for hit in data.get("hits", {}).get("hits", []):
            raw_id = hit.get("id") or hit.get("metadata", {}).get("control_number")
            if raw_id:
                # The two id sources need not share a type; compare them as
                # strings so the same record is recognised either way.
                rec_id = str(raw_id)
                if rec_id in ids_seen:
                    continue
                ids_seen.add(rec_id)
            # Hits without any id cannot be deduplicated and are always kept.
            writer.write(json.dumps(hit, ensure_ascii=False) + "\n")
            wrote += 1
        url = data.get("links", {}).get("next")
        if url:
            time.sleep(SLEEP_S)
    return wrote

def download_year(year: int, out_dir: str) -> int:
    """Download all job records with a deadline in ``year`` into one file.

    The records go to ``<out_dir>/jobs_<year>.jsonl`` (the directory is created
    when missing, an existing file is overwritten). If the estimated result
    count exceeds 9500 the year is fetched month by month instead of with a
    single query; records are deduplicated across the monthly queries.

    Args:
        year: Calendar year used for the ``deadline_date`` range.
        out_dir: Directory that receives the output file.

    Returns:
        Number of records written.
    """
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"jobs_{year}.jsonl")
    whole_year_q = f"deadline_date:[{year}-01-01 TO {year}-12-31]"

    print(f"\n=== {year} ===")
    total_est = count_only(whole_year_q)
    print(f"Estimated results: {total_est}")

    seen: Set[str] = set()
    needs_split = total_est > 9500  # stay well under the ~10k/search ceiling
    with open(out_path, "w", encoding="utf-8") as sink:
        if needs_split:
            print("Large year; splitting by month to avoid per-query cap…")
            written = 0
            for first_day, last_day in month_ranges(year):
                month_q = f"deadline_date:[{first_day} TO {last_day}]"
                month_est = count_only(month_q)
                print(f"  {first_day}..{last_day}: ~{month_est}")
                written += page_through(month_q, sink, seen)
        else:
            written = page_through(whole_year_q, sink, seen)

    print(f"Wrote {written} records → {out_path}")
    return written

In [None]:
# Download one JSONL file per year, oldest year first.
start_year = END_YEAR - YEARS_BACK + 1
print(f"Downloading INSPIRE HEP job ads {start_year}..{END_YEAR} → {OUT_DIR}")
totals = {}
for y in range(start_year, END_YEAR + 1):
    totals[y] = download_year(y, OUT_DIR)

# Per-year record counts; the dict was filled in ascending year order.
print("\nSummary (records per file):")
for y, n_records in totals.items():
    print(f"  {y}: {n_records}")

# List every JSONL file now present in the output directory.
print("\nFiles written:")
jsonl_names = sorted(name for name in os.listdir(OUT_DIR) if name.endswith(".jsonl"))
for fn in jsonl_names:
    print(" ", os.path.join(OUT_DIR, fn))