In [1]:
import json
import os
import re
import time

import pandas as pd
import requests
from rapidfuzz import fuzz
from tqdm import tqdm


In [2]:
# Load the paper table; later cells read its "title", "authors" and "pdf_link" columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_csv(
    filepath_or_buffer='/Users/ainsleylewis/Documents/Astronomy/arXiver/FINAL_ARXIV_2025_copy_updated.csv'
)

In [3]:
# Base URL that every OpenAlex request in this notebook is built from.
OPENALEX = "https://api.openalex.org"
# HTTP headers that oa_get attaches to every request.
HEADERS = {"User-Agent": "astro-citation-pipeline/1.0"}
# Seconds oa_get sleeps before each request.
REQUEST_DELAY = 0.12   # ~8 req/s (safe on laptop)
# count_nonself_citations returns None (instead of paging through citing works)
# when a paper's total citation count exceeds this value.
MAX_CITATIONS_FETCH = 300   # above this → skip self-cite filtering

In [4]:
def extract_arxiv_id(pdf_link):
    """Extract the arXiv identifier from an arxiv.org ``/abs/`` or ``/pdf/`` URL.

    Handles new-style identifiers (``2501.01234``, optionally with a ``vN``
    version suffix) and old-style ones (``astro-ph/0601001``). A trailing
    ``.pdf`` extension or stray trailing dots are not part of the identifier
    and are removed.

    Args:
        pdf_link: URL string; any non-string value (e.g. NaN from pandas)
            is treated as "no link".

    Returns:
        The identifier as a string, or None when no identifier can be found.
    """
    if not isinstance(pdf_link, str):
        return None

    m = re.search(
        r'arxiv\.org/(?:pdf|abs)/'
        # Old-style "archive[.SC]/NNNNNNN[vN]" first, otherwise the generic token.
        r'([a-z\-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?|[\w\.]+)',
        pdf_link,
    )
    if not m:
        return None

    arxiv_id = m.group(1)
    # PDF links often end in ".pdf", which the generic token would swallow.
    if arxiv_id.lower().endswith('.pdf'):
        arxiv_id = arxiv_id[:-4]
    arxiv_id = arxiv_id.rstrip('.')
    return arxiv_id or None

In [5]:
import urllib.parse

def oa_get(url, params=None, timeout=30, max_retries=3):
    """GET a JSON document from the OpenAlex API with throttling and retries.

    Query parameters are handed to ``requests`` unmodified: ``requests``
    percent-encodes them itself, so quoting them beforehand would encode the
    value twice (``%20`` would be sent as ``%2520``).

    Args:
        url: Full endpoint URL.
        params: Optional dict of query parameters.
        timeout: Seconds to wait for the server before giving up.
        max_retries: Extra attempts made when the server answers with a
            rate-limit (429) or transient server error (500/502/503/504).

    Returns:
        The decoded JSON body, or None when the server answers HTTP 400.

    Raises:
        requests.HTTPError: for any other error status once retries are
            exhausted.
        requests.RequestException: on connection problems or timeouts.
    """
    retryable = {429, 500, 502, 503, 504}
    attempts = max(0, max_retries) + 1

    for attempt in range(attempts):
        time.sleep(REQUEST_DELAY)

        r = requests.get(url, params=params, headers=HEADERS, timeout=timeout)
        if r.status_code == 400:
            return None

        if r.status_code in retryable and attempt < attempts - 1:
            # Exponential back-off: 1 s, 2 s, 4 s, ...
            time.sleep(2 ** attempt)
            continue

        r.raise_for_status()
        return r.json()


In [6]:
def sanitize_title(title, max_len=200):
    """Return the first ``max_len`` characters of ``title``.

    Anything that is not a string yields the empty string.
    """
    return title[:max_len] if isinstance(title, str) else ""

In [7]:
import re
import unicodedata

def normalize_title(title):
    """Reduce a paper title to a lowercase, punctuation-free search string.

    Steps: Unicode NFKD decomposition, removal of combining marks (so an
    accented letter becomes its base letter instead of being split off as
    punctuation), removal of ``$...$`` LaTeX math, replacement of punctuation
    with spaces, whitespace collapsing, lowercasing.

    Args:
        title: Title text; non-string input yields the empty string.

    Returns:
        The normalized title.
    """
    if not isinstance(title, str):
        return ""
    title = unicodedata.normalize("NFKD", title)
    # NFKD turns "ö" into "o" + a combining mark; drop the mark, otherwise the
    # punctuation rule below would replace it with a space and split the word.
    title = "".join(ch for ch in title if not unicodedata.combining(ch))
    title = re.sub(r"\$.*?\$", "", title)   # remove LaTeX math
    title = re.sub(r"[^\w\s]", " ", title)  # remove punctuation
    title = re.sub(r"\s+", " ", title)
    return title.strip().lower()

In [8]:
def resolve_by_arxiv(arxiv_id):
    """Fetch the OpenAlex work matching an arXiv identifier.

    Issues one ``/works`` query filtered on ``ids.arxiv`` and limited to a
    single result.

    Returns:
        The first result dict, or None when the query fails or is empty.
    """
    # NOTE(review): confirm that `ids.arxiv` is a filter OpenAlex accepts.
    # oa_get maps HTTP 400 to None, so a rejected filter would be
    # indistinguishable from "no such paper" here.
    payload = oa_get(
        f"{OPENALEX}/works",
        {"filter": f"ids.arxiv:{arxiv_id}", "per-page": 1},
    )
    hits = payload.get("results") if payload else None
    return hits[0] if hits else None


In [9]:
def _best_title_match(norm_title, candidates, min_score):
    """Return the candidate whose normalized title is closest to ``norm_title``, or None if none reaches ``min_score``."""
    best, best_score = None, -1.0
    for cand in candidates:
        cand_title = cand.get("title") or cand.get("display_name")
        if not isinstance(cand_title, str):
            continue
        score = fuzz.ratio(norm_title, normalize_title(cand_title))
        # Strict ">" keeps the earlier (higher-ranked) candidate on ties.
        if score > best_score:
            best, best_score = cand, score
    return best if best_score >= min_score else None


def resolve_by_title(title, authors, min_score=85):
    """Find the OpenAlex work whose title matches ``title``.

    Two queries are tried in order: a ``title.search`` filter, then a general
    ``search``. From each result page the candidate with the highest title
    similarity is taken, and it is accepted only if the similarity reaches
    ``min_score``; this prevents an unrelated top hit from being attached to
    the paper.

    Args:
        title: Paper title as it appears in the input table.
        authors: Accepted for interface compatibility; not used for matching.
        min_score: Minimum similarity (0-100) between the normalized query
            title and a candidate's normalized title.

    Returns:
        The matching work dict, or None when nothing acceptable is found
        (including when the title is empty after normalization).
    """
    norm_title = normalize_title(title) if isinstance(title, str) else ""
    if not norm_title:
        # Nothing to search for; avoid sending an empty query.
        return None

    queries = (
        {"filter": f"title.search:{norm_title}", "per-page": 10},
        # Fallback: full-text search
        {"search": norm_title, "per-page": 5},
    )

    for params in queries:
        res = oa_get(f"{OPENALEX}/works", params)
        if res and res.get("results"):
            match = _best_title_match(norm_title, res["results"], min_score)
            if match is not None:
                return match

    return None


In [10]:
def get_affiliations(work):
    """Collect the distinct institution names attached to a work's authors.

    Missing or null ``authorships`` / ``institutions`` entries and
    institutions without a ``display_name`` are skipped.

    Args:
        work: OpenAlex work dict.

    Returns:
        Alphabetically sorted list of unique institution names, so the
        output is the same from run to run.
    """
    names = set()
    for authorship in work.get("authorships") or []:
        for inst in authorship.get("institutions") or []:
            name = inst.get("display_name")
            if name:
                names.add(name)
    return sorted(names)

In [11]:
def get_author_ids(work):
    """Return the set of author ids found in a work's ``authorships``.

    Authorship entries with a missing or empty ``author`` are skipped.
    """
    ids = set()
    for authorship in work.get("authorships", []):
        author = authorship.get("author")
        if author:
            ids.add(author["id"])
    return ids

In [12]:
def count_nonself_citations(work_id, original_authors, total_cites):
    """Count citing works that share no author with the cited work.

    Args:
        work_id: OpenAlex id of the cited work.
        original_authors: Collection of author ids of the cited work.
        total_cites: Total citation count reported for the work.

    Returns:
        The number of citing works with no author in common with
        ``original_authors``; 0 immediately when ``total_cites`` is 0; None
        when ``total_cites`` exceeds ``MAX_CITATIONS_FETCH`` (the caller
        flags those papers for later handling).

    Raises:
        RuntimeError: when a citation page cannot be retrieved because
            ``oa_get`` returned None.
    """
    if total_cites is not None and total_cites > MAX_CITATIONS_FETCH:
        return None  # mark for ADS later
    if total_cites == 0:
        # No citing works to page through; skip the request entirely.
        return 0

    cursor = "*"
    nonself = 0

    while cursor:
        params = {
            "filter": f"cites:{work_id}",
            "per-page": 200,
            "cursor": cursor
        }
        res = oa_get(f"{OPENALEX}/works", params)
        if res is None:
            raise RuntimeError(
                f"OpenAlex rejected the citation query for {work_id}"
            )

        for citing in res.get("results", []):
            # isdisjoint accepts any iterable, so original_authors may be a
            # set or a list.
            if get_author_ids(citing).isdisjoint(original_authors):
                nonself += 1

        # A missing or null next_cursor ends the paging loop.
        cursor = res.get("meta", {}).get("next_cursor")

    return nonself

In [13]:
checkpoint_file = "openalex_results.json"
results = []

# Resume from an earlier run: one stored record per processed row, so the
# number of stored records is the index of the next row to process.
try:
    with open(checkpoint_file) as fh:
        results = json.load(fh)
except FileNotFoundError:
    # No checkpoint yet: start from the first row.
    start_idx = 0
else:
    start_idx = len(results)

In [14]:

def _save_checkpoint(records, path):
    """Write ``records`` to ``path`` as JSON via a temp file and rename, so an interrupted write cannot truncate the checkpoint."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w") as f:
        json.dump(records, f, indent=2)
    os.replace(tmp_path, path)


pbar = tqdm(
    range(start_idx, len(data)),
    initial=start_idx,
    total=len(data),
    desc="Processing papers",
    unit="paper"
)

try:
    for i in pbar:
        row = data.iloc[i]

        title = sanitize_title(row["title"])
        authors = row["authors"]
        pdf_link = row["pdf_link"]

        # Exactly one record is appended per row, which is what lets the
        # resume cell use len(results) as the next row index.
        try:
            arxiv_id = extract_arxiv_id(pdf_link)
            work = resolve_by_arxiv(arxiv_id) if arxiv_id else None

            if not work:
                work = resolve_by_title(title, authors)

            if not work:
                record = {
                    "title": title,
                    "status": "not_found"
                }
            else:
                work_id = work["id"]
                total_cites = work.get("cited_by_count", 0)
                affiliations = get_affiliations(work)
                author_ids = get_author_ids(work)

                nonself = count_nonself_citations(
                    work_id, author_ids, total_cites
                )

                record = {
                    "title": title,
                    "openalex_id": work_id,
                    "affiliations": affiliations,
                    "citations_total": total_cites,
                    "citations_nonself": nonself,
                    "needs_ads": nonself is None
                }
                pbar.set_postfix({
                    "cites": total_cites,
                    "ADS": nonself is None})

        except Exception as e:
            # Best effort: keep the failure with the row and move on.
            record = {
                "title": title,
                "error": str(e)
            }

        results.append(record)

        # checkpoint every 50 rows
        if i % 50 == 0:
            _save_checkpoint(results, checkpoint_file)
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so rows processed since the last periodic checkpoint are not lost.
    _save_checkpoint(results, checkpoint_file)


Processing papers:   0%|          | 0/18272 [00:00<?, ?paper/s]

Processing papers:   0%|          | 3/18272 [00:10<17:56:58,  3.54s/paper, cites=0, ADS=0]


KeyboardInterrupt: 

In [15]:
# Display the records accumulated in memory by the processing loop.
results

[{'title': 'Insights on Galaxy Evolution from Interpretable Sparse Feature Networks',
  'openalex_id': 'https://openalex.org/W4407556519',
  'affiliations': [],
  'citations_total': 0,
  'citations_nonself': 0,
  'needs_ads': False},
 {'title': 'X-ray reverberation black hole mass and distance estimates of Cygnus X-1',
  'openalex_id': 'https://openalex.org/W4406779223',
  'affiliations': [],
  'citations_total': 0,
  'citations_nonself': 0,
  'needs_ads': False},
 {'title': 'The TES-based Cryogenic AntiCoincidence Detector of ATHENA X-IFU: Validation of the thermal end-to-end simulator towards the updated Demonstration Model (DM 1.1)',
  'status': 'not_found'}]