In [3]:
#!/usr/bin/env python3
"""
scopus_dblp_merge.py
====================
Retrieve author metrics from Scopus *and* DBLP, compute DBLP-based h-index,
and export a combined CSV.

Usage examples
--------------
# Single author, name only (exact or partial):
python scopus_dblp_merge.py "Andrew Y. Ng"

# Multiple authors, mixed names / explicit Scopus IDs:
python scopus_dblp_merge.py "Yoshua Bengio" 7004212771 "Hinton G."

Outputs
-------
 - author_metrics.csv  (summary table)
 - author_<scopusID>.json  (raw Scopus profile)
 - dblp_<pid>.json        (raw DBLP record list)
"""

import argparse, os, time, json, math, sys
from pathlib import Path

import requests, pandas as pd
from tqdm import tqdm
from pybliometrics.scopus import AuthorRetrieval

###############################################################################
# Helper: polite HTTP GET with exponential back-off
###############################################################################
def _safe_get(url: str, params: dict | None = None, headers: dict | None = None,
              tries: int = 5, pause: float = 0.5):
    for attempt in range(1, tries + 1):
        r = requests.get(url, params=params or {}, headers=headers or {})
        if r.status_code == 200:
            return r
        if r.status_code == 429:          # rate-limited
            sleep = pause * 2**attempt
        else:                             # transient error
            sleep = pause
        time.sleep(sleep)
    r.raise_for_status()


###############################################################################
# 1. Scopus block
###############################################################################
SCOPUS_KEY = os.getenv("SCOPUS_API_KEY")
if not SCOPUS_KEY:
    sys.exit("❌  SCOPUS_API_KEY not set in environment -- aborting.")

def scopus_search_author_id(query: str) -> str | None:
    """Return the first matching Scopus Author ID for a free-text name."""
    url = "https://api.elsevier.com/content/search/author"
    params = {"query": query, "apiKey": SCOPUS_KEY, "count": 1}
    r = _safe_get(url, params)
    results = r.json().get("search-results", {}).get("entry", [])
    if results:
        return results[0]["dc:identifier"].split(":")[1]     # strip 'AUTHOR_ID:'
    return None

def get_scopus_profile(author_id: str) -> dict:
    """Fetch full profile via pybliometrics AuthorRetrieval."""
    ar = AuthorRetrieval(author_id, refresh=True)
    profile = {
        "name": f"{ar.given_name} {ar.surname}",
        "scopus_id": author_id,
        "scopus_h": ar.h_index,
        "scopus_doc_count": ar.document_count,
        "current_affiliation": str(ar.current_affiliation) if ar.current_affiliation else None,
    }
    # Persist raw JSON for transparency
    Path(f"author_{author_id}.json").write_text(json.dumps(ar._json, indent=2))
    return profile


###############################################################################
# 2. DBLP block
###############################################################################
def dblp_pid_from_name(name: str) -> str | None:
    url = "https://dblp.org/search/author/api"
    r = _safe_get(url, params={"q": name, "format": "json"})
    hits = r.json()["result"]["hits"]["hit"]
    return hits[0]["info"]["authorid"] if hits else None

def dblp_pubs(pid: str) -> list[dict]:
    url = f"https://dblp.org/pid/{pid}.json"
    r = _safe_get(url)
    data = r.json()["result"]["hits"]["hit"]
    Path(f"dblp_{pid}.json").write_text(json.dumps(data, indent=2))
    return data


###############################################################################
# 3. Citation counts via Semantic Scholar
###############################################################################
S2_KEY = os.getenv("S2_API_KEY")
S2_HEADERS = {"x-api-key": S2_KEY} if S2_KEY else {}

def get_citation_count(doi: str) -> int:
    url = f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}"
    r = _safe_get(url, params={"fields": "citationCount"}, headers=S2_HEADERS)
    return r.json().get("citationCount", 0)


def compute_h_index(citations: list[int]) -> int:
    """Classic h-index calculation."""
    citations.sort(reverse=True)
    return sum(c >= i + 1 for i, c in enumerate(citations))


###############################################################################
# 4. Orchestration per author
###############################################################################
def process_author(token: str) -> dict:
    """
    token = Scopus ID (all digits) OR free text author name.
    Returns merged metrics dict.
    """
    if token.isdigit():
        scopus_id = token
    else:
        scopus_id = scopus_search_author_id(token)
        if not scopus_id:
            raise ValueError(f"Scopus author not found for query “{token}”")

    # ----- Scopus -----
    scopus_profile = get_scopus_profile(scopus_id)

    # ----- DBLP (use name from Scopus to improve precision) -----
    pid = dblp_pid_from_name(scopus_profile["name"])
    if not pid:
        dblp_h = dblp_docs = 0
    else:
        pubs = dblp_pubs(pid)
        dois = [p["info"].get("doi") for p in pubs if p["info"].get("doi")]
        citations = []
        for doi in tqdm(dois, desc=f"Citations for {scopus_profile['name']}"):
            citations.append(get_citation_count(doi))
            time.sleep(0.1)   # stay friendly to the API
        dblp_h = compute_h_index(citations)
        dblp_docs = len(pubs)

    merged = {
        "name": scopus_profile["name"],
        "scopus_id": scopus_id,
        "dblp_pid": pid,
        "scopus_h": scopus_profile["scopus_h"],
        "dblp_h": dblp_h,
        "scopus_docs": scopus_profile["scopus_doc_count"],
        "dblp_docs": dblp_docs,
        "affiliation": scopus_profile["current_affiliation"],
    }
    return merged


###############################################################################
# 5. Command-line interface
###############################################################################
def main():
    parser = argparse.ArgumentParser(
        description="Merge Scopus and DBLP author metrics."
    )
    parser.add_argument("authors", nargs="+",
                        help="Author names or Scopus IDs (digits).")
    args = parser.parse_args()

    rows = []
    for token in args.authors:
        try:
            rows.append(process_author(token))
        except Exception as exc:
            print(f"⚠️  Skipped “{token}”: {exc}")

    if not rows:
        sys.exit("No successful records → nothing to write.")

    df = pd.DataFrame(rows)
    df.to_csv("author_metrics.csv", index=False)
    print("\n✅  Saved merged metrics → author_metrics.csv")


if __name__ == "__main__":
    main()

TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'