In [None]:
!pip3 install scholarly

In [None]:
import scholarly

In [None]:
from scholarly import scholarly
import re
import requests
import unicodedata
from datetime import datetime
import time

def sanitize_key(text):
    """Reduce *text* to ASCII word characters for use inside a citation key.

    The text is NFKD-normalised (so accented letters decompose into a base
    letter plus combining marks), every character that cannot be encoded as
    ASCII is dropped, and all remaining runs of non-word characters are
    removed. Underscores count as word characters and are kept.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode()
    return re.sub(r'\W+', '', ascii_only)

def extract_first_word(title):
    """Return the first usable word of *title*, sanitised for a citation key.

    Words are scanned in order and the first one that is still non-empty
    after ``sanitize_key`` is returned. Checking the sanitised form matters
    because a word made only of characters with no ASCII equivalent
    sanitises to an empty string, which would otherwise leave the key
    without its title component.

    Returns:
        The sanitised word, or ``"Untitled"`` when no word in *title*
        survives sanitising (including when *title* has no words at all).
    """
    for word in re.findall(r'\b\w+\b', title):
        cleaned = sanitize_key(word)
        if cleaned:
            return cleaned
    return "Untitled"

def guess_entry_type(pub):
    """Pick a BibTeX entry type from the keys present in ``pub['bib']``.

    Returns ``'article'`` when a ``'journal'`` key exists, ``'inproceedings'``
    when there is no journal but a ``'title'`` key exists, and ``'misc'``
    otherwise (including when *pub* has no ``'bib'`` mapping). Only key
    presence is tested, not the stored value.
    """
    fields = pub.get('bib', {})
    # Order matters: a journal key wins over the title-only fallback.
    for marker, entry_type in (('journal', 'article'), ('title', 'inproceedings')):
        if marker in fields:
            return entry_type
    return 'misc'

def fetch_booktitle_via_crossref(title, year, retries=7, delay=3):
    """Look up the container title (venue) of a work on CrossRef.

    Sends a ``query.title`` search to ``https://api.crossref.org/works`` and
    returns the first ``container-title`` of the top hit. This is a
    best-effort enrichment: every failure path yields ``""`` rather than
    raising.

    Args:
        title: Title of the work. A blank or missing title short-circuits to
            ``""`` without any network call.
        year: Publication year. It is used as a date filter only when it is
            exactly four digits; placeholders such as ``'????'`` are left out
            of the request instead of producing an invalid filter.
        retries: Maximum number of attempts.
        delay: Base delay in seconds; attempt ``k`` (0-based) is followed by
            a sleep of ``delay * 2**k`` before the next attempt.

    Returns:
        The container title, or ``""`` when nothing was found or every
        attempt failed.
    """
    if not title or not str(title).strip():
        return ""

    url = "https://api.crossref.org/works"
    params = {"query.title": title, "rows": 1}
    year_text = str(year).strip()
    if re.fullmatch(r"\d{4}", year_text):
        params["filter"] = f"from-pub-date:{year_text},until-pub-date:{year_text}"

    for attempt in range(retries):
        try:
            resp = requests.get(url, params=params, timeout=10)
            resp.raise_for_status()
            data = resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older requests versions.
            print(f"⚠️ CrossRef error (attempt {attempt+1}/{retries}) for '{title}': {e}")
            # Response objects are falsy for 4xx/5xx, so read the status code
            # directly instead of testing the response for truthiness.
            response = getattr(e, "response", None)
            status = getattr(response, "status_code", None)
            if status is not None and 400 <= status < 500 and status != 429:
                # A client error will not succeed on retry; 429 (rate
                # limiting) is the exception and is worth backing off for.
                break
            if attempt < retries - 1:
                time.sleep(delay * (2 ** attempt))  # exponential backoff
            continue

        # The request succeeded; parse defensively so an unexpected payload
        # shape degrades to "" instead of raising.
        message = data.get("message") if isinstance(data, dict) else None
        items = message.get("items") if isinstance(message, dict) else None
        if not isinstance(items, list) or not items or not isinstance(items[0], dict):
            return ""
        container = items[0].get("container-title") or []
        return container[0] if container else ""
    return ""

def format_bibtex(pub, entry_type):
    """Render one publication as a BibTeX entry string.

    Args:
        pub: Publication mapping. Fields are read from ``pub['bib']``:
            ``title``, ``author``, ``pub_year``, ``pub_month``, ``journal``
            and ``booktitle``. Missing or empty title/author/year fall back
            to ``'Unknown Title'``, ``'Unknown Author'`` and ``'????'``.
        entry_type: BibTeX entry type. ``'inproceedings'`` entries get a
            ``booktitle`` field and ``'article'`` entries get a ``journal``
            field, when a value is available.

    Returns:
        The entry text, ending with the closing brace and a newline. The
        citation key is ``<first author's last name><year><first title word>``.
    """
    bib = pub.get('bib', {})
    title = bib.get('title') or 'Unknown Title'
    authors = bib.get('author') or 'Unknown Author'
    if not isinstance(authors, str):
        # Tolerate a sequence of names by joining it the BibTeX way.
        authors = ' and '.join(str(name) for name in authors)
    year = bib.get('pub_year') or '????'
    # No default here: inventing a month (e.g. the month the script runs in)
    # would put fabricated metadata into the bibliography.
    month = bib.get('pub_month')
    journal = bib.get('journal', '')

    # Only inproceedings entries use a booktitle, so the (slow, retrying)
    # CrossRef lookup is skipped for every other entry type.
    booktitle = ''
    if entry_type == 'inproceedings':
        booktitle = bib.get('booktitle', '') or fetch_booktitle_via_crossref(title, year)

    # Last whitespace-separated token of the first author, sanitised like the
    # title word so the key has no accents or punctuation.
    name_parts = authors.split(' and ')[0].split()
    first_author = sanitize_key(name_parts[-1]) if name_parts else ''
    if not first_author:
        first_author = 'Unknown'
    first_word = extract_first_word(title)
    key = f"{first_author}{year}{first_word}"

    lines = [
        f"@{entry_type}{{{key},",
        f"  author = {{{authors}}},",
        f"  title = {{{title}}},",
    ]

    if booktitle:
        lines.append(f"  booktitle = {{{booktitle}}},")
    if entry_type == 'article' and journal:
        lines.append(f"  journal = {{{journal}}},")
    if month not in (None, ''):
        lines.append(f"  month = {{{month}}},")

    lines.extend([
        f"  year = {{{year}}},",
        "  bibtex_show = {true}",
        "}\n",
    ])
    return "\n".join(lines)

def fetch_and_save(user_id, output_file='scholar.bib', max_pubs=None):
    """Fetch an author's publications via ``scholarly`` and save them as BibTeX.

    The author is looked up with ``scholarly.search_author_id``, the
    publication list is filled in, and each publication is filled,
    classified with ``guess_entry_type`` and rendered with
    ``format_bibtex``. Entries are written grouped by type (article,
    inproceedings, misc), each non-empty group under a ``% === TYPE ===``
    header.

    Args:
        user_id: Author identifier passed to ``scholarly.search_author_id``.
        output_file: Path of the BibTeX file to write (UTF-8, overwritten).
        max_pubs: Maximum number of publications to process, or ``None``
            for no limit. ``0`` processes nothing.
    """
    author = scholarly.search_author_id(user_id)
    author = scholarly.fill(author, sections=['publications'])
    entries = {'article': [], 'inproceedings': [], 'misc': []}

    for i, pub in enumerate(author['publications']):
        # Compare with None explicitly: a truthiness test would make
        # max_pubs=0 behave like "no limit".
        if max_pubs is not None and i >= max_pubs:
            break
        filled = scholarly.fill(pub)
        et = guess_entry_type(filled)
        bibtex = format_bibtex(filled, et)
        entries[et].append(bibtex)
        print(f"✔ {et}: {filled['bib'].get('title', '')}")

    with open(output_file, 'w', encoding='utf-8') as f:
        for et in ['article', 'inproceedings', 'misc']:
            if entries[et]:
                f.write(f"% === {et.upper()} ===\n\n")
                f.writelines(entries[et])

    print(f"\n✅ Saved {sum(len(v) for v in entries.values())} entries to {output_file}")

# Driver: export at most 50 publications for the author ID below to the
# default output file.
fetch_and_save(user_id='1g1i1B4AAAAJ', max_pubs=50)

⚠️ CrossRef error (attempt 1/7) for 'On the selection of optimum Savitzky-Golay filters': HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
✔ article: On the selection of optimum Savitzky-Golay filters
✔ article: Suppression of the zero-order term in off-axis digital holography through nonlinear filtering
⚠️ CrossRef error (attempt 1/7) for 'Complex ambiguity-free Fourier domain optical coherence tomography through transverse scanning': HTTPSConnectionPool(host='api.crossref.org', port=443): Read timed out. (read timeout=10)
✔ article: Complex ambiguity-free Fourier domain optical coherence tomography through transverse scanning
✔ article: Vocalizations of wild Asian elephants (Elephas maximus): structural classification and social context
✔ article: Snakes with an ellipse-reproducing property
✔ inproceedings: An iterative algorithm for phase retrieval with sparsity constraints: application to frequency domain optical coherence tomography
✔ inpro