In [None]:
cd ../..

In [None]:
import time
from typing import Optional
from urllib.error import HTTPError
from xml.etree import ElementTree as ET

import pandas as pd
from Bio import Entrez
from tqdm import tqdm

# Hook tqdm into pandas so that `progress_apply` (used in the retrieval cell) is available.
tqdm.pandas()

# Settings

In [None]:
# Input: CSV of publications, read with sep="|"; the notebook expects it to provide a `link` column.
SB_PUBLICATIONS_PMC_CLEAN = "data/SB_publication_PMC_clean.csv"
# Output: Parquet file that receives the frame after the `pmc` and `xml` columns are added.
OUTPUT_FILE = "data/SB_publication_PMC_with_xml.parquet"

# Functions

In [None]:
def retrieve_xml(pmc: str, retries: int = 2, delay: float = 10) -> Optional[bytes]:
    """
    Retrieve the full XML record of a PubMed Central (PMC) article.

    A request that fails with an HTTP error is retried up to ``retries`` more
    times, sleeping ``delay`` seconds before each new attempt.

    Args:
        pmc (str): The PMC ID of the article, without the "PMC" prefix.
        retries (int): Number of additional attempts made after a failed request.
        delay (float): Seconds to wait before retrying.
    Returns:
        Optional[bytes]: The raw XML content of the article as returned by the
            Entrez API, or None if every attempt failed with an HTTP error.
    """
    # Contact address sent along with Entrez requests; replace the placeholder before running.
    Entrez.email = "ENTER_YOUR_EMAIL_HERE"
    attempts = retries + 1
    for attempt in range(attempts):
        try:
            with Entrez.efetch(db="pmc", id=pmc) as handle:
                return handle.read()
        except HTTPError as e:
            if attempt < retries:
                print(f"Failed to fetch PMC{pmc}. Retrying in {delay} seconds")
                time.sleep(delay)
            else:
                print(f"Failed to fetch PMC{pmc} after {attempts} attempts: {e}")
    # Every attempt failed: signal it with None so a bulk apply can carry on with the other rows.
    return None

# Read data

In [None]:
# Load the publication list; the source file uses "|" as its field separator.
df = pd.read_csv(SB_PUBLICATIONS_PMC_CLEAN, sep="|")
df.head()

# Retrieve contents

In [None]:
# Pull the numeric PMC id out of each article link (e.g. ".../PMC1234567/" -> "1234567").
# Matching the "PMC<digits>" token works whether or not the link ends with a slash,
# unlike picking a fixed position in the "/"-split path.
df["pmc"] = df["link"].str.extract(r"PMC(\d+)", expand=False)
# Fail early with a clear message rather than sending a missing id to Entrez later.
assert df["pmc"].notna().all(), "Some links do not contain a PMC id"
df.head()

In [None]:
# Call retrieve_xml once per row's PMC id, showing a progress bar while it runs.
# Rows whose download ultimately fails get None in the `xml` column.
df["xml"] = df["pmc"].progress_apply(retrieve_xml)
df.head()

# Save file

In [None]:
# Persist the frame, including the added `pmc` and `xml` columns, as Parquet.
df.to_parquet(OUTPUT_FILE)