In [None]:
cd ../..

In [None]:
import pandas as pd

from tqdm import tqdm
from xml.etree import ElementTree as ET

# Register tqdm with pandas; this is what provides the `progress_apply`
# method used on the "xml" column in the cells below.
tqdm.pandas()

# Settings

In [None]:
# Input parquet file; the notebook reads an "xml" column from it.
DATA_FILE = "data/SB_publication_PMC_with_xml.parquet"
# Destination of the exported table (written with to_csv at the end).
OUTPUT_FILE = "data/SB_publication_PMC_data.csv"

# Read data

In [None]:
# Load the input table into a DataFrame and preview its first rows.
df = pd.read_parquet(DATA_FILE)
df.head()

# Retrieve contents

## Article type

In [None]:
def get_article_type(xml_bytes: bytes) -> str:
    """
    Extract the article type from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str: The article type with hyphens replaced by spaces.        
    """
    root = ET.fromstring(xml_bytes)
    article_elem = root.find(".//article")
    article_type = article_elem.get("article-type") if article_elem is not None else None
    return article_type.replace("-", " ")

In [None]:
# Parse every row's XML and store the extracted article type in a new column
# (progress_apply behaves like apply but shows a tqdm progress bar).
df["article_type"] = df["xml"].progress_apply(get_article_type)

In [None]:
# Frequency of each extracted article type (missing values are not counted
# by value_counts unless dropna=False is passed).
df["article_type"].value_counts()

## Language

In [None]:
def get_article_lang(xml_bytes: bytes) -> str | None:
    """
    Extract the article language from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The article language in lowercase, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    article_elem = root.find(".//article")
    lang = article_elem.get("{http://www.w3.org/XML/1998/namespace}lang") if article_elem is not None else None
    return lang.lower()

In [None]:
# Parse every row's XML and store the extracted language code in a new column.
df["language"] = df["xml"].progress_apply(get_article_lang)

In [None]:
# Frequency of each extracted language code.
df["language"].value_counts()

## Journal

In [None]:
def get_journal_title(xml_bytes: bytes) -> str | None:
    """
    Extract the journal title from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The journal title, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    elem = root.find(".//journal-title-group/journal-title")
    return elem.text.strip() if elem is not None and elem.text else None

In [None]:
# Parse every row's XML and store the extracted journal title in a new column.
df["journal"] = df["xml"].progress_apply(get_journal_title)

In [None]:
# Number of rows per journal title (rows where no title was found are
# excluded by value_counts' default dropna=True).
df["journal"].value_counts()

## Publisher

In [None]:
def get_publisher_name(xml_bytes: bytes) -> str | None:
    """
    Extract the publisher name from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The publisher name, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    elem = root.find(".//publisher/publisher-name")
    return elem.text.strip() if elem is not None and elem.text else None

In [None]:
# Parse every row's XML and store the extracted publisher name in a new column.
df["publisher"] = df["xml"].progress_apply(get_publisher_name)

In [None]:
# Number of rows per publisher name (rows without a publisher are not counted).
df["publisher"].value_counts()

## Publication year

In [None]:
def get_publication_year(xml_bytes: bytes) -> str | None:
    """
    Extract the publication year from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The publication year, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    elem = root.find(".//pub-date/year")
    return elem.text.strip() if elem is not None and elem.text else None

In [None]:
# Parse every row's XML and store the extracted year (as a string) in a new column.
df["publication_year"] = df["xml"].progress_apply(get_publication_year)

In [None]:
# Number of rows per publication year, most frequent first (value_counts'
# default ordering); rows without a year are not counted.
df["publication_year"].value_counts()

# Save file

In [None]:
# Preview the first rows, including the newly added columns, before saving.
df.head()

In [None]:
# Remove the raw XML column so it is not written to the export.
# errors="ignore" keeps this cell safe to re-run: without it, a second run
# raises KeyError because "xml" has already been dropped from `df`.
df = df.drop(columns="xml", errors="ignore")

In [None]:
# Write the table as UTF-8, pipe-delimited text without the index column.
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8", sep="|")