In [None]:
# Step up to the parent directory; the relative "data/..." paths used below
# presumably resolve from there. NOTE: re-running this cell moves up again.
cd ..

In [None]:
import unicodedata
from xml.etree import ElementTree as ET

import pandas as pd
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` (used below) is available.
tqdm.pandas()

# Settings

In [None]:
# Input parquet table; read below into `df`, which is expected to carry an "xml" column.
DATA_FILE = "data/SB_publication_PMC_with_xml.parquet"
# Destination CSV path; not referenced in the cells visible here — presumably used further down.
OUTPUT_FILE = "data/SB_publication_PMC_data.csv"

# Read data

In [None]:
# Load the publication table; later cells read its "xml", "Title" and "pmc" columns.
df = pd.read_parquet(DATA_FILE)
# Preview the first rows as the cell output.
df.head()

# Retrieve contents

## Article title

In [None]:
from xml.etree import ElementTree as ET

def get_article_title(xml_bytes: bytes) -> str | None:
    """
    Extract the article title from the XML content, ignoring formatting tags
    (e.g., <italic>, <bold>, <underline>), but keeping their inner text.

    Args:
        xml_bytes (bytes): The XML content as bytes.

    Returns:
        str | None: The cleaned article title, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    title_elem = root.find(".//article-title")

    if title_elem is None:
        return None

    # Join all inner text (drops tags but keeps their text content)
    title_text = "".join(title_elem.itertext()).strip()

    # Normalize whitespace
    return " ".join(title_text.split())

In [None]:
# Parse every row's XML and store the extracted title (tqdm shows a progress bar).
df["title_extracted"] = df["xml"].progress_apply(get_article_title)

In [None]:
import re

# Greek letter mapping: both cases spell out to the same lowercase Latin name.
greek_map = {
    'α': 'alpha', 'β': 'beta', 'γ': 'gamma', 'δ': 'delta', 'ε': 'epsilon',
    'ζ': 'zeta', 'η': 'eta', 'θ': 'theta', 'ι': 'iota', 'κ': 'kappa',
    'λ': 'lambda', 'μ': 'mu', 'ν': 'nu', 'ξ': 'xi', 'ο': 'omicron',
    'π': 'pi', 'ρ': 'rho', 'σ': 'sigma', 'τ': 'tau', 'υ': 'upsilon',
    'φ': 'phi', 'χ': 'chi', 'ψ': 'psi', 'ω': 'omega',
    'Α': 'alpha', 'Β': 'beta', 'Γ': 'gamma', 'Δ': 'delta', 'Ε': 'epsilon',
    'Ζ': 'zeta', 'Η': 'eta', 'Θ': 'theta', 'Ι': 'iota', 'Κ': 'kappa',
    'Λ': 'lambda', 'Μ': 'mu', 'Ν': 'nu', 'Ξ': 'xi', 'Ο': 'omicron',
    'Π': 'pi', 'Ρ': 'rho', 'Σ': 'sigma', 'Τ': 'tau', 'Υ': 'upsilon',
    'Φ': 'phi', 'Χ': 'chi', 'Ψ': 'psi', 'Ω': 'omega',
    'ς': 'sigma',  # word-final form of sigma
}

# Translation table built once from greek_map so each title is rewritten in
# a single pass instead of one str.replace call per letter.
_GREEK_TABLE = str.maketrans(greek_map)


def clean_title(s: str) -> str:
    """
    Reduce a title to a lowercase, letters-only key for fuzzy comparison.

    Steps, in order:
      1. Unicode NFKD normalization, so compatibility characters fold to
         their base form (micro sign -> Greek mu, ligatures -> letters) and
         accented letters split into base letter + combining mark.
      2. Greek letters are spelled out using ``greek_map``.
      3. Anything inside [square brackets] is removed.
      4. The phrase "Author Correction:" is removed (case-insensitive).
      5. The text is lowercased and every character other than a-z is
         dropped (this also discards the combining marks from step 1).

    Args:
        s (str): The title to normalize. Non-string values (e.g. None or
            NaN from missing cells) are accepted.

    Returns:
        str: The normalized key, or '' when ``s`` is not a string.
    """
    if not isinstance(s, str):
        return ''

    # Without this, "µm" (U+00B5) and "μm" (U+03BC) would compare as different
    # and "café" would lose its final letter instead of becoming "cafe".
    s = unicodedata.normalize('NFKD', s)

    # Replace Greek letters
    s = s.translate(_GREEK_TABLE)

    # Remove anything inside [brackets]
    s = re.sub(r'\[.*?\]', '', s)

    # Remove unwanted phrases
    s = re.sub(r'Author Correction:', '', s, flags=re.IGNORECASE)

    # Lowercase and keep only letters
    return re.sub(r'[^a-z]', '', s.lower())

# Compare cleaned titles: True where the listed title and the XML-extracted
# title still differ after normalization. The two columns are walked in
# lockstep rather than via DataFrame.apply(axis=1), which constructs a Series
# for every row; the explicit index/dtype keep the result a boolean mask
# aligned with df.
df["incorrect_title"] = pd.Series(
    [
        clean_title(listed) != clean_title(extracted)
        for listed, extracted in zip(df["Title"], df["title_extracted"])
    ],
    index=df.index,
    dtype=bool,
)

In [None]:
# Number of rows flagged as mismatching (True counts as 1).
df["incorrect_title"].sum()

In [None]:
# Keep only the flagged rows, with the id and both title variants side by side.
title_columns = ["pmc", "Title", "title_extracted"]
incorrect = df.loc[df["incorrect_title"], title_columns]
incorrect

In [None]:
# Print each mismatch as id / listed title / extracted title, then a blank line.
for pmc_id, listed, extracted in zip(
    incorrect["pmc"], incorrect["Title"], incorrect["title_extracted"]
):
    print(pmc_id, listed, extracted, "", sep="\n")

In [None]:
# Export the mismatching title pairs (index included) for manual review.
mismatch_mask = df["incorrect_title"]
df.loc[mismatch_mask, ["Title", "title_extracted"]].to_csv("titles.csv")