In [1]:
cd ../..

/Users/didac.fortuny/Repos/ddc/nasa_space_biology_knowledge_engine


In [2]:
import pandas as pd

from tqdm import tqdm
from xml.etree import ElementTree as ET

tqdm.pandas()

# Settings

In [3]:
# Input parquet file that is loaded into `df` in the "Read data" section
DATA_FILE = "data/SB_publication_PMC_with_xml.parquet"
# Destination path for the CSV written in the "Save file" section
OUTPUT_FILE = "data/SB_publication_PMC_data.csv"

# Read data

In [4]:
# Load the publications table; its "xml" column is what the extractor functions below parse
df = pd.read_parquet(DATA_FILE)
df.head()

Unnamed: 0,Title,Link,pmc,xml
0,Mice in Bion-M 1 space mission: training and s...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,4136787,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article..."
1,Microgravity induces pelvic bone loss through ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,3630201,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article..."
2,Stem Cell Health and Tissue Regeneration in Mi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,11988870,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article..."
3,Microgravity Reduces the Differentiation and R...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,7998608,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article..."
4,Microgravity validation of a novel system for ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,5587110,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article..."


# Retrieve contents

## Article type

In [5]:
def get_article_type(xml_bytes: bytes) -> str:
    """
    Extract the article type from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str: The article type with hyphens replaced by spaces.        
    """
    root = ET.fromstring(xml_bytes)
    article_elem = root.find(".//article")
    article_type = article_elem.get("article-type") if article_elem is not None else None
    return article_type.replace("-", " ")

In [6]:
# Parse each row's XML and store the result; progress_apply shows a tqdm progress bar
df["article_type"] = df["xml"].progress_apply(get_article_type)

100%|██████████| 607/607 [00:01<00:00, 490.79it/s]


In [7]:
df["article_type"].value_counts()

article_type
research article      512
review article         44
brief report           27
correction             12
article commentary      3
other                   3
editorial               2
letter                  1
reply                   1
methods article         1
discussion              1
Name: count, dtype: int64

## Language

In [8]:
def get_article_lang(xml_bytes: bytes) -> str | None:
    """
    Extract the article language from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The article language in lowercase, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    article_elem = root.find(".//article")
    lang = article_elem.get("{http://www.w3.org/XML/1998/namespace}lang") if article_elem is not None else None
    return lang.lower()

In [9]:
# Parse each row's XML and store the result; progress_apply shows a tqdm progress bar
df["language"] = df["xml"].progress_apply(get_article_lang)

100%|██████████| 607/607 [00:01<00:00, 501.58it/s]


In [10]:
df["language"].value_counts()

language
en    607
Name: count, dtype: int64

## Journal

In [11]:
def get_journal_title(xml_bytes: bytes) -> str | None:
    """
    Extract the journal title from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The journal title, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    elem = root.find(".//journal-title-group/journal-title")
    return elem.text.strip() if elem is not None and elem.text else None

In [12]:
# Parse each row's XML and store the result; progress_apply shows a tqdm progress bar
df["journal"] = df["xml"].progress_apply(get_journal_title)

100%|██████████| 607/607 [00:01<00:00, 499.84it/s]


In [13]:
df["journal"].value_counts()

journal
NPJ Microgravity                                                            47
International Journal of Molecular Sciences                                 30
Scientific Reports                                                          29
PLoS ONE                                                                    26
Frontiers in Plant Science                                                  25
                                                                            ..
Micromachines                                                                1
Journal of Otolaryngology - Head & Neck Surgery                              1
The spine journal : official journal of the North American Spine Society     1
Spine                                                                        1
Lab on a Chip                                                                1
Name: count, Length: 205, dtype: int64

## Publisher

In [14]:
def get_publisher_name(xml_bytes: bytes) -> str | None:
    """
    Extract the publisher name from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The publisher name, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    elem = root.find(".//publisher/publisher-name")
    return elem.text.strip() if elem is not None and elem.text else None

In [15]:
# Parse each row's XML and store the result; progress_apply shows a tqdm progress bar
df["publisher"] = df["xml"].progress_apply(get_publisher_name)

100%|██████████| 607/607 [00:01<00:00, 490.92it/s]


In [16]:
df["publisher"].value_counts()

publisher
Nature Publishing Group                                                       117
Multidisciplinary Digital Publishing Institute  (MDPI)                         80
Frontiers Media SA                                                             50
American Society for Microbiology (ASM)                                        42
Oxford University Press                                                        35
BMC                                                                            33
PLOS                                                                           32
Elsevier                                                                       28
Wiley                                                                          14
National Academy of Sciences                                                   13
Mary Ann Liebert, Inc.                                                          9
American Physiological Society                                                  8
The Fe

## Publication year

In [17]:
def get_publication_year(xml_bytes: bytes) -> str | None:
    """
    Extract the publication year from the XML content.
    Args:
        xml_bytes (bytes): The XML content as bytes.
    Returns:
        str | None: The publication year, or None if not found.
    """
    root = ET.fromstring(xml_bytes)
    elem = root.find(".//pub-date/year")
    return elem.text.strip() if elem is not None and elem.text else None

In [18]:
# Parse each row's XML and store the result; progress_apply shows a tqdm progress bar
df["publication_year"] = df["xml"].progress_apply(get_publication_year)

100%|██████████| 607/607 [00:01<00:00, 504.23it/s]


In [19]:
df["publication_year"].value_counts()

publication_year
2024    78
2021    57
2022    55
2020    53
2023    52
2018    44
2017    41
2019    38
2015    35
2016    32
2013    30
2025    29
2014    23
2011    15
2012    14
2010    11
Name: count, dtype: int64

# Save file

In [20]:
df.head()

Unnamed: 0,Title,Link,pmc,xml,article_type,language,journal,publisher,publication_year
0,Mice in Bion-M 1 space mission: training and s...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,4136787,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article...",research article,en,PLoS ONE,PLOS,2014
1,Microgravity induces pelvic bone loss through ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,3630201,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article...",research article,en,PLoS ONE,PLOS,2013
2,Stem Cell Health and Tissue Regeneration in Mi...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,11988870,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article...",review article,en,International Journal of Molecular Sciences,Multidisciplinary Digital Publishing Institute...,2025
3,Microgravity Reduces the Differentiation and R...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,7998608,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article...",research article,en,Cells,Multidisciplinary Digital Publishing Institute...,2021
4,Microgravity validation of a novel system for ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,5587110,"b'<?xml version=""1.0"" ?><!DOCTYPE pmc-article...",research article,en,PLoS ONE,PLOS,2017


In [21]:
# Remove the raw XML column so it is not written to the output file.
# NOTE(review): not idempotent — re-running this cell after "xml" is gone raises KeyError.
df = df.drop(columns="xml")

In [22]:
# Write a pipe-delimited, UTF-8 encoded CSV without the DataFrame index
df.to_csv(OUTPUT_FILE, index=False, encoding="utf-8", sep="|")