Extract text and selected metadata from [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) articles.

In [1]:
import os

# Directory holding the PubMed Central export. Set the PUBMED_DATA_DIR
# environment variable to point somewhere else without editing the notebook;
# the fallback is the original hard-coded location.
DATA_DIR = os.environ.get("PUBMED_DATA_DIR") or "D:/ml_data/pubmed/"
# Other cells build paths with plain string concatenation (DATA_DIR + file name),
# so the directory must always end with a path separator.
if not DATA_DIR.endswith(("/", "\\")):
    DATA_DIR += "/"

# Name of the XML export to process, relative to DATA_DIR.
DATA_FILE = "pmc_result_vaccine_allergy_1year.xml"  # 758MB

In [2]:
import xml.etree.ElementTree as ET
# Parse the whole export into memory, then collect the <article> elements
# that sit directly under the document root.
xml_path = DATA_DIR + DATA_FILE
tree = ET.parse(xml_path)
root = tree.getroot()
articles = root.findall('article')

In [3]:
import pandas as pd


def print_tag_hierarchy(e, depth=0):
    """
    Print the tag of element ``e`` and, recursively, of all its descendants.

    Each tag goes on its own line, preceded by ``depth`` dots and a space, so
    the dots show how deeply the element is nested.

    e: an ElementTree element.
    depth: nesting level of ``e``; callers normally leave this at 0.
    """
    prefix = '.' * depth
    # print() with two arguments puts a single space between prefix and tag.
    print(prefix, e.tag)
    for sub_element in e:
        print_tag_hierarchy(sub_element, depth=depth + 1)


def get_section_text(sec, header_path=None):
    """
    Collect the paragraphs of a (possibly nested) <sec> element.

    sec: the section element to read.
    header_path: titles of the enclosing sections, outermost first. May be
        None (no enclosing sections) or any iterable of strings; it is never
        modified.

    Returns a list of (section_path, text) tuples, one per <p> child, in
    document order. section_path is the enclosing titles plus this section's
    own <title> (if any), joined with ' || '. Nested <sec> children are
    processed recursively; children with any other tag are ignored, i.e. all
    narrative text is assumed to live in <p> elements directly under a <sec>.
    """
    # Always work on a private list so the caller's sequence is left untouched
    # (and so tuples or other iterables are accepted, not only lists).
    my_header_path = list(header_path) if header_path is not None else []

    title_element = sec.find('title')
    if title_element is not None:
        # itertext() flattens inline markup (italics, sub/superscripts, ...).
        my_header_path.append(''.join(title_element.itertext()))
    my_title = ' || '.join(my_header_path)  # flattened path for (sub)section header

    results = []  # list of (title, text) tuples
    for child in sec:
        if child.tag == 'p':
            results.append((my_title, ''.join(child.itertext())))
        elif child.tag == 'sec':  # subsection
            results.extend(get_section_text(child, my_header_path))

    return results


def get_title(art):
    """
    Return the article title as plain text.

    art: an <article> element.
    Looks up the first front/article-meta/title-group/article-title element
    and returns its text with inline markup flattened; returns 'MISSING'
    when the article has no such element.
    """
    title_element = art.find("front/article-meta/title-group/article-title")
    if title_element is None:
        return 'MISSING'
    return ''.join(title_element.itertext())


def get_abstract(art):
    """
    Collect the paragraphs of an article's abstract.

    art: an <article> element.
    Returns a list of (section_path, text) tuples in document order. Only the
    first front/article-meta/abstract element is read. Paragraphs directly
    under it are labelled 'Abstract'; paragraphs inside <sec> children
    (structured abstracts) get the path 'Abstract || <section title>' via
    get_section_text. Children with any other tag are ignored.
    An article without an abstract yields an empty list (no 'MISSING' row).
    """
    results = []
    abstract_element = art.find("front/article-meta/abstract")
    if abstract_element is None:
        return results

    for child in abstract_element:
        if child.tag == 'p':
            results.append(('Abstract', ''.join(child.itertext())))
        elif child.tag == 'sec':
            results.extend(get_section_text(child, ['Abstract']))
    return results


def get_article_id(art):
    """
    Return the PubMed ID (PMID) of an article.

    art: an <article> element.
    Scans the front/article-meta/article-id elements and returns the text of
    the first one whose pub-id-type attribute is 'pmid'. Other identifier
    types (e.g. 'pmc', 'doi') are ignored. Returns 'MISSING' when the article
    has no PMID element.
    """
    for aid in art.findall('front/article-meta/article-id'):
        # .get() rather than attrib[...]: an <article-id> that lacks the
        # pub-id-type attribute is skipped instead of raising KeyError.
        if aid.get('pub-id-type') == 'pmid':
            return aid.text
    return 'MISSING'
    

def get_article_text(art):
    """
    Flatten one PubMed Central article into (section_path, text) rows.

    art: an <article> element from a PubMed Central result set.
    Returns a list of tuples in this order:
      1. a single ('Title', <article title>) row,
      2. the abstract rows produced by get_abstract,
      3. one row per paragraph of every body/sec section (including nested
         subsections), as produced by get_section_text.
    Each tuple holds a flattened section heading path and the paragraph text.
    Only <sec> elements directly under <body> are visited.
    """
    rows = [('Title', get_title(art))]
    rows.extend(get_abstract(art))
    for body_section in art.findall('body/sec'):
        rows.extend(get_section_text(body_section))
    return rows


def get_all_articles(articles):
    """
    Build one paragraph-level table for a collection of articles.

    articles: an iterable of <article> elements.
    Returns a pandas DataFrame with the columns 'pmid', 'paragraph_number',
    'section_path' and 'text'. There is one row per tuple returned by
    get_article_text; paragraph_number restarts at 0 for every article, so
    row 0 of each article is its title.
    """
    column_names = ['pmid', 'paragraph_number', 'section_path', 'text']
    records = []
    for article in articles:
        pmid = get_article_id(article)
        article_rows = get_article_text(article)
        for position, row in enumerate(article_rows):
            records.append((pmid, position, row[0], row[1]))
    return pd.DataFrame(records, columns=column_names)

In [4]:
# Print the element-tag tree of the first article to see how the XML is laid out.

first_article = articles[0]
print_tag_hierarchy(first_article)

 article
. processing-meta
.. restricted-by
. front
.. journal-meta
... journal-id
... journal-id
... journal-id
... journal-id
... journal-title-group
.... journal-title
... issn
... issn
... publisher
.... publisher-name
.... publisher-loc
.. article-meta
... article-id
... article-id
... article-id
... article-id
... article-id
... article-categories
.... subj-group
..... subject
.... subj-group
..... subject
... title-group
.... article-title
..... styled-content
.... alt-title
... contrib-group
.... contrib
..... name
...... surname
...... given-names
..... contrib-id
..... xref
...... sup
..... address
...... email
.... contrib
..... name
...... surname
...... given-names
..... xref
...... sup
..... xref
...... sup
.... contrib
..... name
...... surname
...... given-names
..... contrib-id
..... xref
...... sup
.... contrib
..... name
...... surname
...... given-names
..... contrib-id
..... xref
...... sup
.... contrib
..... name
...... surname
...... given-names
..... xref
......

In [8]:
# Preview the paragraph table for three articles.
# (articles[3] has subsections in its abstract.)

preview_articles = articles[3:6]
get_all_articles(preview_articles)

Unnamed: 0,pmid,paragraph_number,section_path,text
0,36785543,0,Title,Vaccine confidence among those living with all...
1,36785543,1,Abstract || Background,Reports of allergic reactions to coronavirus d...
2,36785543,2,Abstract || Objectives,Our aim was to review the academic and gray li...
3,36785543,3,Abstract || Methods,"We searched 4 databases (CINAHL, PsycINFO, MED..."
4,36785543,4,Abstract || Results,Of the 1385 unique records retrieved from our ...
...,...,...,...,...
110,36644019,46,Contributorship statement,"MG supervised evaluation project, revised data..."
111,36644019,47,Declaration of Competing interest,All authors declare that we have no conflicts ...
112,36644019,48,Consent,All authors consent to publication of this man...
113,36644019,49,Funding,There was no external funding for this service...


In [27]:
import os

# Name the CSV after the XML export. splitext() swaps only the file extension;
# str.replace('.xml', '.csv') would rewrite every '.xml' inside the name and,
# for a name without '.xml', return it unchanged -- so to_csv would then
# overwrite the source file.
csv_file = os.path.splitext(DATA_FILE)[0] + '.csv'

# One row per paragraph for every article in the export.
big_article_table = get_all_articles(articles)

big_article_table.to_csv(DATA_DIR + csv_file, index=False)

In [28]:
import random

# Draw a small sample of articles (up to 3) to use as example data.
# A dedicated, seeded generator keeps the sample the same on every run
# without touching the global random state.
SAMPLE_SEED = 42
unique_pmids = sorted(set(big_article_table['pmid']))
sample_size = min(3, len(unique_pmids))  # random.sample raises if asked for more than exist
sample_PMIDs = random.Random(SAMPLE_SEED).sample(unique_pmids, sample_size)

# Keep every paragraph row that belongs to one of the sampled articles.
# isin() builds the boolean mask directly instead of turning each row into a dict.
example_data = big_article_table[big_article_table['pmid'].isin(sample_PMIDs)]

example_data

Unnamed: 0,pmid,paragraph_number,section_path,text
13182,37327781,0,Title,Multi-omic longitudinal study reveals immune c...
13183,37327781,1,Abstract,"The IMPACC cohort, composed of >1,000 hospital..."
13184,37327781,2,Introduction,"Throughout the COVID-19 pandemic, scientists w..."
13185,37327781,3,Introduction,The clinical manifestations of COVID-19 are di...
13186,37327781,4,Introduction,"Here, we carried out deep immunophenotyping of..."
...,...,...,...,...
175683,37783679,43,Methods || ELISA and Western Blot,Concentration of IL-8 in the cell supernatant ...
175684,37783679,44,Methods || NAD+/NADH measurement,"Total extracellular, intracellular and intraba..."
175685,37783679,45,Methods || Reporting summary,Further information on research design is avai...
175686,37783679,46,Supplementary information,\nSupplementary Information\nPeer Review File\...


In [29]:
# Write the sampled rows to a small CSV in the current working directory
# (row index omitted).
example_csv = "example_data.csv"
example_data.to_csv(example_csv, index=False)