In [2]:
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from typing import Dict, Any, List
import pandas as pd
import subprocess

In [3]:
# Base URL of the PubMed baseline directory; file names are appended to it.
YEARLY_BASELINE = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/"

# Local output directories (TEXT lives inside DATA).
DATA = Path("./data")
TEXT = DATA / "text"

# Subject phrases, stored lower-cased so they can be compared against
# lower-cased article text.
SUBJECT_KEYWORDS = [
    phrase.lower()
    for phrase in (
        'Esophageal Cancer', 'Esophageal Neoplasm', 'Esophageal Carcinoma', 'Esophagus Cancer',
        'Esophagus Tumor', 'Esophagus Carcinoma', 'Esophageal Squamous Cell Carcinoma',
        'Esophageal Adenocarcinoma', 'Esophageal Malignancy', 'Esophageal Oncology',
        'Esophageal Tumor', 'Esophagogastric Junction Cancer', 'Esophageal Staging',
        'Esophageal Metastasis', 'Esophageal Chemotherapy', 'Esophageal Radiotherapy',
        'Esophageal Surgery', 'Esophageal Resection', 'Esophageal Dysplasia', "Barrett's Esophagus", " ESCA "
    )
]

In [4]:
# Fetch the baseline directory listing and parse its HTML.
res = requests.get(YEARLY_BASELINE, timeout=60)
# Stop on an HTTP error instead of parsing an error page into an empty link list.
res.raise_for_status()
gzs = BeautifulSoup(res.text, "html.parser")

In [5]:
# Make sure the text output directory (and any missing parent) exists.
TEXT.mkdir(parents=True, exist_ok=True)

In [6]:
# Find the download links: the href of every anchor that points at a .gz file.
# href=True skips anchors without an href attribute, which would otherwise
# raise KeyError on a["href"].
links = [a["href"] for a in gzs.find_all("a", href=True) if a["href"].endswith(".gz")]

In [7]:
len(links)

1166

In [6]:
links[:5]

['pubmed23n0001.xml.gz',
 'pubmed23n0002.xml.gz',
 'pubmed23n0003.xml.gz',
 'pubmed23n0004.xml.gz',
 'pubmed23n0005.xml.gz']

In [7]:
def download_and_extract(link):
    """
    Download one gzipped file from YEARLY_BASELINE into DATA and unpack it.

    Args:
        link: File name (or relative path) of the archive; it is appended to
            YEARLY_BASELINE to build the download URL.

    Returns:
        Path of the unpacked file inside DATA (the archive's file name with
        ".gz" removed).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the downloaded file cannot be written or decompressed.
    """
    # Local imports: only this function needs them.
    import gzip
    import shutil

    archive_name = Path(link).name
    gz_path = DATA / archive_name
    out_path = DATA / archive_name.replace(".gz", "")

    DATA.mkdir(parents=True, exist_ok=True)

    # Stream the body to disk in chunks so the archive is never held fully in
    # memory, and fail on HTTP errors rather than saving an error page.
    with requests.get(YEARLY_BASELINE + link, stream=True, timeout=60) as res:
        res.raise_for_status()
        with open(gz_path, "wb") as f:
            for chunk in res.iter_content(chunk_size=1 << 20):
                f.write(chunk)

    if out_path == gz_path:
        # The name contains no ".gz", so there is nothing to unpack.
        return out_path

    # Decompress with the standard library instead of an external `gunzip`
    # binary: errors surface as exceptions and re-runs overwrite cleanly.
    with gzip.open(gz_path, "rb") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)
    # Like gunzip, keep only the unpacked file.
    gz_path.unlink()
    return out_path


In [8]:
# Download and unpack only the last archive in the listing.
latest_link = links[-1]
local_data = download_and_extract(latest_link)

In [17]:
def parse_xml(xml_file, min_year: int = 2010) -> List[Dict[str, Any]]:
    """
    Parse a PubMed XML file and return one dict per usable article.

    An article is kept only if it has a PMC id, a PMID, a numeric publication
    year of at least ``min_year``, a title and an abstract.

    Args:
        xml_file: Path of the XML file to read.
        min_year: Earliest publication year to keep (default 2010).

    Returns:
        A list of dicts with the keys "pmc", "pmid", "year", "title",
        "abstract" and "keywords" (a list of strings, possibly empty).
    """
    # Binary mode lets the XML parser decode the bytes itself rather than
    # depending on the platform's default text encoding.
    with open(xml_file, "rb") as f:
        soup = BeautifulSoup(f, "xml")

    rows = []
    for article in soup.find_all("PubmedArticle"):
        pmc_dom = article.find("ArticleId", {"IdType": "pmc"})
        if pmc_dom is None:
            continue

        pmid_dom = article.find("PMID")
        if pmid_dom is None:
            continue

        # PubDate itself may be missing, so guard before looking for Year.
        pub_date_dom = article.find("PubDate")
        year_dom = pub_date_dom.find("Year") if pub_date_dom is not None else None
        if year_dom is None:
            continue
        year_text = year_dom.text.strip()
        # Compare as integers; a non-numeric year cannot be checked, so skip it.
        if not year_text.isdigit() or int(year_text) < min_year:
            continue

        article_title_dom = article.find("ArticleTitle")
        if article_title_dom is None:
            continue

        first_abstract_dom = article.find("AbstractText")
        if first_abstract_dom is None:
            continue
        # An abstract may be split into several sibling AbstractText sections
        # (e.g. background / methods / results); keep all of them, not just
        # the first. A single-section abstract yields the same text as before.
        abstract_sections = [first_abstract_dom]
        abstract_sections.extend(first_abstract_dom.find_next_siblings("AbstractText"))

        # find_all returns an empty list when there are no keywords.
        keyword_doms = article.find_all("Keyword")

        rows.append({
            "pmc": pmc_dom.text,
            "pmid": pmid_dom.text,
            "year": year_dom.text,
            "title": article_title_dom.text,
            "abstract": " ".join(section.text for section in abstract_sections),
            "keywords": [keyword_dom.text for keyword_dom in keyword_doms],
        })

    return rows



In [18]:
# Parse the downloaded XML into records, then load them into a DataFrame.
article_rows = parse_xml(local_data)
df = pd.DataFrame(article_rows)

In [19]:
len(df)

4024

In [20]:
df.head()

Unnamed: 0,pmc,pmid,year,title,abstract,keywords
0,PMC9707731,36465062,2022,"The scale-up finance gap in the EU: Causes, co...",This paper assesses the financing gap faced by...,"[European union, Policy design, Scale-up, Scal..."
1,PMC9716603,36465064,2022,The Use of Advanced Semiautomated Bone Segment...,Weightbearing computed tomography (WBCT) measu...,"[Hallux rigidus, WBCT, automatic, deformity, i..."
2,PMC9708711,36465065,2022,Implications of providing social support to cl...,Social support is a strong predictor of social...,"[close network members, emotional support, inf..."
3,PMC9524795,36465067,2021,Nanoparticle-mediated Delivery of IL-2 To T Fo...,We recently reported that poly lactic-co-glyco...,"[T cells, T follicular helper (TFH) cells, aut..."
4,PMC9524789,36465068,2021,Long-term Outcomes of Patients with Systemic L...,"To study the long-term outcomes, in the contex...","[organ damage, prognosis, systemic lupus eryth..."


In [21]:
# Save the parsed articles into DATA, swapping ".xml" for ".csv" in the file name.
csv_name = Path(local_data).name.replace(".xml", ".csv")
df.to_csv(DATA / csv_name, index=False)

## Filter the articles

In [34]:
def filter_articles(row, keywords=None):
    """
    Tell whether an article mentions any of the subject keywords.

    Args:
        row: Mapping (e.g. a DataFrame row) with a "title" string, an
            "abstract" string and a "keywords" list of strings.
        keywords: Lower-case phrases to search for; defaults to
            SUBJECT_KEYWORDS.

    Returns:
        True if any phrase occurs, case-insensitively, in the title, the
        abstract or the space-joined keyword list; False otherwise.
    """
    if keywords is None:
        keywords = SUBJECT_KEYWORDS
    # Lower-case each field once instead of once per keyword. The joined
    # keyword string is lower-cased *before* the membership test; the earlier
    # version called .lower() on the boolean result of the test.
    searchable_texts = (
        row['title'].lower(),
        row['abstract'].lower(),
        ' '.join(row['keywords']).lower(),
    )
    return any(keyword in text for keyword in keywords for text in searchable_texts)

# Row-wise mask from filter_articles, used to keep only the matching articles.
is_relevant = df.apply(filter_articles, axis=1)
filtered_df = df[is_relevant].reset_index(drop=True)

print(f"Total number of filtered articles: {len(filtered_df)}/{len(df)}")

filtered_df.head()

Total number of filtered articles: 13/4024


Unnamed: 0,pmc,pmid,year,title,abstract,keywords
0,PMC9713848,36465381,2022,Advancements in photodynamic therapy of esopha...,The poor prognosis of patients with esophageal...,"[5-ALA, Barrett’s esophagus, esophageal cancer..."
1,PMC9714501,36465399,2022,Research trends on anti-PD-1/PD-L1 immunothera...,The study aims to summarize publication charac...,"[CiteSpace, HistCite, VOSviewer, Web of Scienc..."
2,PMC9713002,36465939,2022,Risk factors of lymph node metastasis or lymph...,Lymphovascular invasion (LVI) is mostly used a...,"[lymph node metastasis, lymphovascular invasio..."
3,PMC9709130,36466456,2022,Esophagus cancer and essential trace elements.,Numerous epidemiological and laboratory studie...,"[copper, esophagus cancer, essential trace ele..."
4,PMC9712015,36466711,2022,The Role of Alternative Splicing Factors hnRNP...,Alternative splicing (AS) has been widely demo...,[]


At this point we could simply download the bulk text dump from [this FTP directory](https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/oa_noncomm/txt/). In this notebook, however, we crawl the articles one by one as an example.

In [41]:
def download_pmc_text(pmc_id: str) -> str:
    """
    Download an article page from PMC and return the text of its main content.

    Args:
        pmc_id: PMC identifier used in the article URL, e.g. "PMC9713848".

    Returns:
        The text content of the page's ``<div id="mc">`` element.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the page contains no ``<div id="mc">`` element.
    """
    url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc_id}/"
    # Browser-like User-Agent; presumably the site rejects the default
    # requests one -- TODO confirm.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    res = requests.get(url, headers=headers, timeout=60)
    # Surface HTTP errors directly instead of parsing an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    content_dom = soup.find("div", {"id": "mc"})
    if content_dom is None:
        # Explicit error instead of an opaque AttributeError on None.
        raise ValueError(f"No main content (div#mc) found for {pmc_id} at {url}")
    return content_dom.text

In [42]:
download_pmc_text(filtered_df.loc[0, "pmc"])

'Front Oncol. 2022; 12: 1024576. Published online 2022 Nov 17.  doi:\xa010.3389/fonc.2022.1024576PMCID: PMC9713848PMID: 36465381Advancements in photodynamic therapy of esophageal cancerDorota Bartusik-Aebisher,\n1\n,\n*\n Michał Osuchowski,\n2\n Marta Adamczyk,\n3\n Joanna Stopa,\n2\n Grzegorz Cieślar,\n4\n Aleksandra Kawczyk-Krupka,\n4\n,\n*\n and  David Aebisher\n5\n,\n*\nDorota Bartusik-Aebisher\n1\nDepartment of Biochemistry and General Chemistry, Medical College of The University of Rzeszów, Rzeszów, Poland\nFind articles by Dorota Bartusik-AebisherMichał Osuchowski\n2\nMedical College of The University of Rzeszów, Rzeszów, Poland\nFind articles by Michał OsuchowskiMarta Adamczyk\n3\nMedical Faculty, Medical University of Warsaw, Warsaw, Poland\nFind articles by Marta AdamczykJoanna Stopa\n2\nMedical College of The University of Rzeszów, Rzeszów, Poland\nFind articles by Joanna StopaGrzegorz Cieślar\n4\nDepartment of Internal Medicine, Angiology, and Physical Medicine, Center for 

In [44]:
from traceback import format_exc

In [45]:
# Download the text of every filtered article into TEXT/<pmc>.txt.
# Failures are recorded in ERRORS so one bad article does not stop the loop.
ERRORS = []
for i, row in filtered_df.iterrows():
    try:
        # Download first: opening the file before a failed download would
        # leave an empty .txt file behind.
        article_text = download_pmc_text(row["pmc"])
        # Write UTF-8 explicitly; the article text contains non-ASCII
        # characters that a platform default encoding may not handle.
        with open(TEXT / f"{row['pmc']}.txt", "w", encoding="utf-8") as f:
            f.write(article_text)
    except Exception as e:
        print(f"Error downloading {row['pmc']}")
        ERRORS.append(dict(
            pmc=row["pmc"],
            error=str(e),
            traceback=format_exc()
        ))
        