In [1]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install biopython tqdm

In [2]:
from Bio import Entrez
import json
from tqdm import tqdm
from pathlib import Path
import time
import re

In [3]:
# Identification attributes set on the Bio.Entrez module.
# `...` is a placeholder and must be replaced with a real address string;
# the following cell asserts that the value looks like an e-mail.
Entrez.email = ...  # <-- put your e-mail here
Entrez.tool = "DownloadAbstracts"

In [5]:
# Fail fast, with a readable message, when the e-mail placeholder was left
# untouched (non-string value) or the value does not look like an address.
# The isinstance guard keeps re.match from raising an opaque TypeError.
assert isinstance(Entrez.email, str) and re.match(
    r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$',
    Entrez.email
), "Set Entrez.email to a valid e-mail address before running the notebook"

In [6]:
# --- Notebook configuration: everything a reader might want to tune ---

# Entrez database passed to the search request.
DB = "pubmed"

# Search term: (topic terms, English and Russian) AND (language / affiliation
# filters). Surrounding whitespace is stripped before the term is sent.
QUERY = """
(
    mental disorders
    OR
    behavioural disorders
    OR
    "Психические расстройства"
    OR
    "расстройства поведения"
)
AND
(
    Russian[la]
    OR
    Russia[ad]
    OR
    "Россия"
    OR
    russian[lang]
)
""".strip()

N_RESULTS = 2000  # <-- number of articles to download

BATCH_SIZE = 100  # <-- how many articles to fetch per request

SORT = "relevance"

# Output directory: "data" inside the parent of the current working directory.
DATADIR = Path.cwd().parent.resolve() / "data"
# exist_ok makes the cell safely re-runnable without a separate existence check.
DATADIR.mkdir(parents=True, exist_ok=True)

In [7]:
# Run the search with the parameters from the configuration cell and parse
# the response into `search_results`.
search_params = {
    "db": DB,
    "term": QUERY,
    "retmax": N_RESULTS,
    "sort": SORT,
}
with Entrez.esearch(**search_params) as search_handle:
    search_results = Entrez.read(search_handle)

In [8]:
# IDs returned by the search; they are fetched in batches further down.
id_list = search_results["IdList"]
# Last expression of the cell: displays how many IDs were returned.
len(id_list)

2000

In [9]:
# Accumulator for the downloaded records; the download loop appends dicts
# with 'pubmed_id', 'title' and 'abstract' keys to it.
abstracts = []

In [10]:
def _extract_record(article):
    """Build one output record from a parsed PubMed article.

    Parameters
    ----------
    article : mapping
        One element of the ``"PubmedArticle"`` list returned by ``Entrez.read``.

    Returns
    -------
    dict or None
        A dict with ``'pubmed_id'``, ``'title'`` and ``'abstract'`` keys, or
        ``None`` when the article has no ``OtherAbstract`` entry (it is skipped).

    Raises
    ------
    KeyError
        If the article lacks ``MedlineCitation``, ``PMID`` or ``Article``.
    """
    citation = article["MedlineCitation"]
    pubmed_id = citation["PMID"]
    article_data = citation["Article"]

    # Use VernacularTitle when present, otherwise fall back to ArticleTitle.
    # An explicit key test replaces the former catch-all `except Exception`.
    if "VernacularTitle" in article_data:
        title = article_data["VernacularTitle"]
    else:
        title = article_data.get("ArticleTitle", "avaliable 💔")

    other_abstracts = citation.get("OtherAbstract", [])
    if not other_abstracts:
        # Skip articles without a Russian abstract.
        # Alternative kept from the original (load the English abstract
        # instead of skipping):
        #     elif article_data.get("Abstract", dict()).get("AbstractText", list()):
        #         abstract_parts = article_data["Abstract"]["AbstractText"]
        #     else:
        #         abstract_parts = "No Abstract avaliable 💔"
        return None

    # NOTE(review): the first OtherAbstract is presumed to be the Russian one;
    # its language is not checked here — confirm against real responses.
    abstract_parts = other_abstracts[0]['AbstractText']

    # The abstract may arrive as several parts; join the non-empty ones.
    if isinstance(abstract_parts, list):
        abstract = ' '.join(part for part in abstract_parts if part)
    else:
        abstract = abstract_parts

    return {
        'pubmed_id': pubmed_id,
        'title': title,
        'abstract': abstract,
    }


# Start from an empty list so that re-running this cell does not append
# duplicates to records collected by a previous run.
abstracts = []

for start in tqdm(range(0, len(id_list), BATCH_SIZE)):
    # Slicing clamps at the end of the list, so the last batch may be shorter.
    batch_ids = id_list[start:start + BATCH_SIZE]

    with Entrez.efetch(
        db=DB,
        id=batch_ids,
        rettype="xml",
        retmode="xml"
    ) as handle:
        articles = Entrez.read(handle)["PubmedArticle"]

    for article in articles:
        record = _extract_record(article)
        if record is not None:
            abstracts.append(record)

    time.sleep(0.3)  # pause between requests so the server is not overloaded

100%|██████████| 20/20 [00:57<00:00,  2.87s/it]


In [11]:
# Report how many of the IDs returned by the search produced a record; the
# remainder were skipped by the download loop.
tot = len(abstracts)
# Count against the IDs actually returned: the search may yield fewer hits
# than N_RESULTS, which would otherwise inflate the "unavaliable" figure.
n_found = len(id_list)
print(f"{tot}/{n_found} fetched.")
print(f"{n_found - tot}/{n_found} unavaliable.")

159/2000 fetched.
1841/2000 unavaliable.


In [12]:
# Persist the collected records as pretty-printed UTF-8 JSON; non-ASCII
# characters are written as-is rather than as \u escapes.
output_path = DATADIR / "abstracts.json"
with output_path.open("w", encoding="utf-8") as out_file:
    json.dump(abstracts, out_file, indent=4, ensure_ascii=False)