In [2]:
# !pip install biopython tqdm

In [8]:
from Bio import Entrez
import json
from tqdm import tqdm
from pathlib import Path
import time
import re

In [13]:
Entrez.email = ...  # <-- email сюда
Entrez.tool = "DownloadAbstracts"

In [14]:
assert re.match(
    r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$',
    Entrez.email
)

TypeError: expected string or bytes-like object

In [18]:
DB = "pubmed"

QUERY = """
(
    mental disorders
    OR
    behavioural disorders
    OR
    "Психические расстройства"
    OR
    "расстройства поведения"
)
AND
(
    Russian[la]
    OR
    Russia[ad]
    OR
    "Россия"
    OR
    russian[lang]
)
""".strip()

N_RESULTS = 2000  # <-- Количество статей на скачивание

BATCH_SIZE = 100  # <-- Сколько скачивать за раз

SORT = "relevance"

DATADIR = Path.cwd().parent.resolve() / "data"
if not DATADIR.exists():
    DATADIR.mkdir()

In [5]:
with Entrez.esearch(
    db=DB,
    term=QUERY,
    retmax=N_RESULTS,
    sort=SORT
) as handle:
    search_results = Entrez.read(handle)

In [6]:
dict(search_results)

{'Count': '20795',
 'RetMax': '2000',
 'RetStart': '0',
 'IdList': ['38147386', '37966434', '37341087', '31851180', '37315252', '38147388', '38465821', '37141130', '31626176', '40879228', '39003693', '30335078', '7687494', '33993676', '39435780', '30141781', '29053128', '28625946', '33459538', '34460165', '34932286', '39731385', '17929646', '28374687', '37141133', '33081457', '35038848', '35271233', '38142343', '35238509', '39576167', '32790986', '26978045', '9926554', '31747145', '38147379', '28635935', '1780774', '28884729', '1775034', '28635739', '9139505', '28884713', '20517227', '10849959', '31793547', '26978057', '32678544', '2699141', '36170106', '13248152', '1775457', '38676687', '31513157', '636720', '18927971', '27456721', '7687495', '27240175', '23739516', '31156228', '40577175', '3577541', '40627427', '4722260', '32678551', '2633571', '7941905', '33340302', '16544905', '39945172', '6720187', '8048299', '38529860', '1332341', '18379487', '1661525', '11811122', '29171481', '6

In [7]:
id_list = search_results["IdList"]
len(id_list)

2000

In [8]:
abstracts = []

In [9]:
for start in tqdm(range(0, len(id_list), BATCH_SIZE)):
    end = min(
        len(id_list),
        start + BATCH_SIZE
    )

    with Entrez.efetch(
        db="pubmed",
        id=id_list[start:end],
        rettype="xml",
        retmode="xml"
    ) as handle:
        articles = Entrez.read(handle)["PubmedArticle"]

    for article in articles:
        pubmed_id = article["MedlineCitation"]["PMID"]
        article_data = article["MedlineCitation"]["Article"]

        try:
            title = article['MedlineCitation']["Article"]['VernacularTitle']
        except Exception:
            title = article_data.get("ArticleTitle", "avaliable 💔")

        other_abstract_list = article.get("MedlineCitation", dict()).get("OtherAbstract", list())
        if other_abstract_list:
            abstract_parts = other_abstract_list[0]['AbstractText']

        ## С пропуском если нет русского абстракта ##
        else:
            continue

        # ##  С загрузкой английских абстрактов  ##
        # elif article_data.get("Abstract", dict()).get("AbstractText", list()):
        #     abstract_parts = article_data["Abstract"]["AbstractText"]
        # else:
        #     count_unavailable += 1
        #     abstract_parts = "No Abstract avaliable 💔"

        if isinstance(abstract_parts, list):
            abstract = ' '.join([part for part in abstract_parts if part])
        else:
            abstract = abstract_parts

        abstracts.append({
            'pubmed_id': pubmed_id,
            'title': title,
            'abstract': abstract,
        })


    time.sleep(0.3)  # Нужна задержка чтобы не насиловать сервер

100%|██████████| 20/20 [00:56<00:00,  2.83s/it]


In [10]:
tot = len(abstracts)
print(f"{tot}/{N_RESULTS} fetched.")
print(f"{N_RESULTS - tot}/{N_RESULTS} unavaliable.")

159/2000 fetched.
1841/2000 unavaliable.


In [None]:
with open(DATADIR / "abstracts.json", "w", encoding="utf-8") as file:
    json.dump(abstracts, file, indent=4, ensure_ascii=False)