In [1]:
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
import time
import os


In [2]:
def download_pdf_with_retry(url, file_path, headers=None, num_retries=3,
                            timeout=30, retry_delay=5):
    """Download ``url`` to ``file_path``, retrying on request failures.

    Args:
        url: Address to fetch with an HTTP GET.
        file_path: Destination path; written in binary mode only after a
            response with a successful status code has been received.
        headers: Optional dict of HTTP headers passed to ``requests.get``.
        num_retries: Maximum number of attempts. Values <= 0 make no attempt.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a
            stalled server cannot hang the call indefinitely.
        retry_delay: Seconds to sleep between consecutive failed attempts.

    Returns:
        True if the file was written, False if every attempt failed.

    Raises:
        OSError: If ``file_path`` cannot be opened or written.
    """
    for attempt in range(1, num_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Raises HTTPError for 4xx/5xx status codes.
            response.raise_for_status()
        except requests.RequestException as e:
            # RequestException is the common base class, so connection errors
            # and timeouts are retried as well as bad HTTP statuses.
            print(f"Attempt {attempt} failed for {url}: {e}")
            if attempt < num_retries:
                # Pause between attempts to avoid hammering the server;
                # there is nothing to wait for after the final attempt.
                time.sleep(retry_delay)
            continue

        with open(file_path, 'wb') as f:
            f.write(response.content)
        return True

    return False

def search_pubmed_and_download_pdfs(email, term, download_folder="new_pdfs", retmax=20):
    """Search PubMed for ``term`` and try to download a PMC PDF for each hit.

    For every PubMed ID returned by the search, the linked PMC ID is looked up
    via ``Entrez.elink``; when one exists, the PDF URL is built from it and
    handed to ``download_pdf_with_retry``. Errors for a single article are
    printed and do not stop the remaining articles from being processed.

    Args:
        email: Contact address assigned to ``Entrez.email``.
        term: PubMed search query.
        download_folder: Directory that receives the ``PMC<id>.pdf`` files;
            created if it does not exist.
        retmax: Maximum number of search results to process.
    """
    Entrez.email = email  # Entrez contact address supplied by the caller
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"
    headers = {"User-Agent": user_agent}

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(download_folder, exist_ok=True)

    # Search PubMed for matching entries.
    search_handle = Entrez.esearch(db="pubmed", term=term, retmax=retmax)
    try:
        search_record = Entrez.read(search_handle)
    finally:
        search_handle.close()

    for pubmed_id in search_record["IdList"]:
        try:
            # Look up the PMC ID, which is needed to construct the PDF URL.
            link_handle = Entrez.elink(dbfrom="pubmed", id=pubmed_id, linkname="pubmed_pmc")
            try:
                link_record = Entrez.read(link_handle)
            finally:
                link_handle.close()

            pmc_id_links = link_record[0]["LinkSetDb"] if link_record else []
            if pmc_id_links and 'Link' in pmc_id_links[0] and pmc_id_links[0]["Link"]:
                pmc_id = pmc_id_links[0]["Link"][0]["Id"]
                pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmc_id}/pdf/"
                file_path = os.path.join(download_folder, f"PMC{pmc_id}.pdf")

                download_pdf_with_retry(pdf_url, file_path, headers=headers, num_retries=3)
            else:
                print(f"No PMC article available for PubMed ID: {pubmed_id}")
        except Exception as e:
            # Deliberately broad: one failing article must not abort the batch.
            print(f"Error occurred for PubMed ID {pubmed_id}: {e}")

In [4]:
# Replace the fields below, then run the function.
your_email = "shou@nd.edu"  # replace with your own email address
search_term = "Differential Oxygen Exposure Modulates Mesenchymal Stem Cell Metabolism and Proliferation through mTOR Signaling"  # enter your search keywords
search_pubmed_and_download_pdfs(your_email, search_term)


In [2]:
from Bio import Entrez
import json

# Set the contact email address used for Entrez requests.
Entrez.email = "shou@nd.edu"

# Search query to look up.
search_term = "Differential Oxygen Exposure Modulates Mesenchymal Stem Cell Metabolism and Proliferation through mTOR Signaling"

# Search PubMed with esearch and collect the list of PubMed IDs (PMIDs).
# retmax limits the number of results returned.
search_handle = Entrez.esearch(db="pubmed", term=search_term, retmax=2)
try:
    search_results = Entrez.read(search_handle)
finally:
    # Close the handle even if parsing fails.
    search_handle.close()

pmids = search_results["IdList"]
for pmid in pmids:
    print(pmid)
    # Fetch the full PubMed record for this PMID as XML.
    article = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    try:
        xml_data = Entrez.read(article)
    finally:
        article.close()
    # Pretty-printed copy of the parsed record, kept for manual inspection.
    formatted_xml = json.dumps(xml_data, indent=4)
    # NOTE(review): the abstract is presumably at
    # xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
    # -- confirm against a real record; the key may be missing for some articles.



35409106
