In [1]:
import pandas as pd
import json
from Bio import Entrez

In [2]:
# Set the email address to avoid any potential issues with Entrez
Entrez.email = 'rdc47@duke.edu'

In [3]:
# Define lists of authors and topics
authors = []  # Example authors, adjust as needed
topics = ['RNAi', "siRNA", "ASO", "mRNA"]  # Example topics, adjust as needed

In [4]:
# Define date range
date_range = '("2020/01/01"[Date - Create] : "2024/09/18"[Date - Create])'

In [5]:
def parse_pub_date(pub_date): 
    if 'Year' in pub_date: 
        year = pub_date['Year']
        month = pub_date.get('Month', 'Jan')
        day = pub_date.get('Day', '01')
        return f"{year}-{month}-{day}"
    return "Not Available"

In [6]:
# Build the query dynamically based on the available authors and topics
queries = []

if authors:
    author_queries = ['{}[Author]'.format(author) for author in authors]
    queries.append('(' + ' OR '.join(author_queries) + ')')

if topics:
    topic_queries = ['{}[Title/Abstract]'.format(topic) for topic in topics]
    queries.append('(' + ' OR '.join(topic_queries) + ')')

full_query = ' AND '.join(queries) + ' AND ' + date_range

In [7]:
# Search PubMed for relevant records
handle = Entrez.esearch(db='pubmed', retmax=100, term=full_query)
record = Entrez.read(handle)
id_list = record['IdList']

In [8]:
# DataFrame to store the extracted data
df = pd.DataFrame(columns=['PMID', 'Title', 'Abstract', 'Authors', 'Journal', 'Keywords', 'URL', 'Affiliations', "Pub Date"])

In [9]:
# Fetch information for each record in the id_list
for pmid in id_list:
    handle = Entrez.efetch(db='pubmed', id=pmid, retmode='xml')
    records = Entrez.read(handle)

    # Process each PubMed article in the response
    for record in records['PubmedArticle']:
        # Print the record in a formatted JSON style
        print(json.dumps(record, indent=4, default=str))  # default=str handles types JSON can't serialize like datetime
        title = record['MedlineCitation']['Article']['ArticleTitle']
        abstract = ' '.join(record['MedlineCitation']['Article']['Abstract']['AbstractText']) if 'Abstract' in record['MedlineCitation']['Article'] and 'AbstractText' in record['MedlineCitation']['Article']['Abstract'] else ''
        authors = ', '.join(author.get('LastName', '') + ' ' + author.get('ForeName', '') for author in record['MedlineCitation']['Article']['AuthorList'])
        
        affiliations = []
        for author in record['MedlineCitation']['Article']['AuthorList']:
            if 'AffiliationInfo' in author and author['AffiliationInfo']:
                affiliations.append(author['AffiliationInfo'][0]['Affiliation'])
        affiliations = '; '.join(set(affiliations))

        journal = record['MedlineCitation']['Article']['Journal']['Title']
        keywords = ', '.join(keyword['DescriptorName'] for keyword in record['MedlineCitation']['MeshHeadingList']) if 'MeshHeadingList' in record['MedlineCitation'] else ''
        url = f"https://www.ncbi.nlm.nih.gov/pubmed/{pmid}"
        
        pub_date = parse_pub_date(record['MedlineCitation']['Article']["Journal"]["JournalIssue"]["PubDate"])

        new_row = pd.DataFrame({
            'PMID': [pmid],
            'Title': [title],
            'Abstract': [abstract],
            'Authors': [authors],
            'Journal': [journal],
            'Pub Date': [pub_date],
            'Keywords': [keywords],
            'URL': [url],
            'Affiliations': [affiliations]
        })

        df = pd.concat([df, new_row], ignore_index=True)

{
    "MedlineCitation": {
        "InvestigatorList": [],
        "OtherAbstract": [],
        "KeywordList": [
            [
                "lambs",
                "rumen epithelia",
                "ruminal microbiota",
                "secoisolariciresinol diglucoside"
            ]
        ],
        "GeneralNote": [],
        "OtherID": [],
        "SpaceFlightMission": [],
        "CitationSubset": [
            "IM"
        ],
        "PMID": "39291551",
        "DateRevised": {
            "Year": "2024",
            "Month": "09",
            "Day": "18"
        },
        "Article": {
            "ELocationID": [
                "10.1002/jsfa.13909"
            ],
            "ArticleDate": [
                {
                    "Year": "2024",
                    "Month": "09",
                    "Day": "18"
                }
            ],
            "Language": [
                "eng"
            ],
            "Journal": {
                "ISSN": "1097-0010",
      

In [10]:
df

Unnamed: 0,PMID,Title,Abstract,Authors,Journal,Keywords,URL,Affiliations,Pub Date
0,39291551,Dietary secoisolariciresinol diglucoside crude...,"Flaxseed lignans, types of polyphenolic compou...","Liu Ning, Yu Shiqiang, Qu Jinrui, Tian Boya, L...",Journal of the science of food and agriculture,,https://www.ncbi.nlm.nih.gov/pubmed/39291551,Ruminant Nutrition and Feed Engineering Techno...,2024-Sep-18
1,39291428,HN1 expression contributes to mitotic fidelity...,Hematological and neurological expressed 1 (HN...,"Özduman Gülseren, Şimşek Faruk, Javed Aadil, K...","Cytoskeleton (Hoboken, N.J.)",,https://www.ncbi.nlm.nih.gov/pubmed/39291428,"Cancer Biology Laboratory, Department of Bioen...",2024-Sep-18
2,39291396,TRIM56 Modulates YBX1 Degradation to Ameliorat...,Spinal cord injury (SCI) is a severe injury to...,"Lou Junsheng, Mao Yiting, Jiang Wu, Shen Hongh...","Advanced science (Weinheim, Baden-Wurttemberg,...",,https://www.ncbi.nlm.nih.gov/pubmed/39291396,"Department of Orthopeadics, Affiliated Hangzho...",2024-Sep-18
3,39291269,Regulation of TGF-β1-induced fibroblast differ...,Human periodontal ligament stem cells (hPDLSCs...,"Ju Onyou, Ko Seon-Yle, Jang Young-Joo",Frontiers in cell and developmental biology,,https://www.ncbi.nlm.nih.gov/pubmed/39291269,Department of Nanobiomedical Science and BK21 ...,2024-Jan-01
4,39291186,APOL1 High-Risk Genotype is Not Associated Wit...,SARS-CoV-2 infection increases systemic inflam...,"Nystrom Sarah E, Soldano Karen L, Rockett Mick...",Kidney international reports,,https://www.ncbi.nlm.nih.gov/pubmed/39291186,Duke Clinical and Translational Science Instit...,2024-Sep-01
...,...,...,...,...,...,...,...,...,...
95,39286219,Advancements in diabetic foot ulcer research: ...,Diabetes represents a widely acknowledged glob...,"Wu ShuHui, Zhou ZhongSheng, Li Yang, Jiang Jinlan",Heliyon,,https://www.ncbi.nlm.nih.gov/pubmed/39286219,"Scientific Research Center, China-Japan Union ...",2024-Sep-15
96,39286150,Identification and analysis correlation betwee...,The occurrence of immunity and inflammation ou...,"Qiang Wang, Deng Wen Juan, Song Shu Ling, Pan ...",Heliyon,,https://www.ncbi.nlm.nih.gov/pubmed/39286150,"Department of Anesthesiology, Guangxi Medical ...",2024-Sep-15
97,39286058,"Randomised, phase 1/2a trial of ION-827359, an...",Hyperactivity of epithelial sodium channel (EN...,"Sutharsan Sivagurunathan, Fischer Rainald, Gle...",ERJ open research,,https://www.ncbi.nlm.nih.gov/pubmed/39286058,"Mukoviszidose-Zentrum München-West, Munich, Ge...",2024-Jul-01
98,39285786,"Flame retardant, hexabromocyclododecane, incre...",Hexabromocyclododecane (HBCD) is an environmen...,"Falconer-Turner April, Brooks Kameron, Ogaga E...",Journal of applied toxicology : JAT,,https://www.ncbi.nlm.nih.gov/pubmed/39285786,"Department of Chemistry, Tennessee State Unive...",2024-Sep-17


In [11]:
df.to_excel('PubMed_resultsx.xlsx', index=False)