In [1]:
# Setup note only: this cell holds a bare string, so nothing is installed by
# running it. The commands install the two third-party packages the notebook
# imports (Bio -> biopython, dotenv -> python-dotenv); the last line is the
# one-command equivalent of the first two.
'''
pip install biopython
pip install python-dotenv
pip install biopython python-dotenv
'''

'\npip install biopython\npip install python-dotenv\npip install biopython python-dotenv\n'

In [2]:
from Bio import Entrez
import time
import os
from dotenv import load_dotenv
import math
import calendar

In [None]:
# Project root: the parent of the current working directory.
project_dir = os.path.dirname(os.getcwd())

# Credentials live in <project_dir>/config/.env; loading that file makes the
# EMAIL and API_KEY entries available through os.environ.
dotenv_path = os.path.join(project_dir, 'config', '.env')
load_dotenv(dotenv_path=dotenv_path)

# Folder for the PubMed-format data files.
data_dir = os.path.join(project_dir, 'data','PubMed_Format')

In [14]:


def search_and_download_abstracts(query, email, api_key, start_date, end_date, output_file):
    """Search PubMed for ``query`` in a publication-date window and save the hits.

    The search is issued with ``usehistory="y"`` and the matching records are
    then fetched through the returned ``WebEnv``/``QueryKey`` in batches of
    1000, in MEDLINE text format, and written to ``output_file``. If the file
    already exists, the new records are appended after a separating newline;
    otherwise the file (and its parent directory, when the path has one) is
    created.

    Args:
        query: PubMed search term passed to ``Entrez.esearch``.
        email: Contact e-mail assigned to ``Entrez.email``.
        api_key: NCBI API key assigned to ``Entrez.api_key``.
        start_date: Lower bound of the publication-date ("pdat") filter.
        end_date: Upper bound of the publication-date filter.
        output_file: Path of the text file that receives the records.

    Returns:
        The number of records reported by the search, or -1 when that number
        exceeds 10000. In the -1 case nothing is downloaded and no file or
        directory is created, so the caller can retry with a narrower range.
    """
    Entrez.email = email       # Set the email
    Entrez.api_key = api_key   # Set the API key

    search_handle = Entrez.esearch(db="pubmed",
                                   term=query,
                                   datetype="pdat",
                                   mindate=start_date,
                                   maxdate=end_date,
                                   usehistory="y")
    # Close the handle even if parsing the response fails.
    try:
        search_results = Entrez.read(search_handle)
    finally:
        search_handle.close()

    count = int(search_results["Count"])

    # to avoid the API limit of 10000 records
    if count > 10000:
        return -1

    batch_size = 1000

    print(f"Found {count} results. Downloading abstracts... for year {start_date[:7]} -> {end_date[:7]}")

    # Ensure the directory exists. A bare file name has an empty dirname, and
    # os.makedirs('') raises FileNotFoundError, so only create a real path.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Check if file exists to determine mode ('a' for append, 'w' for write)
    file_mode = 'a' if os.path.isfile(output_file) else 'w'

    # Write as UTF-8 explicitly so non-ASCII characters in the records do not
    # depend on (or fail under) the platform's default encoding.
    with open(output_file, file_mode, encoding="utf-8") as out_file:
        if file_mode == 'a':
            out_file.write("\n")  # Ensure there's a newline before appending if file exists
        for start in range(0, count, batch_size):
            end = min(count, start + batch_size)
            print(f"Downloading record {start + 1} to {end} of {count}")
            fetch_handle = Entrez.efetch(db="pubmed",
                                         rettype="medline",
                                         retmode="text",
                                         retstart=start,
                                         retmax=batch_size,
                                         webenv=search_results["WebEnv"],
                                         query_key=search_results["QueryKey"])
            # Close the handle even if reading the batch fails.
            try:
                data = fetch_handle.read()
            finally:
                fetch_handle.close()
            out_file.write(data)
            time.sleep(1)  # Respect PubMed's servers by sleeping 1 second between requests

    return count

In [16]:
# Running totals. current_year_count / current_month_count hold the result of
# the most recent yearly / monthly request.
total_number_of_records = 0
current_year_count = 0
current_month_count = 0

query = 'intelligence[Title/Abstract]'
email = os.environ["EMAIL"]
api_key = os.environ["API_KEY"]

# NOTE: search_and_download_abstracts appends to an existing output file, so
# re-running this cell without removing earlier files duplicates the records.
for year in range(2013, 2025):
    output_file = os.path.join(project_dir, 'data', f'raw_pubmed_intelligence_abstracts_{year}.txt')
    current_year_count = search_and_download_abstracts(query, email, api_key,
                                                       start_date=f"{year}-01-01",
                                                       end_date=f"{year}-12-31",
                                                       output_file=output_file)

    # Anything other than -1 means the whole year was downloaded in one go,
    # so the yearly count is what gets added to the total.
    if current_year_count != -1:
        total_number_of_records += current_year_count
        continue

    # -1 means the year has more than 10,000 records: fetch it month by month
    # into the same yearly file instead.
    for month in range(1, 13):
        # Determine the last day of the month
        month_end_day = calendar.monthrange(year, month)[1]

        # Perform the search and download operation
        current_month_count = search_and_download_abstracts(query, email, api_key,
                                                            start_date=f"{year}-{month:02d}-01",
                                                            end_date=f"{year}-{month:02d}-{month_end_day}",
                                                            output_file=output_file)

        # A single month can also exceed the limit. Nothing was downloaded for
        # it, so report it instead of adding the -1 sentinel to the total.
        if current_month_count == -1:
            print(f"WARNING: {year}-{month:02d} has more than 10,000 records and was skipped; "
                  "split it into a narrower date range.")
            continue

        total_number_of_records += current_month_count

# Grand total across all years. The figure of 63333 recorded from an earlier
# run did not count the single-request years correctly, so expect a different
# number here.
print(f"total number of downloaded records {total_number_of_records}")

Found 1660 results. Downloading abstracts... for year 2013-01 -> 2013-12
Downloading record 1 to 1000 of 1660
Downloading record 1001 to 1660 of 1660
Found 1798 results. Downloading abstracts... for year 2014-01 -> 2014-12
Downloading record 1 to 1000 of 1798
Downloading record 1001 to 1798 of 1798
Found 1974 results. Downloading abstracts... for year 2015-01 -> 2015-12
Downloading record 1 to 1000 of 1974
Downloading record 1001 to 1974 of 1974
Found 2011 results. Downloading abstracts... for year 2016-01 -> 2016-12
Downloading record 1 to 1000 of 2011
Downloading record 1001 to 2000 of 2011
Downloading record 2001 to 2011 of 2011
Found 2359 results. Downloading abstracts... for year 2017-01 -> 2017-12
Downloading record 1 to 1000 of 2359
Downloading record 1001 to 2000 of 2359
Downloading record 2001 to 2359 of 2359
Found 3341 results. Downloading abstracts... for year 2018-01 -> 2018-12
Downloading record 1 to 1000 of 3341
Downloading record 1001 to 2000 of 3341
Downloading record 2