In [None]:
# NOTE: PubMed can block requests if the usage limit is exceeded, so the code below is meant to stay commented out after its first run.
# It was run once; the resulting CSV was downloaded to my local system and is uploaded again for every Colab session.
import requests
import time
from xml.etree import ElementTree as ET
import pandas as pd
import csv
import xml

# Constants
# Base URL of the NCBI E-utilities endpoints; the script name (e.g. "esearch.fcgi") is appended to it.
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
EMAIL = "shreya.chandra.24@gmail.com"  # Replace with your registered email
TOOL = "beautyRAGtool"  # Sent as the "tool" parameter of every request
# Placeholder value. Please email shrchandra@ucdavis.edu for the API key, or obtain one
# by logging in to the NCBI website (e.g. through UC Davis institutional access).
# NOTE(review): consider reading the key from the environment instead of hard-coding it.
API_KEY = "abc"
RATE_LIMIT = 4  # requests per second
MIN_INTERVAL = 1.0 / RATE_LIMIT  # minimum number of seconds between two consecutive requests

# time.time() value recorded after the most recent request; read and updated by
# rate_limited_get and rate_limited_post (0 means no request has been sent yet).
_last_request_time = 0  # Global tracker for throttling

def rate_limited_get(url, params, timeout=60):
    """Send a throttled HTTP GET request.

    Sleeps just long enough that at least ``MIN_INTERVAL`` seconds separate
    this request from the previous one recorded in ``_last_request_time``
    (the same timestamp is shared with ``rate_limited_post``).

    Args:
        url: Endpoint to request.
        params: Mapping sent as URL query parameters.
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.get``. Without a timeout a stalled connection
            would block forever.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked here.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    global _last_request_time
    elapsed = time.time() - _last_request_time
    if elapsed < MIN_INTERVAL:
        # Clamp the wait so a backwards wall-clock step can never make us
        # sleep longer than one full interval.
        time.sleep(min(MIN_INTERVAL - elapsed, MIN_INTERVAL))
    try:
        return requests.get(url, params=params, timeout=timeout)
    finally:
        # Record the attempt even when it failed, so an immediate retry is
        # still throttled.
        _last_request_time = time.time()

def rate_limited_post(url, params, timeout=60):
    """Send a throttled HTTP POST request.

    Sleeps just long enough that at least ``MIN_INTERVAL`` seconds separate
    this request from the previous one recorded in ``_last_request_time``
    (the same timestamp is shared with ``rate_limited_get``).

    Args:
        url: Endpoint to request.
        params: Mapping sent form-encoded in the request body (``data=``).
        timeout: Seconds to wait for the server before giving up; forwarded
            to ``requests.post``. Without a timeout a stalled connection
            would block forever.

    Returns:
        The ``requests.Response`` object. The HTTP status is not checked here.

    Raises:
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    global _last_request_time
    elapsed = time.time() - _last_request_time
    if elapsed < MIN_INTERVAL:
        # Clamp the wait so a backwards wall-clock step can never make us
        # sleep longer than one full interval.
        time.sleep(min(MIN_INTERVAL - elapsed, MIN_INTERVAL))
    try:
        return requests.post(url, data=params, timeout=timeout)
    finally:
        # Record the attempt even when it failed, so an immediate retry is
        # still throttled.
        _last_request_time = time.time()

def search_pubmed(term, retmax=3000) -> list:
    """Search PubMed and return the matching PMIDs.

    Args:
        term: Query string passed as the ESearch ``term`` parameter.
        retmax: Maximum number of IDs requested from the service.

    Returns:
        A list with the text of every ``<Id>`` element in the response.

    Raises:
        requests.exceptions.HTTPError: If the service answers with a 4xx/5xx
            status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    url = f"{BASE_URL}esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": term,
        "retmax": retmax,
        "retmode": "xml",
        "tool": TOOL,
        "email": EMAIL,
        "api_key": API_KEY,
    }
    response = rate_limited_get(url, params)
    # Fail with a clear HTTP error instead of a confusing XML parse error
    # when the service rejects the request.
    response.raise_for_status()
    # Parse the raw bytes so the XML parser picks the encoding itself rather
    # than relying on the charset requests guessed for ``response.text``.
    root = ET.fromstring(response.content)
    return [id_elem.text for id_elem in root.findall(".//Id")]

def _parse_pubmed_abstracts(xml_payload):
    """Turn an EFetch XML payload into one-element rows of abstract text."""
    rows = []
    for article in ET.fromstring(xml_payload).findall(".//PubmedArticle"):
        language = article.findtext(".//Language", default="N/A")
        if language.lower() != "eng":
            continue
        sections = []
        for section in article.iter("AbstractText"):
            label = section.attrib.get("Label", "Abstract")
            # itertext() also collects text nested inside inline markup
            # (<i>, <sub>, ...); ``.text`` alone stops at the first child tag
            # and is None for an empty element.
            text = "".join(section.itertext())
            sections.append(f"{label}: {text} ")
        rows.append(["".join(sections)])
    return rows


def fetch_abstracts(pmids):
    """Fetch the abstracts for a list of PMIDs.

    Args:
        pmids: Iterable of PMID strings; they are sent comma-joined as the
            EFetch ``id`` parameter.

    Returns:
        A list with one single-element list per English-language article.
        The element is the article's abstract, built by concatenating a
        ``"<Label>: <text> "`` piece for every ``AbstractText`` element
        (the label defaults to ``"Abstract"``). Returns ``[]`` without
        contacting the service when ``pmids`` is empty.

    Raises:
        requests.exceptions.HTTPError: If the service answers with a 4xx/5xx
            status.
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    pmids = list(pmids)
    if not pmids:
        # Nothing to look up; avoid sending a request with an empty id list.
        return []

    url = f"{BASE_URL}efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "xml",
        "rettype": "abstract",
        "tool": TOOL,
        "email": EMAIL,
        "api_key": API_KEY,
    }
    response = rate_limited_post(url, params)
    # Debug output: HTTP status followed by the raw response body.
    print(response.status_code)
    print(f"\n---Response (Abstract): {response.text} ---\n")
    response.raise_for_status()

    # Parse the raw bytes so the XML parser picks the encoding itself.
    return _parse_pubmed_abstracts(response.content)

# Example Usage
if __name__ == "__main__":
    # Many search keywords were tried, but this one gave the largest number of
    # relevant articles. Other keywords tried: "cosmetics products harmful
    # effects", "cosmetics products side effects", "unsafe ingredients beauty
    # products".
    pmids = search_pubmed("dermatological formulations side effects")
    rag_docs = fetch_abstracts(pmids)

    # newline="" lets the csv module control line endings; the explicit
    # UTF-8 encoding keeps non-ASCII abstract text from failing on platforms
    # whose default encoding is not UTF-8.
    with open(
        "/content/sample_data/PubMed_articles.csv",
        "w",
        newline="",
        encoding="utf-8",
    ) as csv_file:
        csv.writer(csv_file).writerows(rag_docs)
