**RESEARCH PAPER FETCHER**

Install the dependencies and import all the necessary libraries

In [1]:
!pip install -r requirements.txt

Collecting poetry (from -r requirements.txt (line 2))
  Using cached poetry-2.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting biopython (from -r requirements.txt (line 4))
  Using cached biopython-1.84-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting cachecontrol<0.15.0,>=0.14.0 (from cachecontrol[filecache]<0.15.0,>=0.14.0->poetry->-r requirements.txt (line 2))
  Downloading cachecontrol-0.14.2-py3-none-any.whl.metadata (3.1 kB)
Collecting cleo<3.0.0,>=2.1.0 (from poetry->-r requirements.txt (line 2))
  Using cached cleo-2.1.0-py3-none-any.whl.metadata (12 kB)
Collecting dulwich<0.23.0,>=0.22.6 (from poetry->-r requirements.txt (line 2))
  Using cached dulwich-0.22.7-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Collecting fastjsonschema<3.0.0,>=2.18.0 (from poetry->-r requirements.txt (line 2))
  Downloading fastjsonschema-2.21.1-py3-none-any.whl.metadata (2.2 kB)
Collecting installer<0.8.0,>=0.7.0 (from poetry->-r requirements.txt (line 2))
  Using cached installer-0.7.0-py3-no

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-cloud-auth 0.1.4 requires pydantic<2.0, but you have pydantic 2.9.2 which is incompatible.
anaconda-cloud-auth 0.1.4 requires semver<3, but you have semver 3.0.2 which is incompatible.
langchain-core 0.1.52 requires packaging<24.0,>=23.2, but you have packaging 24.2 which is incompatible.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.2 which is incompatible.


In [2]:
import argparse
import os
import re
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List

import pandas as pd
import requests
from Bio import Entrez

Details required to access data from the PubMed API

In [3]:
# Contact e-mail sent with Entrez requests. Set the ENTREZ_EMAIL environment
# variable, or replace the placeholder below with your own address.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "your_email@example.com")
# E-utilities endpoints used for searching and for fetching paper summaries
PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
PUBMED_SUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

Defining helper functions

In [4]:
# Function that searches PubMed for the query and returns up to `max_results` IDs
def fetch_pubmed_ids(query: str, max_results: int = 100, timeout: float = 30.0) -> List[str]:
    """Search PubMed for ``query`` and return the matching PubMed IDs.

    Args:
        query: Search term sent to the search endpoint as ``term``.
        max_results: Upper bound on the number of IDs requested (``retmax``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``idlist`` found in the JSON response, or an empty list when the
        response carries none or when ``query`` is blank.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Nothing to search for: skip the network round trip entirely.
    if not query or not query.strip():
        return []

    params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json"
    }
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the notebook cell indefinitely.
    response = requests.get(PUBMED_SEARCH_URL, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json().get("esearchresult", {}).get("idlist", [])

In [5]:
# Fetching summaries of each paper from the papers list
def fetch_paper_summaries(pubmed_ids: List[str], timeout: float = 30.0) -> List[Dict[str, str]]:
    """Fetch the title and publication date for each PubMed ID.

    Args:
        pubmed_ids: PubMed IDs to look up; sent as one comma-separated ``id``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        One dict per input ID, in input order, with the keys ``PubmedID``,
        ``Title`` and ``Publication Date``. Fields missing from the response
        are returned as empty strings. An empty input yields an empty list.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # Without IDs there is nothing to look up; avoid sending an empty `id`.
    if not pubmed_ids:
        return []

    params = {
        "db": "pubmed",
        "id": ",".join(pubmed_ids),
        "retmode": "json"
    }
    # requests applies no timeout by default, so a stalled connection would
    # otherwise block the notebook cell indefinitely.
    response = requests.get(PUBMED_SUMMARY_URL, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json().get("result", {})

    summaries = []
    for paper_id in pubmed_ids:
        # Look the record up once; IDs absent from the response fall back to {}.
        record = data.get(paper_id, {})
        summaries.append({
            "PubmedID": paper_id,
            "Title": record.get("title", ""),
            "Publication Date": record.get("pubdate", "")
        })
    return summaries

In [10]:
# Fetching names, affiliations and e-mail addresses of the authors listed in each research paper
# Finds an e-mail address inside a free-text affiliation string. The match
# must end on a word character or hyphen, so a trailing sentence period
# ("... jane@acme.com.") is left out.
_EMAIL_PATTERN = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")


def fetch_paper_authors_batch(pubmed_ids: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Fetch author details for a batch of PubMed IDs in a single request.

    Args:
        pubmed_ids: PubMed IDs to fetch; sent as one comma-separated ``id``.

    Returns:
        A dict mapping each PMID found in the response to a list of author
        dicts with the keys ``Author`` ("LastName ForeName"), ``Affiliation``
        (the affiliation text, or "N/A" when absent) and ``Email`` (the first
        e-mail address found in the affiliation text, or ``None``). Authors
        lacking a ``LastName`` or ``ForeName`` element are skipped. An empty
        input yields an empty dict.

    Raises:
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    # Without IDs there is nothing to fetch; avoid sending an empty `id`.
    if not pubmed_ids:
        return {}

    handle = Entrez.efetch(db="pubmed", id=",".join(pubmed_ids), rettype="xml", retmode="text")
    try:
        records = handle.read()
    finally:
        # Release the underlying connection even if reading fails.
        handle.close()
    root = ET.fromstring(records)

    result = {}
    for article in root.findall(".//PubmedArticle"):
        pmid_node = article.find(".//PMID")
        if pmid_node is None or not pmid_node.text:
            # An article without a PMID cannot be keyed in the result.
            continue

        authors_info = []
        for author in article.findall(".//AuthorList/Author"):
            last_name = author.find("LastName")
            fore_name = author.find("ForeName")
            if last_name is None or fore_name is None:
                continue

            affiliation_node = author.find("AffiliationInfo/Affiliation")
            # `.text` is None for an empty element, so normalise both the
            # missing and the empty case to None.
            affiliation = (affiliation_node.text or None) if affiliation_node is not None else None

            email = None
            if affiliation:
                # Search for the address itself instead of assuming it is the
                # last whitespace-separated token of the affiliation.
                match = _EMAIL_PATTERN.search(affiliation)
                if match:
                    email = match.group(0)

            authors_info.append({
                "Author": f"{last_name.text} {fore_name.text}",
                "Affiliation": affiliation or "N/A",
                "Email": email
            })
        result[pmid_node.text] = authors_info
    return result

In [11]:
# Main function to fetch and save data
def main(query, output_folder=None, debug=False):
    """Fetch PubMed papers for ``query`` and save them to a CSV file.

    Args:
        query: PubMed search term.
        output_folder: Folder to write the CSV into; it is created if it does
            not exist. When omitted, a folder named after the query is created
            under the current directory.
        debug: When true, print the query before fetching.

    Returns:
        None. The path of the written CSV is printed. When the search returns
        no IDs, a message is printed and nothing is written.
    """
    if debug:
        print(f"Fetching papers for query: {query}")

    # Fetch PubMed IDs
    pubmed_ids = fetch_pubmed_ids(query)
    if not pubmed_ids:
        # Nothing to summarise; avoid follow-up requests with an empty ID list.
        print(f"No papers found for query: {query}")
        return

    # Use multithreading to fetch summaries and authors concurrently
    with ThreadPoolExecutor() as executor:
        summaries_future = executor.submit(fetch_paper_summaries, pubmed_ids)
        authors_future = executor.submit(fetch_paper_authors_batch, pubmed_ids)
        summaries = summaries_future.result()
        authors_batch = authors_future.result()

    # Process the results
    results = []
    for summary in summaries:
        pubmed_id = summary["PubmedID"]
        authors = authors_batch.get(pubmed_id, [])
        # Heuristic: an author counts as non-academic when the affiliation
        # text does not mention "university" (case-insensitive).
        non_academic_authors = [a["Author"] for a in authors if "university" not in (a["Affiliation"] or "").lower()]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # column text is stable between runs (joining a set is not).
        affiliations = list(dict.fromkeys(a["Affiliation"] for a in authors))

        summary.update({
            "Non-academic Author(s)": ", ".join(non_academic_authors),
            "Affiliation(s)": ", ".join(affiliations),
            "Author Email": next((a["Email"] for a in authors if a["Email"]), "N/A")
        })
        results.append(summary)

    # Determine the output folder
    if not output_folder:
        # Spaces become underscores; characters that are invalid in file
        # names on common filesystems are replaced as well.
        sanitized_query = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", query.replace(" ", "_"))
        output_folder = os.path.join(".", sanitized_query)
    # Create the folder when missing, so a requested output_folder is honored
    # even if it does not exist yet.
    os.makedirs(output_folder, exist_ok=True)

    # Save the CSV using the query name
    full_path = save_to_csv_with_query_name(results, query, output_folder)

    # Print completion message
    print(f"Results saved to: {full_path}")


In [12]:
# Function to save to CSV
def save_to_csv_with_query_name(papers, query, output_folder):
    """Write ``papers`` to ``<output_folder>/<sanitized query>.csv``.

    Args:
        papers: Records to save; passed to ``pd.DataFrame``.
        query: Search term used to build the file name. Spaces become
            underscores, and characters that are invalid in file names on
            common filesystems (``< > : " / \\ | ? *`` and control characters)
            are replaced with underscores too.
        output_folder: Folder to write into; created if it does not exist.

    Returns:
        The path of the CSV file that was written.
    """
    sanitized_query = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", query.replace(" ", "_"))
    # Make the function usable on its own, without a pre-created folder.
    os.makedirs(output_folder, exist_ok=True)
    full_path = os.path.join(output_folder, f"{sanitized_query}.csv")
    pd.DataFrame(papers).to_csv(full_path, index=False)
    return full_path

In [15]:
# Example usage: the PubMed search term to fetch papers for
query = "Cancer Therapy"

# Run the main function; no output_folder is passed, so its default (None) applies
main(query)

Results saved to: .\Cancer_Therapy\Cancer_Therapy.csv
