In [None]:
#@title Literature_spider

import requests
from xml.etree import ElementTree
import pandas as pd
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display
import re

# Show every column when a DataFrame is printed, but cap the printed output
# at 20 rows (longer frames are shown truncated).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

# Base URL for PubMed E-utilities API
base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

# Publication-date window used by the search: from January 1 five years ago
# up to today. Both bounds come from a single clock reading so they cannot
# disagree if the cell happens to run across a day/year rollover.
_now = datetime.now()
current_date = _now.strftime('%Y/%m/%d')
five_years_ago_date = datetime(_now.year - 5, 1, 1).strftime('%Y/%m/%d')

# Function to construct the query
def construct_query(keywords, start_date=None, end_date=None):
    """Build a PubMed search term limiting *keywords* to a publication-date range.

    Args:
        keywords: Search expression; inserted into the term unchanged.
        start_date: Lower bound of the range, formatted like the module-level
            defaults ('YYYY/MM/DD'). When omitted, ``five_years_ago_date``
            is used.
        end_date: Upper bound of the range in the same format. When omitted,
            ``current_date`` is used.

    Returns:
        A string of the form
        ``<keywords> AND ("<start>"[PDAT] : "<end>"[PDAT])``.
    """
    # Fall back to the module-level window only when the caller gave no bound,
    # so existing single-argument calls behave exactly as before.
    if start_date is None:
        start_date = five_years_ago_date
    if end_date is None:
        end_date = current_date
    return f'{keywords} AND ("{start_date}"[PDAT] : "{end_date}"[PDAT])'

# Search form inputs: a free-text keyword box sized to half the available
# width, plus an Output widget for the search handler's printed output.
_half_width = widgets.Layout(width='50%')
keywords_input = widgets.Text(
    description='Keywords:',
    placeholder='Enter search keywords',
    value='',
    layout=_half_width,
    disabled=False,
)

output = widgets.Output()

def _fetch_xml(utility, params):
    """Call one E-utilities endpoint and return the parsed XML root element.

    Args:
        utility: Endpoint file name appended to ``base_url``
            (e.g. ``'esearch.fcgi'``).
        params: Query-string parameters for the request.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
        ElementTree.ParseError: If the response body is not well-formed XML.
    """
    # A timeout keeps an unresponsive server from hanging the notebook cell.
    response = requests.get(f'{base_url}{utility}', params=params, timeout=30)
    response.raise_for_status()
    # Parse the raw bytes so the encoding named in the XML declaration is used.
    return ElementTree.fromstring(response.content)


def _docsum_item_text(docsum, name, default='N/A'):
    """Return the stripped text of the DocSum ``<Item Name=name>``, or *default* if missing/empty."""
    item = docsum.find(f'Item[@Name="{name}"]')
    text = (item.text or '').strip() if item is not None else ''
    return text or default


def _docsum_to_row(docsum):
    """Convert one ESummary ``<DocSum>`` element into a result-row dict."""
    pmid = (docsum.findtext('Id') or '').strip()

    # Authors are delivered as a list item ("AuthorList") whose children are
    # individual "Author" items; the list element itself carries no text.
    author_list = docsum.find('Item[@Name="AuthorList"]')
    author_names = []
    if author_list is not None:
        author_names = [
            author.text.strip()
            for author in author_list.findall('Item[@Name="Author"]')
            if author.text and author.text.strip()
        ]

    # PubDate looks like "2023 Jan 5"; the first token is the year.
    pub_date = _docsum_item_text(docsum, 'PubDate', default='')

    return {
        'Title': _docsum_item_text(docsum, 'Title'),
        'URL': f'https://pubmed.ncbi.nlm.nih.gov/{pmid}/',
        'Authors': ', '.join(author_names) if author_names else 'N/A',
        'Journal': _docsum_item_text(docsum, 'FullJournalName'),
        'Publication Year': pub_date.split()[0] if pub_date else 'N/A',
    }


def on_button_clicked(b):
    """Handle a click on the search button.

    Reads the keywords from ``keywords_input``, searches PubMed through the
    ESearch utility, retrieves the matching article summaries with a single
    ESummary request, shows the first ten results inside ``output`` and
    saves all of them to a timestamped CSV file in the working directory.

    Nothing is saved when the keyword box is empty or when a PubMed request
    fails; a message is printed instead.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    with output:
        output.clear_output()
        user_keywords = keywords_input.value.strip()

        # An empty keyword would produce a malformed term (" AND (...)").
        if not user_keywords:
            print("Please enter at least one search keyword.")
            return

        # Replace runs of non-word characters with underscores for the file name
        safe_keywords = re.sub(r'\W+', '_', user_keywords)

        # Create the query using user input
        query = construct_query(user_keywords)

        columns = ['Title', 'URL', 'Authors', 'Journal', 'Publication Year']
        rows = []

        # Inform the user that the search is in progress
        print("Searching PubMed for articles...")

        try:
            # NOTE: no 'retmax' is sent, so ESearch returns its default
            # number of IDs (20 per the E-utilities documentation).
            search_root = _fetch_xml('esearch.fcgi', {
                'db': 'pubmed',
                'term': query,
                'retmode': 'xml'
            })
            id_list = [id_elem.text for id_elem in search_root.iter('Id') if id_elem.text]

            if id_list:
                # Inform the user that article details are being retrieved
                print("Retrieving article details...")

                # ESummary accepts a comma-separated ID list, so one request
                # replaces one-request-per-article and stays clear of NCBI's
                # request-rate limits.
                summary_root = _fetch_xml('esummary.fcgi', {
                    'db': 'pubmed',
                    'id': ','.join(id_list),
                    'retmode': 'xml'
                })
                rows = [_docsum_to_row(docsum) for docsum in summary_root.iter('DocSum')]
            else:
                print("No articles matched the query.")
        except (requests.RequestException, ElementTree.ParseError) as err:
            # Report the failure instead of silently writing an empty CSV.
            print(f"PubMed request failed: {err}")
            return

        # Passing the column list keeps the header even when there are no rows
        df = pd.DataFrame(rows, columns=columns)

        # Display the DataFrame preview
        print("\nSearch Results:")
        print(df.head(10))  # Text preview of the first 10 rows

        # Save the DataFrame as a CSV file with a timestamp and keywords
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'literature_{safe_keywords}_{timestamp}.csv'
        df.to_csv(filename, index=False)
        print(f"\nResults saved to '{filename}'")

        display(df.head(10))  # Rich preview of the first 10 rows in the notebook

# Search trigger: a plain-styled button with a magnifier icon whose click
# event is routed to on_button_clicked.
search_button = widgets.Button(
    icon='search',
    description='Search',
    tooltip='Click to search PubMed',
    button_style='',
    disabled=False,
)
search_button.on_click(on_button_clicked)

# Render the form top to bottom: keyword box, button, then the results area.
_form_widgets = (keywords_input, search_button, output)
display(*_form_widgets)
