In [40]:
import re
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver as wd
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager


In [41]:
def searchNormal(driver, keyword:str):
    """Submit a basic search from the page currently loaded in `driver`.

    Locates the `input#id_term` box, focuses it with a click, types
    `keyword` and submits by sending ENTER.

    Args:
        driver: Selenium WebDriver already showing a page that contains the
            `input#id_term` search box.
        keyword: Query text to type into the box.
    """
    term_box = driver.find_element(By.CSS_SELECTOR, "input#id_term")
    term_box.click()
    # Type the query first, then ENTER to submit it.
    for keys in (keyword, Keys.ENTER):
        term_box.send_keys(keys)

def searchAdvance(driver, keyword:str):
    """Submit a search through the advanced-search form reachable from the current page.

    Steps, in order: follow the `a.search-input-link` link, open the
    `select#field-selector` drop-down and click its 39th option, type
    `keyword` into `input#id_term` followed by ENTER, then click the form's
    search button.

    Uses `driver.find_element(By.CSS_SELECTOR, ...)`; the older
    `find_element_by_css_selector` helpers were removed in Selenium 4.3 and
    raise AttributeError there.

    Args:
        driver: Selenium WebDriver showing a page that contains the
            `a.search-input-link` link.
        keyword: Query text to type into the term box.
    """
    # Go to the advanced-search page.
    to_advance = driver.find_element(By.CSS_SELECTOR, 'a.search-input-link')
    to_advance.click()

    # Restrict the search field to title and abstract.
    field_selector = driver.find_element(By.CSS_SELECTOR, 'select#field-selector')
    field_selector.click()
    # NOTE(review): the option is chosen by position (39th child); presumably
    # this is the title/abstract entry -- confirm it still matches the live page.
    field_option = driver.find_element(By.CSS_SELECTOR, '#field-selector > option:nth-child(39)')
    field_option.click()

    # Input keywords.
    search = driver.find_element(By.CSS_SELECTOR, 'input#id_term')
    search.click()
    search.send_keys(keyword)
    search.send_keys(Keys.ENTER)

    # Run the built query.
    search_btn = driver.find_element(
        By.CSS_SELECTOR,
        '#search-form > div > div > div.query-box-section-wrapper > div.button-wrapper > button',
    )
    search_btn.click()

def checkBlank(list: list):
    """Return the `.text` of the first element of `list`, or "" when it is empty.

    The previous version returned `list.text`, which raises AttributeError
    for any non-empty list because the attribute lives on the elements, not
    on the list itself.

    Args:
        list: Sequence of objects exposing a `.text` attribute. The parameter
            name shadows the builtin but is kept so keyword callers still work.

    Returns:
        The first element's `.text`, or an empty string for an empty sequence.
    """
    return list[0].text if len(list) > 0 else ""

def post_processing_text(text):
    """Strip surrounding whitespace and delete every newline and tab from `text`.

    Args:
        text: String to clean, or None.

    Returns:
        The cleaned string, or None when `text` is None.
    """
    if text is None:
        return None
    cleaned = text.strip()
    # Remove line breaks and tabs left inside the text.
    for unwanted in ('\n', '\t'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

In [74]:
def pubmed_crawling(keyword, advance = False):
    """Search PubMed for `keyword`, scrape every result page and save the rows to CSV.

    Workflow:
      1. Open https://pubmed.ncbi.nlm.nih.gov in Chrome and submit the query
         with `searchAdvance` (when `advance` is true) or `searchNormal`.
      2. Click the "show more" button until it can no longer be clicked, so
         all results are present in one page source.
      3. Collect the article links, load each one in the same browser and
         keep its HTML.
      4. Parse title, first author, journal, year, DOI and abstract from each
         page and write them to `Pubmed/Pubmed_<keyword>.csv`.

    Changes from the earlier version: one browser is reused for all pages and
    always closed; only Selenium errors end the pagination loop; a page with
    no title yields "" instead of raising IndexError; the author column is
    "<name> et al." (with a space) or "" when no author was found; the output
    directory is created with `Path(...).mkdir`; the DataFrame is returned.

    Args:
        keyword: Query string typed into the search box; also used in the
            output file name.
        advance: Use the advanced-search form when true.

    Returns:
        pandas.DataFrame with columns title, author, journal, year, doi and
        abstract (one row per article); the same data is written to the CSV.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    selector_show_more_btn = "#search-results > section > div.search-results-paginator.next-results-paginator.has-nav > button"
    selector_result_link = "div.search-results-chunk > article.full-docsum > div.docsum-wrap > div.docsum-content > a.docsum-title"

    driver = wd.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        driver.get(base_url)

        # Keywords input
        if advance:
            searchAdvance(driver, keyword)
        else:
            searchNormal(driver, keyword)

        # Page to the end: keep expanding the result list until the button
        # stops being clickable (timeout) or the click itself fails.
        while True:
            try:
                show_more_btn = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector_show_more_btn))
                )
                show_more_btn.click()
            except WebDriverException:
                # TimeoutException and click errors are subclasses; anything
                # else (e.g. KeyboardInterrupt) is allowed to propagate.
                print("The last page")
                break

        # Collect URLs
        soup = BeautifulSoup(driver.page_source, 'lxml')
        urls = [base_url + a['href'] for a in soup.select(selector_result_link)]

        # Crawling HTML sources with the browser that is already open.
        html_sources = []
        for article_url in tqdm(urls, desc="Processing HTML sources"):
            driver.get(article_url)
            html_sources.append(driver.page_source)
        print("Crawling source finishhed!")
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()

    # Crawling abstracts
    crawling_result_list = [
        _parse_pubmed_article(source)
        for source in tqdm(html_sources, desc="Processing abstracts")
    ]

    # Save to CSV
    df = pd.DataFrame(crawling_result_list)
    output_dir = Path("Pubmed")
    output_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_dir / f"Pubmed_{keyword}.csv")
    print("Job finished!")
    return df


def _first_match_text(soup, selector):
    """Return the cleaned text of the first node matching `selector`, or "" when none match."""
    matches = soup.select(selector)
    return post_processing_text(matches[0].text) if len(matches) != 0 else ""


def _parse_pubmed_article(html_source):
    """Build one result row (dict) from the HTML of a single article page."""
    soup = BeautifulSoup(html_source, 'lxml')
    first_author = _first_match_text(
        soup,
        "div.inline-authors > .authors > .authors-list >  span.authors-list-item > a.full-name",
    )
    return {
        "title": _first_match_text(soup, "h1.heading-title"),
        # Only the first listed author is kept; the suffix is skipped when
        # the page shows no author at all.
        "author": f"{first_author} et al." if first_author else "",
        "journal": _first_match_text(soup, "div.article-citation > div > div > button"),
        "year": _first_match_text(soup, "div.article-citation > div.article-source > span.cit"),
        # NOTE(review): the DOI is read from the third identifier item by
        # position; presumably empty or wrong when the list order differs.
        "doi": _first_match_text(soup, "#full-view-identifiers > li:nth-child(3) > span > a.id-link"),
        "abstract": _first_match_text(soup, "div.abstract-content"),
    }

In [75]:
# Example run: basic (non-advanced) search for the boolean query below.
# `result` holds whatever pubmed_crawling returns; the scraped rows are
# written to Pubmed/Pubmed_<keyword>.csv by the function itself.
keyword = "AKI AND sepsis AND bundle"
result = pubmed_crawling(keyword, advance=False)

The last page


Processing HTML sources: 100%|██████████| 16/16 [01:16<00:00,  4.76s/it]


Crawling source finishhed!


Processing abstracts: 100%|██████████| 16/16 [00:01<00:00, 12.93it/s]

Job finished!



