In [6]:
import re
import time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver as wd
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
def searchKeywords(driver, selector, keyword, timeout=10):
    """Type ``keyword`` into the search box matched by ``selector`` and submit it.

    Args:
        driver: Selenium WebDriver positioned on the page holding the search box.
        selector: CSS selector of the search input element.
        keyword: Text to type into the input.
        timeout: Maximum number of seconds to wait for the input (default 10).

    Raises:
        Whatever ``WebDriverWait.until`` raises when the element does not
        become clickable within ``timeout`` seconds.
    """
    # Wait until the input is clickable, not merely present in the DOM:
    # the very next step clicks it, which fails on a hidden/disabled element.
    search = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
    )
    search.click()
    search.send_keys(keyword)
    # RETURN submits the search form.
    search.send_keys(Keys.RETURN)
    
def getTitle(source, css_title, css_title2):
    """Parse ``source`` with the lxml parser and run two CSS queries on it.

    Args:
        source: HTML markup to parse.
        css_title: First CSS selector.
        css_title2: Second CSS selector.

    Returns:
        A 2-tuple ``(matches for css_title, matches for css_title2)``.
    """
    # NOTE(review): the selectors presumably target Korean and English titles
    # respectively -- confirm against callers.
    parsed = BeautifulSoup(source, 'lxml')
    first_matches = parsed.select(css_title)
    second_matches = parsed.select(css_title2)
    return first_matches, second_matches

def checkBlank(list: list):
    """Return the text of a selection result, or ``""`` when there is none.

    Args:
        list: A sequence of elements exposing ``.text``, or a single such
            element. ``None`` is treated as empty. The parameter name shadows
            the builtin ``list``; it is kept so existing keyword callers
            continue to work.

    Returns:
        ``""`` for ``None`` or an empty input; the object's own ``.text`` when
        it has one; otherwise the ``.text`` of its first element.
    """
    if list is None or len(list) == 0:
        return ""
    # A single element carries .text itself; a plain sequence does not,
    # and asking it for .text is what used to raise AttributeError.
    own_text = getattr(list, "text", None)
    if own_text is not None:
        return own_text
    return list[0].text

def post_processing_text(text):
    """Strip every newline and tab character from ``text``.

    ``None`` is passed through unchanged.
    """
    if text is None:
        return text
    cleaned = text
    for unwanted in ('\n', '\t'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

In [13]:
def _new_chrome_driver(driver_dir=None):
    """Start Chrome with the chromedriver at ``driver_dir``, or one installed by webdriver_manager when it is None."""
    driver_path = ChromeDriverManager().install() if driver_dir is None else driver_dir
    return wd.Chrome(service=Service(driver_path))


def _collect_result_urls(driver):
    """Walk the pager of the currently displayed search result and return every article URL found."""
    selector_title = "div.ch_pan > div.ch_co > ul > li:nth-child(1) > a"
    selector_pages = "tr.gridePaging > td > table > tbody > tr > td"
    num = 1  # 1-based position of the current page's cell inside the pager row
    urls_list = []
    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        # Resolve each href against the page it was found on, as a browser would.
        urls_list.extend(urljoin(driver.current_url, a['href'])
                         for a in soup.select(selector_title))

        pages = driver.find_elements(By.CSS_SELECTOR, selector_pages)
        # No pager at all (e.g. a single page of results) or the cursor ran
        # past the pager: nothing further to visit.
        if num > len(pages):
            break
        current_label = pages[num - 1].text
        on_last_page = current_label != "..." and current_label == pages[-1].text
        if on_last_page or num == len(pages):
            break

        next_cell = pages[num]
        # Read the label before clicking; the element may be replaced afterwards.
        jumps_to_next_block = next_cell.text == "..."
        # NOTE(review): there is no explicit wait after the click, so the next
        # iteration relies on the click blocking until the new page is loaded.
        next_cell.click()
        # After a "..." jump the cursor is reset to 3, as in the original
        # pagination logic; otherwise it advances by one cell.
        num = 3 if jumps_to_next_block else num + 1
    return urls_list


def _parse_abstract_page(source):
    """Extract title/author/journal/doi/abstract records from one article page's HTML."""
    selector_title = "div#abstractview > div.hd-inner > h2.hd-heading"
    selector_author = "div#abstractview > div > span#ctl14_ctl01_ctl00_authinfo"
    selector_journal = "div#abstractview > div > span.fontview"
    selector_doi = "div#abstractview > div.fontlink > span.fontlink > a"
    selector_abstract = "div#abstractview > div > span#ctl14_ctl01_ctl00_abstract_eng"
    soup = BeautifulSoup(source, 'lxml')

    columns = [soup.select(selector) for selector in
               (selector_title, selector_author, selector_journal, selector_doi, selector_abstract)]

    records = []
    # zip() stops at the shortest column, so a page missing any one of the
    # five fields contributes no record.
    for title, author, journal, doi, abstract in zip(*columns):
        records.append({"title": post_processing_text(title.text),
                        "author": post_processing_text(author.text),
                        "journal": post_processing_text(journal.text),
                        "doi": post_processing_text(doi.text),
                        # The abstract text is stored as-is (newlines/tabs kept).
                        "abstract": abstract.text})
    return records


def kmbase_crawling(keyword, driver_dir = None):
    """Search KMbase for ``keyword`` and save the matching articles' metadata to CSV.

    Opens https://kmbase.medric.or.kr in Chrome, submits ``keyword``, collects
    the article links from every result page, downloads each article page,
    extracts title, author, journal, DOI and abstract, and writes them to
    ``kmbase/kmbase_{keyword}.csv`` (the directory is created if missing).

    Args:
        keyword: Search term typed into the site's search box.
        driver_dir: Path to a chromedriver executable. When None, the driver
            is installed through ``ChromeDriverManager``.

    Returns:
        None. The result is written to disk.
    """
    keyword_selector = "input#ctl13_KW"
    # One browser serves both the listing and every article page, and is
    # always closed -- even on an error or a manual interrupt.
    driver = _new_chrome_driver(driver_dir)
    try:
        driver.get('https://kmbase.medric.or.kr')
        searchKeywords(driver, keyword_selector, keyword)

        # Crawling urls
        urls_list = _collect_result_urls(driver)
        print("Crawling urls finished!")

        # Crawling HTML sources
        html_sources = []
        for url in tqdm(urls_list, desc="Processing items"):
            driver.get(url)
            html_sources.append(driver.page_source)
        print("Crawling source finishhed!")
    finally:
        driver.quit()

    # Crawling abstract info
    crawling_result_list = []
    for source in tqdm(html_sources):
        crawling_result_list.extend(_parse_abstract_page(source))

    df = pd.DataFrame(crawling_result_list)
    output_dir = Path("kmbase")
    # exist_ok makes re-runs safe; no separate existence check is needed.
    output_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_dir / f"kmbase_{keyword}.csv")
    print("Job finished!")

In [14]:
keyword = "Pharmacist"
# Use the locally downloaded chromedriver when it exists on this machine;
# otherwise pass None so kmbase_crawling obtains one via webdriver_manager.
driver_dir = '/Users/jyh/Downloads/chromedriver-mac-arm64/chromedriver'
if not Path(driver_dir).exists():
    driver_dir = None
kmbase_crawling(keyword=keyword,  driver_dir = driver_dir)

KeyboardInterrupt: 