In [None]:
# %load selenium_web_scraping.py
import time
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Configure Chrome to run headless (without opening a visible window).
options = Options()
options.add_argument("--headless")

# Reference point for the total running time printed once scraping ends.
startTime = datetime.now()
# First listing page to scrape; later pages are reached through pagination.
url = "https://research.chalmers.se/en/organization/?tab=publications&f_pubtype=67383db1-533a-4ec6-8c58-6922e711b5c2&f_year=2019"
# Main browser: walks through the listing pages. Publication pages are opened
# in a separate browser created by the scraping loop, so no other browser is
# started here (one opened at this point would never be used or closed and
# would leave a stray Chrome process behind).
driver_parent = webdriver.Chrome(options=options)
# Accumulates one dict per scraped publication.
results = []

# XPath of the links to the individual publications on a listing page.
PUBLICATION_LINKS_XPATH = '//*[@id="tab-pane-publications"]/div[2]/div[1]/div/div[*]/h4/a'
# XPath of the pagination entry clicked to advance to the next listing page.
# NOTE(review): the index 49 is hard-coded; confirm it still matches the pager.
NEXT_PAGE_XPATH = '//*[@id="tab-pane-publications"]/div[2]/div[1]/ul/li[49]/a'
# XPaths of the individual fields on a publication page.
ABSTRACT_XPATH = '//*[@id="publication-abstract"]'
TITLE_XPATH = '//*[@id="publication-title"]'
KEYWORDS_XPATH = '//*[@id="divPublicationPage"]/div[1]/div[1]/div[2]/*'
PUBLISHER_XPATH = '//*[@id="divPublicationPage"]/div[2]/div/div[2]/div'
AUTHORS_XPATH = '//*[@id="divPublicationPage"]/div[2]/div/div[1]/div/div/*'
# File the collected rows are pickled to (as a pandas DataFrame).
RESULT_PATH = "result_webscraping_selenium"
# Seconds to wait for the listing URL to change after clicking "next".
PAGE_CHANGE_TIMEOUT = 10
# Errors treated as "this field is not on the page": indexing an empty
# find_elements() result, or any Selenium lookup failure. Deliberately not a
# bare ``except`` so that Ctrl-C and programming errors still surface.
FIELD_ERRORS = (IndexError, WebDriverException)


def _first_text(browser, xpath):
    """Return the text of the first element matching *xpath*.

    Raises IndexError when nothing matches.
    """
    return browser.find_elements(By.XPATH, xpath)[0].text


def _chalmers_authors(browser):
    """Map each author name to the affiliations that mention Chalmers.

    An author can list several organisations; only those whose text contains
    "chalmers" (case-insensitive) are kept, and authors without any such
    affiliation are left out.
    """
    authors = {}
    for entry in browser.find_elements(By.XPATH, AUTHORS_XPATH):
        try:
            name = entry.find_element(By.TAG_NAME, 'h4').text
            affiliations = [
                paragraph.text
                for paragraph in entry.find_elements(By.TAG_NAME, 'p')
                if 'chalmers' in paragraph.text.lower()
            ]
        except WebDriverException:
            # Entry without an <h4> name (or otherwise unreadable): skip it.
            continue
        if affiliations:
            authors[name] = affiliations
    return authors


def _scrape_publication(browser, publication_url):
    """Load one publication page and return its fields as a dict.

    The dict has the keys 'abstract', 'title', 'keywords', 'publisher' and
    'authors'; a field that cannot be extracted is stored as None.
    """
    browser.get(publication_url)
    item = {}

    try:
        item['abstract'] = _first_text(browser, ABSTRACT_XPATH)
    except FIELD_ERRORS:
        item['abstract'] = None

    try:
        # Only the first line of the title element is kept.
        item['title'] = _first_text(browser, TITLE_XPATH).split('\n')[0]
    except FIELD_ERRORS:
        item['title'] = None

    try:
        keywords = browser.find_elements(By.XPATH, KEYWORDS_XPATH)
        # Stored as the string form of the list, as downstream data expects.
        item['keywords'] = str([keyword.text for keyword in keywords])
    except FIELD_ERRORS:
        item['keywords'] = None

    try:
        publisher_div = browser.find_elements(By.XPATH, PUBLISHER_XPATH)
        item['publisher'] = publisher_div[0].find_element(By.TAG_NAME, 'h4').text
    except FIELD_ERRORS:
        item['publisher'] = None

    try:
        item['authors'] = _chalmers_authors(browser)
    except FIELD_ERRORS:
        item['authors'] = None

    return item


# One browser is reused for every publication page; starting a fresh Chrome
# per publication is slow and leaks processes when a page load fails.
driver = webdriver.Chrome(options=options)
try:
    while True:
        print("working on the link of \n", url)
        driver_parent.get(url)
        # Read the hrefs up front so the loop below works on plain strings.
        publication_urls = [
            link.get_attribute('href')
            for link in driver_parent.find_elements(By.XPATH, PUBLICATION_LINKS_XPATH)
        ]
        for publication_url in publication_urls:
            if not publication_url:
                # Anchor without an href: nothing to open.
                continue
            results.append(_scrape_publication(driver, publication_url))

        # Checkpoint after every listing page so a later failure loses little.
        pd.DataFrame(results).to_pickle(RESULT_PATH)

        listing_url = driver_parent.current_url
        try:
            # Click "next page" and wait until the browser really moved on.
            driver_parent.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
            WebDriverWait(driver_parent, PAGE_CHANGE_TIMEOUT).until(
                EC.url_changes(listing_url)
            )
        except WebDriverException:
            # No next-page link, click failed, or the URL never changed
            # (TimeoutException): this was the last page. Without the wait,
            # a click that does not navigate would re-scrape this page forever.
            print("done in page: \n", driver_parent.current_url)
            break
        url = driver_parent.current_url
finally:
    # Always persist what was collected and shut both browsers down fully;
    # quit() also ends the chromedriver process, which close() does not.
    pd.DataFrame(results).to_pickle(RESULT_PATH)
    driver.quit()
    driver_parent.quit()

print("running time ", datetime.now() - startTime)
