In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import time
from selenium.webdriver.support import expected_conditions as EC
from creds import *
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import os
import numpy as np

In [2]:
def wait_and_click_element(selector, by=By.CSS_SELECTOR, sleep_for=3):
    try:
        time.sleep(sleep_for)
        element = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((by, selector))
        )
        element.click()

    except Exception as e:
        print("Error: ", e)

def get_element(selector, by=By.ID, sleep_for=3):
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((by, selector))
        )
        time.sleep(sleep_for)
        return element

    except Exception as e:
        print("Error: ", e)

def get_all_elements(selector, by=By.ID, sleep_for=3):
    try:
        elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((by, selector))
        )
        time.sleep(sleep_for)
        return elements

    except Exception as e:
        print("Error: ", e)

def set_value_to_element(element, value, sleep_for=3):
    element.clear()
    time.sleep(sleep_for)
    element.clear()
    try:
        element.send_keys(value)
    except Exception as e:
        print("Error: ", e)

def let_the_page_load():
    while driver.execute_script("return document.readyState;") != "complete":
        time.sleep(1)

def wait_for_a_bit(value=5):
    time.sleep(value)

In [3]:
# start chrome driver and go to lexis nexis site on ubc website
driver = webdriver.Chrome()
driver.get('https://resources.library.ubc.ca/page.php?id=2472')

In [4]:
# hit button to go to cwl
let_the_page_load()
wait_and_click_element("a.green.medium.button")

In [5]:
# set username and password for cwl
let_the_page_load()
username = get_element("username", sleep_for=1)
password = get_element("password", sleep_for=1)
set_value_to_element(username, uname, sleep_for=1)
set_value_to_element(password, pwd, sleep_for=1)
wait_and_click_element("_eventId_proceed", By.NAME)

In [6]:
# weirdly, i have to reload the page for it to work
wait_for_a_bit(20)
let_the_page_load()
driver.refresh()

## Search keyword and date range modification

In [7]:
search = "community"

In [8]:
let_the_page_load()
search_keyword = get_element("lng-expanding-textarea[contenteditable='true']", By.CSS_SELECTOR, sleep_for=0)
set_value_to_element(search_keyword, search, sleep_for=0)
wait_and_click_element('//button[@aria-label="Search"]', By.XPATH)

## Setting the filters

In [9]:
# clearing the filters
let_the_page_load()
wait_and_click_element("button.clear-filters[data-action='clear']", By.CSS_SELECTOR)

In [10]:
new_min_date = "01/01/1960"
new_max_date = "12/31/1980"
wait_for_a_bit()

In [11]:
timeline_element = get_element("podfiltersbuttondatestr-news", By.ID)
timeline_element.click()
wait_for_a_bit()

In [12]:
timeline_element = get_element("podfiltersbuttondatestr-news", By.ID)
timeline_element.send_keys(Keys.TAB, Keys.TAB, Keys.TAB, new_min_date, Keys.TAB, Keys.TAB, new_max_date)
ok_button = wait_and_click_element(".//button[contains(@class, 'save')]", By.XPATH)
wait_for_a_bit()

---

In [13]:
wait_and_click_element("podfiltersbuttonpublicationtype", By.ID)
wait_for_a_bit()
wait_and_click_element(selector='ul[data-id="publicationtype"] li.sel-multi button[data-action="selmulti"]', by=By.CSS_SELECTOR)

In [13]:
wait_and_click_element("#podfiltersbuttonen-geography-news")
wait_for_a_bit()
wait_and_click_element("label[for='_tmnyk_pf51']")
wait_for_a_bit(10)

In [14]:
wait_and_click_element("#podfiltersbuttonlanguage")
wait_for_a_bit()
wait_and_click_element("label[for='_tmnyk_pf52']", By.CSS_SELECTOR)
wait_for_a_bit(13)

In [15]:
wait_and_click_element("#podfiltersbuttonsource")
wait_for_a_bit()
wait_and_click_element(selector='ul[data-id="source"] li.sel-multi button[data-action="selmulti"]', by=By.CSS_SELECTOR)

In [16]:
source_names_to_select = ["New York Times", "Guardian"]

let_the_page_load()

for source_name in source_names_to_select:
    try:
        xpath = "//li[label[contains(.,'" + source_name + "')]]/label/input[@type='checkbox']"
        wait_and_click_element(xpath, By.XPATH)
    except Exception as e:
        print(f"Error with {source_name}: ", e)

wait_and_click_element(selector=".button.primary.saveaddmultifilter", by=By.CSS_SELECTOR)
wait_for_a_bit(10)

## Extract articles on each page

In [13]:
index = 0
page = 358
csv_file_path = '../data/data_1960_to_1980.csv'

In [None]:
while True:
    article_xpaths = [f'/html/body/div[1]/div/main/div[2]/div/div[2]/div[2]/form/div[2]/ol/li[{i}]/div/h2/a' for i in range(0, 21, 2)]
    
    for xpath in article_xpaths:
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath))).click()
        except Exception as e:
            index += 1
            print(f"page: {page}, article: {index}")
            print('cannot click this path')
            # print(xpath)
            continue
        try:   
            title = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "SS_DocumentTitle"))
            ).text
            doc_info = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "SS_DocumentInfo"))
            )
            source, date = doc_info[0].text, doc_info[1].text
            body_elements = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "SS_LeftAlign"))
            )
            
            body = ''.join([p.text for p in body_elements.find_elements(By.CSS_SELECTOR, "p")[1:]])
            data = pd.DataFrame([[title, source, date, body]], columns=['Title', 'Source', 'Date', 'Body'])
            include_header = not os.path.exists(csv_file_path)
            data.to_csv(csv_file_path, mode='a', header=include_header, index=False)
            
            index += 1
            print(f"page: {page}, article: {index}")
            # print(xpath)
            print(date, title)
            
            driver.back()  
        except Exception as e:
            index += 1
            print(f"page: {page}, article: {index}")
            print("Error with xpath")    

            driver.back()
    
    try:
        next_page_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next']"))
        )
        next_page_button.click()
        index = 0
        page += 1
    except Exception as e:
        print("No more pages or encountered an error navigating to the next page:", e)
        break

page: 358, article: 1
cannot click this path
page: 358, article: 2
September 13, 1980, Saturday, Late City Final Edition SOUTHERN PACIFIC MERGER WITH SANTE FE CALLED OFF
page: 358, article: 3
August 31, 1980, Sunday, Late City Final Edition G.O.P. LEADERS ARE EMBARRASED BY A RACIST NOMINEE IN MICHIGAN
page: 358, article: 4
August 23, 1980, Saturday, Late City Final Edition GOING OUT GUIDE
page: 358, article: 5
June 27, 1980, Friday, Late City Final Edition NEWPORT JAZZ SWINGS INTO TOWN
page: 358, article: 6
July 30, 1980, Wednesday, Late City Final Edition MUST BUDGET AX FALL ON CHILDREN?
page: 358, article: 7
July 8, 1980, Tuesday, Late City Final Edition N.A.A.C.P. STRUGGLING TO REGAIN STATURE;
News Analysis
page: 358, article: 8
June 30, 1980, Monday, Late City Final Edition I.C.C. PLANS TRUCK RULES BY AUTUMN
page: 358, article: 9
June 29, 1980, Sunday, Late City Final Edition The Coal Goal: How?
page: 358, article: 10
June 30, 1980, Monday, Late City Final Edition SENATOR HOLLINGS 

In [18]:
# driver.quit()