In [1]:
# import modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import ethnicolr as ec
from pathlib import Path
import string
import sys
from time import sleep

# Job-title keywords, each submitted on its own to the directory's title
# search. Every keyword is listed exactly once: a repeated keyword makes the
# scraper walk the same result pages a second time only for those rows to be
# thrown away when duplicates are dropped.
test = [
    "Faculty",
    "Professor",
    "Lecturer",
    "Instructor",
    "Fellow",
    "Research",
    "Scientist",
    "Supervisor",
    "Assistant",
    "Secretary",
    "Officer",
    "Dean",
    "Director",
    "Chair",
    "Coordinator",
    "President",
]
        
# scraped contact details, stored column-wise: each key is a column name and
# maps to its own (initially empty) list of values
data = {column: [] for column in ('First Name', 'Last Name', 'Email')}

# configure a headless Chrome session
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-extensions')

# chromedriver lives in the "Tools" folder of the directory three levels above
# the current working directory. The path is assembled with pathlib rather
# than a backslash-laden string literal, whose "\T" and "\c" are invalid
# escape sequences that newer Python versions warn about.
driver = str(Path().resolve().parents[2] / "Tools" / "chromedriver.exe")
browser = webdriver.Chrome(executable_path=driver, options=chrome_options)

# maximum number of seconds to wait for a page element to appear
timeout = 60

# open the directory search page
browser.get('https://atlas.cookie.uit.yorku.ca/atlas/servlet/atlas')

# XPaths of the page elements the scraper relies on
SEARCH_BUTTON_XPATH = '//*[@id="SearchForm"]/span/table/tbody/tr[8]/td[3]/input[1]'
RESET_BUTTON_XPATH = '//*[@id="SearchForm"]/span/table/tbody/tr[8]/td[3]/input[2]'
RESULT_COUNT_XPATH = '/html/body/table[2]/tbody/tr/td/table/tbody/tr[3]/td/span'
RESULT_LINK_XPATH = '/html/body/table[2]/tbody/tr/td/table/tbody/tr[%d]/td[1]/span/b/a'
CONTACT_NAME_XPATH = '/html/body/table[2]/tbody/tr/td/table/tbody/tr[1]/td[2]/p'
CONTACT_EMAIL_XPATH = '/html/body/table[2]/tbody/tr/td/table/tbody/tr[3]/td[2]/span/b/a'


def _wait_for(xpath):
    """Wait until the element at *xpath* is present, then pause one second.

    If the element does not appear within ``timeout`` seconds the browser is
    shut down (so no headless Chrome process is left behind) and the script
    exits with an error message.
    """
    try:
        WebDriverWait(browser, timeout).until(
            EC.presence_of_element_located((By.XPATH, xpath)))
    except TimeoutException:
        browser.quit()
        sys.exit("Timed out waiting for page to load")
    sleep(1)


def _parse_name(raw_name):
    """Return ``(first_name, last_name)`` taken from a contact-page name line.

    A leave marker ("(on leave)" / "(on extended leave)") and a leading
    "Dr." / "Dr" title are removed before the first and last words are picked.
    Two empty strings are returned when nothing is left.
    """
    tokens = raw_name.split()
    # the marker is assumed to be the last words of the line, so drop as many
    # trailing words as the marker itself contains
    for marker in ("(on leave)", "(on extended leave)"):
        if marker in ' '.join(tokens).lower():
            del tokens[-len(marker.split()):]
    # compare the whole first word: once the line is split, a word can never
    # contain the trailing space that a "Dr. " substring test would look for
    if len(tokens) > 1 and tokens[0] in ("Dr.", "Dr"):
        del tokens[0]
    if not tokens:
        return "", ""
    return tokens[0], tokens[-1]


def _parse_email(raw_email):
    """Return the first address in *raw_email*, or "No Email" if it is blank."""
    parts = raw_email.split()
    if not parts:
        return "No Email"
    address = parts[0]
    # several addresses may be listed; drop the comma that follows the first
    if address.endswith(","):
        address = address[:-1]
    return address


# run one title search per query and visit every person it returns
for q in test:
    # locate the form controls on the freshly loaded search page
    title_field = browser.find_element(By.ID, 'SearchTitle')
    search_button = browser.find_element(By.XPATH, SEARCH_BUTTON_XPATH)
    reset_button = browser.find_element(By.XPATH, RESET_BUTTON_XPATH)

    # clear the form, enter the query and submit it
    reset_button.click()
    title_field.send_keys(q)
    search_button.click()
    _wait_for(RESULT_COUNT_XPATH)

    # check number of results
    results = browser.find_element(By.XPATH, RESULT_COUNT_XPATH).text

    if results != "0 results found":
        # table row of the first person link in the results table
        counter = 4
        while True:
            # a missing row means every person on the page has been visited
            try:
                prof = browser.find_element(By.XPATH, RESULT_LINK_XPATH % counter)
            except NoSuchElementException:
                break

            # open the person's contact page
            prof.click()
            _wait_for(CONTACT_NAME_XPATH)

            # obtain contact info and clean it
            fname, lname = _parse_name(
                browser.find_element(By.XPATH, CONTACT_NAME_XPATH).text)
            try:
                email = _parse_email(
                    browser.find_element(By.XPATH, CONTACT_EMAIL_XPATH).text)
            except NoSuchElementException:
                email = "No Email"

            # store contact info in data dictionary
            data['First Name'].append(fname)
            data['Last Name'].append(lname)
            data['Email'].append(email)

            # print contact info
            print(fname, lname, email)

            # return to the results page and move on to the next row
            browser.execute_script("window.history.go(-1)")
            counter += 1
            _wait_for(RESULT_COUNT_XPATH)

    # go back to the search form for the next query
    browser.execute_script("window.history.go(-1)")
    _wait_for(SEARCH_BUTTON_XPATH)

# scraping is finished: shut the browser down
browser.quit()

# turn the collected columns into a dataframe, keeping one copy of each row
df = pd.DataFrame(data).drop_duplicates()

# determine ethnicity from the name columns, addressed by position
# (column 1 is passed first, then column 0)
df = ec.pred_wiki_name(df, df.columns[1], df.columns[0])

# show the resulting table and save it as a csv file
print(df)
df.to_csv("york parsed.csv")

Using TensorFlow backend.


Chantal Abouchar chantala@yorku.ca
Barbara Ackerman barbara1@yorku.ca
Karin Adlhoch kadlhoch@yorku.ca
R. Albright albright@yorku.ca
Joan Allen jallen@yorku.ca
Julie Allen allenj@yorku.ca
Oswald Almasi oalmasi@yorku.ca
Motti Anafi moanafi@yorku.ca
Gordon Anderson ganderso@yorku.ca
Gordon Anderson ganderso@yorku.ca
Richard Anderson anderson@yorku.ca
Themistoklis Aravossitas travoss@yorku.ca
Julian Arend julianm@yorku.ca
Sharon Armstrong sarm@yorku.ca
Luke Arnason arnason@yorku.ca
Alireza Asgharzadeh alirezaa@yorku.ca
Nick Ashby ashby@yorku.ca
Kirk Atkinson kirka@yorku.ca
Ehud Avitzur eavitzur@yorku.ca
Dan Azoulay dazoulay@yorku.ca
Laura Ball lcb@yorku.ca
Raluca Barac barac@yorku.ca
Mary Barbieri mbarbieri@osgoode.yorku.ca
Vassilios Bardis vbardis@yorku.ca
Annmarie Barnes barnesa@yorku.ca
Elena Basile ebasile@yorku.ca
Charles Battershill cbatters@yorku.ca
Jennifer Bazar jlbazar@yorku.ca
William Beauvais beauvais@yorku.ca
Frances Beer fran@yorku.ca
John Bell johnbell@yorku.ca
Gillian Beres