In [5]:
# import modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import ethnicolr as ec
from pathlib import Path
import string
import sys
        
# dictionary to hold data
data = {
    'First Name' : [],
    'Last Name' : [],
    'Email' : []
}

# open browser and go to directory
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--disable-extensions')
driver = str(Path().resolve().parents[2]) + "\Tools\chromedriver.exe"
browser = webdriver.Chrome(executable_path=driver, options=chrome_options)
timeout = 60
browser.get('https://www.mcgill.ca/directory/staff')

# iterate through all queries
for q in string.ascii_lowercase:
    # find last name field
    lastname_field = browser.find_element_by_id('edit-last')
        
    # make next query
    lastname_field.clear()
    lastname_field.send_keys(q + Keys.ENTER)
    
    # wait for page to load
    try:
        element_present = EC.presence_of_element_located((By.ID, 'mcgill-directory-results'))
        WebDriverWait(browser, timeout).until(element_present)
    except TimeoutException:
        sys.exit("Timed out waiting for page to load")

    # iterate through each person
    for link in [prof.find_element_by_tag_name('a').get_attribute('href') for prof in browser.find_element_by_xpath('//*[@id="mcgill-directory-results"]/tbody').find_elements_by_tag_name('tr')]:
        # go to personal page link
        browser.get(link)
        
        # wait for page to load
        try:
            element_present = EC.presence_of_element_located((By.XPATH, '//*[@id="block-system-main"]/div/div'))
            WebDriverWait(browser, timeout).until(element_present)
        except TimeoutException:
            sys.exit("Timed out waiting for page to load")
            
        # obtain contact info
        hyperlink = browser.find_element_by_xpath('//*[@id="block-system-main"]/div/div').find_element_by_tag_name('a')
        if hyperlink == None:
            continue
        full_name = hyperlink.text.split()
        fname = full_name[0]
        lname = full_name[-1]
        email = hyperlink.get_attribute('href')[7:]
        
        # store contact info in data dictionary
        data['First Name'] += [fname]
        data['Last Name'] += [lname]
        data['Email'] += [email]

        # print contact info
        print(fname, lname, email)
        
    # go back to directory
    browser.get('https://www.mcgill.ca/directory/staff')
    
    # wait for page to load
    try:
        element_present = EC.presence_of_element_located((By.ID, 'edit-last'))
        WebDriverWait(browser, timeout).until(element_present)
    except TimeoutException:
        sys.exit("Timed out waiting for page to load")

# close browser
browser.quit()

# create pandas dataframe and remove any duplicate entries + fix index
df = pd.DataFrame(data)
df = df.drop_duplicates().reset_index(drop=True)
    
# determine ethnicity
df = ec.pred_wiki_name(df,list(df)[1],list(df)[0])
    
# print dataframe and write data to csv file
print(df)
df.to_csv("mcgill parsed.csv")

Armand Aalamian ARMAND.AALAMIAN@MCGILL.CA
New search /www.mcgill.ca/directory/
Helen Aaron HELEN.AARON@MCGILL.CA
Kevork Abadjian KEVORK.ABADJIAN@MCGILL.CA
Nancy Abate IRBSEC.MED@MCGILL.CA
Elena Abbandonato ELANA.ABBANDONATO@MCGILL.CA
Colin Abbott COLIN.ABBOTT@MCGILL.CA
Frances Abbott FABBOTT@PSYCH.MCGILL.CA
Jennifer Abbott JENNIFER.ABBOTT@MCGILL.CA
Hala Abdallah ACADADMIN1.MED@MCGILL.CA
Wamied Abdel-Rahman WABDEL@MEDPHYS.MCGILL.CA
Wynne Abdon WYNNE.ABDON@MCGILL.CA
Monica Abdou MONICA.ABDOU@MCGILL.CA
Masud Abdullah MD.AL-MASUD@MCGILL.CA
David Aberbach DAVID.ABERBACH@MCGILL.CA
Carole Farah CAROLE.ABIFARAH@MCGILL.CA
Antoine Abi-Kheres ANTOINE.ABI-KHERES@MCGILL.CA
Samer Abi-Nader SAMER.ABI-NADER@MCGILL.CA
Malek Abisaab MALEK.ABISAAB@MCGILL.CA
Rula Abisaab RULA.ABISAAB@MCGILL.CA
Sharon Abish SHARON.ABISH@MUHC.MCGILL.CA
Arash Abizadeh ARASH.ABIZADEH@MCGILL.CA
Mark Abley MARK.ABLEY@MCGILL.CA
Deborah Abner DEBORAH.ABNER@MCGILL.CA
Tania Younes TANIA.ABOUYOUNES@MCGILL.CA
Frances Aboud FRANCES.AB