In [1]:
# import modules
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
import ethnicolr as ec
from time import sleep
from pathlib import Path

# open browser and go to directory
# geckodriver is looked up in a "Tools" folder three levels above the resolved
# working directory. The path is joined with pathlib instead of a "\Tools\..."
# string literal, whose "\T" and "\g" are invalid escape sequences.
driver = str(Path().resolve().parents[2] / "Tools" / "geckodriver.exe")
browser = webdriver.Firefox(executable_path=driver)

try:
    browser.get('https://www.ryerson.ca/contact/')

    # find department selection box and submit button
    page = browser.find_element_by_xpath('//*[@id="mainContent"]/div[3]/div[2]/div/div[1]/div/div[5]').find_element_by_tag_name("div")
    submit_button = page.find_element_by_tag_name("button")
    departments_select = Select(page.find_element_by_tag_name("select"))

    # visible text of every <option>; captured once up front because the
    # scraping loop re-locates the form elements on each iteration
    departments_labels = [o.text for o in departments_select.options]
except BaseException:
    # do not leave a Firefox/geckodriver process behind when setup fails
    browser.quit()
    raise

# column-oriented store for the scraped records: one independent list per column
data = {column: [] for column in ('First Name', 'Last Name', 'Email')}

# scrape all departments
# XPath of the container that holds the department search form (written once
# here instead of being repeated inside the loop)
search_form_xpath = '//*[@id="mainContent"]/div[3]/div[2]/div/div[1]/div/div[5]'

try:
    for department_label in departments_labels:
        # refresh web elements: the browser leaves the directory page and
        # comes back on every iteration, so the form is located again each time
        page = browser.find_element_by_xpath(search_form_xpath).find_element_by_tag_name("div")
        submit_button = page.find_element_by_tag_name("button")
        departments_select = Select(page.find_element_by_tag_name("select"))

        # select next department and click submit
        departments_select.select_by_visible_text(department_label)
        submit_button.click()

        # pause scraping for 10 seconds (standard)
        sleep(10)

        # use BeautifulSoup to parse raw code
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # find results; soup.find returns None when an element is missing, so
        # guard both lookups instead of crashing on a department with no results
        results_div = soup.find("div", id="peopleResults")
        profs = None
        if results_div is not None:
            profs = results_div.find("ul", class_="searchresults")

        if profs is None:
            print("No results found for department:", department_label)
            people = []
        else:
            people = profs.find_all("li")

        # iterate through each person
        for prof in people:
            # obtain contact info
            name_tag = prof.find("h3", class_="search_p_name")
            full_name = name_tag.get_text().split() if name_tag is not None else []
            if len(full_name) < 2:
                # cannot derive both a first and a last name from this entry
                print("Skipping entry with unparseable name:", " ".join(full_name))
                continue

            # first token is the last name with its trailing character
            # removed; presumably a comma ("Last, First") -- TODO confirm
            lname = full_name[0][:-1].title()
            fname = full_name[1].title()
            # skip an honorific, but only when a token actually follows it
            if fname in ("Dr.", "Dr") and len(full_name) > 2:
                fname = full_name[2].title()

            email_div = prof.find("div", class_="search_p_email")
            email_link = email_div.find("a") if email_div is not None else None
            if email_link is None:
                email = "No Email"
            else:
                email = email_link.get_text()

            # store contact info in data dictionary
            data['First Name'].append(fname)
            data['Last Name'].append(lname)
            data['Email'].append(email)

            # print contact info
            print(fname, lname, email)

        # go back to directory page (two history entries back)
        # NOTE(review): assumes each search adds exactly two history entries,
        # including when no results are shown -- confirm
        browser.execute_script("window.history.go(-2)")

        # pause scraping for 10 seconds (standard)
        sleep(10)
except BaseException:
    # do not leave a Firefox/geckodriver process behind when scraping fails
    browser.quit()
    raise
        
# build the dataframe from the scraped columns and drop exact duplicate rows
df = pd.DataFrame(data).drop_duplicates()

# determine ethnicity
# the first two columns of the frame are the first-name and last-name columns;
# pred_wiki_name is given the last-name column first, then the first-name column
first_name_col, last_name_col = list(df)[:2]
df = ec.pred_wiki_name(df, last_name_col, first_name_col)

# show the resulting dataframe, then persist it as csv
print(df)
df.to_csv("ryerson parsed.csv")

# shut the browser session down
browser.quit()

Using TensorFlow backend.


Cyndy Baskin cbaskin@ryerson.ca
Cheryl Trudeau cktrudeau@ryerson.ca
Monica Mckay mmckay@ryerson.ca
Enquiries General ssaikkon@ryerson.ca
Thunder Alphonse thunderalphonse@ryerson.ca
Samantha Mandamin smandamin@ryerson.ca
Brian Norton bnorton@ryerson.ca
Sheila Saikkonen ssaikkon@ryerson.ca
Diane Simone dsimone@ryerson.ca
Enquiries General aasadmin@ryerson.ca
Kate Cressman kate.cressman@ryerson.ca
Brenda Ferguson brenda.ferguson@ryerson.ca
Stefanie Ferrante stefanie.ferrante@ryerson.ca
Danielle Kandel-Lieberman dkandellieberman@ryerson.ca
Gafira Kassam gkassam@ryerson.ca
Vanessa Leblond vanessa.leblond@ryerson.ca
Amanda Masterton amanda.masterton@ryerson.ca
Mandy Sandhu m6sandhu@ryerson.ca
Leslie Simpson leslie.simpson@ryerson.ca
Karen Stevenson karen.stevenson@ryerson.ca
Maria Taylor maria.taylor@ryerson.ca
Sydney Tran sydney.tyber@ryerson.ca
John Woodley john.woodley@ryerson.ca
Enquiries General gripe@ryerson.ca
John Foxe johnpaul.foxe@ryerson.ca
Suzanne Hicks suzanne.hicks@ryerson.ca
A