In [1]:
# import modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
import ethnicolr as ec
from time import sleep
from pathlib import Path
import re

# Locate the geckodriver binary in the "Tools" folder three directory levels
# above the current working directory. The path is joined with pathlib rather
# than a "\Tools\..." string literal, because "\T" and "\g" are invalid escape
# sequences in a normal (non-raw) Python string.
driver = str(Path().resolve().parents[2] / "Tools" / "geckodriver.exe")

# open browser and go to directory
browser = webdriver.Firefox(executable_path=driver)
browser.get('https://carleton.ca/phonebook/')

# find department selection box and submit button
submit_button = browser.find_element_by_id('submit')
departments_select = Select(browser.find_element_by_id('department-full'))
# collect the visible label of every option except the first one
# (presumably a placeholder entry -- verify against the live page)
departments_labels = [o.text for o in departments_select.options][1:]

# dictionary to hold data (one list per output column, filled in lockstep)
data = {
    'First Name': [],
    'Last Name': [],
    'Email': [],
}

# scrape all departments
# The whole loop is guarded so that a failure part-way through (element not
# found, network error, Ctrl-C, ...) closes the Firefox instance instead of
# leaving it running; the original exception is then re-raised unchanged.
try:
    for department_label in departments_labels:
        # select next department and click submit
        departments_select.select_by_visible_text(department_label)
        sleep(5)
        submit_button.click()
        sleep(5)

        # use BeautifulSoup to parse raw code
        soup = BeautifulSoup(browser.page_source, 'lxml')

        # find data
        results = soup.find("div", id="results")

        # make sure results were found; a page without the results container
        # is handled like an explicit "No results found!" message instead of
        # crashing on None
        if results is None or results.find("p", text="No results found!"):
            print("No results found!")
            sleep(10)
            print()
            continue

        # obtain ALL contact info
        profs = results.find_all("h3")
        emails = results.find_all("a", href=re.compile(r"^mailto:"))

        # NOTE: names and emails are paired purely by position; if the two
        # counts printed here differ, zip() stops at the shorter list and the
        # pairing may be misaligned.
        print(len(profs), len(emails))

        # iterate through each person
        for prof, email in zip(profs, emails):
            # parse contact info: split on any whitespace so padded or
            # newline-separated headings do not produce empty name parts
            name_parts = prof.get_text().split()
            fname = name_parts[0] if name_parts else ""
            lname = name_parts[-1] if name_parts else ""
            mail = email.get_text().strip()

            # store contact info in data dictionary
            data['First Name'].append(fname)
            data['Last Name'].append(lname)
            data['Email'].append(mail)

            print(fname, lname, mail)

        # pause scraping for 10 seconds (standard)
        sleep(10)

        print()
except BaseException:
    # cleanup only -- never swallow the error
    browser.quit()
    raise

# Post-process the scraped data. The browser is closed in a `finally` clause
# so that an error during prediction or while writing the CSV file does not
# leave the Firefox instance running.
try:
    # create pandas dataframe and remove any duplicate entries
    df = pd.DataFrame(data)
    df = df.drop_duplicates()

    # determine ethnicity; the columns are passed by name (last name first,
    # then first name) rather than by position, so the call does not depend
    # on the column order of the dataframe
    df = ec.pred_wiki_name(df, 'Last Name', 'First Name')

    # print dataframe and write data to csv file
    print(df)
    df.to_csv("carleton parsed.csv")
finally:
    # close browser
    browser.quit()

Using TensorFlow backend.


1 1
Nicole Bedford Nicole.Bedford@carleton.ca

13 13
Jade Brayman Jade.Brayman@carleton.ca
Erin Currie Erin.Currie@carleton.ca
Sarah Doerksen Sarah.Doerksen@carleton.ca
Jen Hogan Jennifer.Hogan@carleton.ca
Mike Labreque Mike.Labreque@carleton.ca
Julia Lennon Julia.Lennon@carleton.ca
Nikki Mayville Nikki.Mayville@carleton.ca
Katie McCarlie Katie.Mccarlie@carleton.ca
Jennifer McCarthy Jennifer.Mccarthy@carleton.ca
Chaya Porter chaya.porter@carleton.ca
Chelsea Purcell Chelsea.Purcell@carleton.ca
Anna-Lynn Russell-Mercier Anna.Russellmercier@carleton.ca
Chelsea Whyte Chelsea.Whyte@carleton.ca

26 26
Mike Arundel Mike.Arundel@carleton.ca
Marilla Bender Marilla.Bender@carleton.ca
Lauren Boivin Lauren.Boivin@carleton.ca
Andrew Breedyk Andrew.Breedyk@carleton.ca
Jackie Carberry Jackie.Carberry@carleton.ca
Kristin Delaney Kristin.Delaney@carleton.ca
Marianne Ferguson Marianne.Ferguson@carleton.ca
Robert Finlayson Robert.Finlayson@carleton.ca
Blythe Fraser Blythe.Fraser@carleton.ca
Caroline Kara