# <font color=blue>Scraping Kellogg Faculty Pages.</font>

## Getting Started with Beautiful Soup
We will start by extracting the links to the individual faculty webpages from the faculty directory (index) page using __Beautiful Soup__.


In [None]:
# First, import the necessary python libraries
import csv
import time
import urllib.request
from urllib.parse import urljoin

import bs4 as bs

In [None]:
# This is the index (directory) page for faculty at Kellogg.
# It is the page downloaded with urllib below and later opened in the Selenium browser session.
url = 'https://www.kellogg.northwestern.edu/faculty/faculty_directory.aspx'
print(url)

In [None]:
# Use urllib to download the source code (raw bytes) of the index page.
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until it is garbage collected.
with urllib.request.urlopen(url) as response:
    source = response.read()

In [None]:
# Parse the downloaded HTML into a Beautiful Soup tree
soup = bs.BeautifulSoup(source, 'html.parser')

# Show only the beginning of the document: printing the entire page floods the
# notebook output and buries the cells that follow
print(soup.prettify()[:2000])

## Identify an HTML Tag

In [None]:
# Find the <div id="bindFaculty"> container that holds the faculty profiles
faculty = soup.find('div', {'id': "bindFaculty"})
if faculty is None:
    # Fail with a clear message instead of an AttributeError on the next lookup
    raise ValueError('Could not find <div id="bindFaculty"> on the page; '
                     'the page layout may have changed')

# Collect every <h2 id="facName"> heading inside that container.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
profs = faculty.find_all('h2', {'id': "facName"})

## Attributes of a Beautiful Soup object

In [None]:
# Start with the container itself: .attrs holds the tag's HTML attributes
print(faculty.attrs)

# profs behaves like a list, so len() tells us how many headings were found
print(len(profs))

# Take the first professor's heading and show its HTML ...
first_heading = profs[0]
print(first_heading)

# ... followed by the attributes of that heading tag
print(first_heading.attrs)


In [None]:
# Extract the website for this faculty member: locate the first <a> tag with an
# href attribute inside the first heading and keep only the href value
website = profs[0].find('a', href=True)['href']
print(website)

## Save results to a List Object

In [None]:
# Save the full URL for the first professor into a new list object.
# urljoin resolves the href against the site root and leaves an already-absolute
# URL untouched, so re-running this cell does not prepend the domain a second time.
website = urljoin('https://www.kellogg.northwestern.edu', str(website))
print(website)
prof_sites = [website]
print(prof_sites)

<font color=blue>Try to save all of the professors' websites into a list object.</font>

In [None]:
# Collect the full profile URL for every professor heading found on the index page
prof_sites = []

for prof in profs:
    link = prof.find('a', href=True)
    if link is None:
        # Heading without a link: nothing to record, and link['href'] would raise
        continue
    # urljoin handles site-relative hrefs as well as already-absolute ones
    website = urljoin('https://www.kellogg.northwestern.edu', link['href'])
    print(website)
    prof_sites.append(website)

## Export results to a csv file

In [None]:
# Save the list of profile URLs to a csv file, one URL per row.
# newline='' is what the csv module expects from the file object; without it
# every row is followed by a blank line on Windows.
with open('faculty_pages.csv', "w", newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for site in prof_sites:
        wr.writerow([site])

## Throttling Requests (sleep times)

In [None]:
# Request the first three faculty pages, pausing after each one so the server
# is not hit with back-to-back requests
for page_url in prof_sites[0:3]:
    # The `with` block closes each HTTP response once its body has been read
    with urllib.request.urlopen(page_url) as response:
        source2 = response.read()
    print(page_url)
    time.sleep(3)  # seconds to wait before the next request

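It seems odd that there are only 36 faculty members at Kellogg. The page source we downloaded appears to contain only the first batch of profiles; the rest are loaded when the "More Faculty" button is clicked, which a plain HTTP request cannot do.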

# Selenium to Click Buttons

Now let's use Selenium to click on the "More Faculty" button.

In [None]:
# import selenium libraries and options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
# Options object holding the Firefox settings; it is passed to the driver when the session is created
options = Options()
# NOTE(review): set_headless is deprecated in newer Selenium releases — confirm against the
# installed version; options.add_argument('-headless') is the portable alternative.
#options.set_headless(headless=True) # if you select this option selenium will run without opening a browser window

In [None]:
# Create a new Firefox session.
# `options=` is the supported keyword; the older `firefox_options=` alias is
# deprecated and no longer accepted by Selenium 4.
driver = webdriver.Firefox(options=options)
# Wait up to 3 seconds for an element to appear before a lookup fails
driver.implicitly_wait(3)
# Open the faculty index page in the browser
driver.get(url)

In [None]:
# Click open the "More Faculty" button.
# The find_element_by_* helpers were removed in Selenium 4; find_element with a
# By locator works on both Selenium 3 and Selenium 4.
from selenium.webdriver.common.by import By  # imported here, next to its only use

python_button = driver.find_element(By.LINK_TEXT, 'MORE FACULTY')
python_button.click()

In [None]:
# End the browser session that was created with webdriver.Firefox earlier in the notebook
driver.quit()