In [21]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

In [14]:
def get_raw_cape_dataframe(timeout=60):
    """Scrape the CAPE results listing into a DataFrame.

    Launches Firefox through Selenium (Firefox must be installed), opens the
    CAPE results page, waits for the user to finish the SSO login, then loads
    the listing for ``Name=%2C`` and parses the first HTML table on it.

    Parameters
    ----------
    timeout : float, default 60
        Seconds to wait for the page title to contain
        'Course And Professor Evaluations (CAPE)' after the login prompt.

    Returns
    -------
    pandas.DataFrame
        The first table ``pd.read_html`` finds in the loaded page.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the expected page title does not appear within ``timeout`` seconds.
    """
    # Local import keeps this cell self-contained.
    from io import StringIO

    # launch browser using Selenium, need to have Firefox installed
    print('Opening a browser window...')
    s = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(service=s)

    # Everything after the browser exists runs under try/finally so the
    # window is closed even when the login wait times out or parsing fails.
    try:
        print('Browser window open, loading the page...')

        # first request only triggers the SSO login
        driver.get('https://cape.ucsd.edu/responses/Results.aspx')
        print('Please enter credentials...')

        # block until SSO credentials are entered and the CAPE page is shown
        WebDriverWait(driver, timeout).until(
            expected_conditions.title_contains('Course And Professor Evaluations (CAPE)')
        )

        # %2C is a URL-encoded comma; searching by name for "," is meant to
        # list every row because every professor name contains one
        driver.get('https://cape.ucsd.edu/responses/Results.aspx?Name=%2C')

        # Wrap the markup in StringIO: passing literal HTML text straight to
        # read_html is deprecated in recent pandas releases.
        df = pd.read_html(StringIO(driver.page_source))[0]
        print('Dataset parsed, closing browser window.')
    finally:
        # destroy driver instance
        driver.quit()

    return df

In [16]:
def get_raw_html(timeout=60):
    """Fetch the CAPE results listing and return it as a BeautifulSoup tree.

    Launches Firefox through Selenium (Firefox must be installed), opens the
    CAPE results page, waits for the user to finish the SSO login, then loads
    the listing for ``Name=%2C`` and parses its page source.

    Parameters
    ----------
    timeout : float, default 60
        Seconds to wait for the page title to contain
        'Course And Professor Evaluations (CAPE)' after the login prompt.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed page source of the results listing.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        If the expected page title does not appear within ``timeout`` seconds.
    """
    # launch browser using Selenium, need to have Firefox installed
    print('Opening a browser window...')
    s = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(service=s)

    # Everything after the browser exists runs under try/finally so the
    # window is closed even when the login wait times out or parsing fails.
    try:
        print('Browser window open, loading the page...')

        # first request only triggers the SSO login
        driver.get('https://cape.ucsd.edu/responses/Results.aspx')
        print('Please enter credentials...')

        # block until SSO credentials are entered and the CAPE page is shown
        WebDriverWait(driver, timeout).until(
            expected_conditions.title_contains('Course And Professor Evaluations (CAPE)')
        )

        # %2C is a URL-encoded comma; searching by name for "," is meant to
        # list every row because every professor name contains one
        driver.get('https://cape.ucsd.edu/responses/Results.aspx?Name=%2C')

        # Name the parser explicitly: otherwise bs4 picks whichever parser is
        # installed (and warns), so the tree could differ between machines.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print('Dataset parsed, closing browser window.')
    finally:
        # destroy driver instance
        driver.quit()

    return soup

In [18]:
# Parse a locally saved copy of the CAPE results page.
# The context manager closes the file handle once parsing is done, and the
# parser is named explicitly so the result does not depend on which optional
# parsers happen to be installed.
with open("Course And Professor Evaluations (CAPE).html", "r") as html_file:
    soup = BeautifulSoup(html_file, "html.parser")

In [26]:
# Sanity check: show the href of the first anchor whose id matches the
# "view report" pattern, to confirm the selector works before wrapping it
# in a function.
soup.find_all(
    "a",
    {
        "href": True,
        "id": re.compile("ContentPlaceHolder1_gvCAPEs_hlViewReport_.*"),
    },
)[0].get("href")

'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052181'

In [29]:
def get_courseID(soup):
    """Return the ``href`` of every "view report" anchor found in ``soup``.

    Anchors are selected by having an ``href`` attribute and an ``id`` that
    matches ``ContentPlaceHolder1_gvCAPEs_hlViewReport_.*``. Note that the
    values returned are the link targets themselves, in the order
    ``soup.find_all`` yields them, not bare section ids.

    Parameters
    ----------
    soup : object exposing ``find_all(name, attrs)``
        Typically the BeautifulSoup tree of the CAPE results page.

    Returns
    -------
    list
        One ``href`` value per matching anchor; empty if nothing matches.
    """
    report_id_pattern = re.compile("ContentPlaceHolder1_gvCAPEs_hlViewReport_.*")
    report_anchors = soup.find_all("a", {"href": True, "id": report_id_pattern})
    return [anchor.get("href") for anchor in report_anchors]

In [30]:
# Display every report link extracted from the parsed results page.
get_courseID(soup)

['https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052181',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052184',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=068855',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052567',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052569',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052570',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=059666',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052571',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052572',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052573',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052574',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052575',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052576',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052577',
 'https://cape.ucsd.edu/responses/

In [13]:
# Show the CAPE table. NOTE(review): no visible cell assigns `df`; presumably
# it came from get_raw_cape_dataframe() in an earlier run — confirm so the
# notebook survives Restart & Run All.
df

Unnamed: 0,Instructor,Course,Term,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received
0,"Butler, Elizabeth Annette",AAS 10 - Intro/African-American Studies (A),FA21,67,40,94.9%,94.6%,4.00,B+ (3.67),B+ (3.54)
1,"Puritty, Chandler Elizabeth",AAS 190 - Sp Topics/Af-Am Studies (A),FA21,39,14,100.0%,100.0%,2.93,A- (3.86),A (4.00)
2,"Andrews, Abigail Leslie",AIP 197T - AIP: Special Programs (A),FA21,27,11,100.0%,100.0%,8.10,A (4.00),
3,"Smith, Neil Gordon",ANAR 104 - Intro GIS for Anth & Arch (A),FA21,14,8,100.0%,100.0%,6.50,A- (3.71),
4,"Smith, Neil Gordon",ANAR 121 - Cyber-Archaeology (A),FA21,13,8,100.0%,100.0%,5.25,B+ (3.63),
...,...,...,...,...,...,...,...,...,...,...
56026,"Li, Huai",VIS 105D - Aesthetics/Chinese Calligraphy (A),SU07,17,13,100.0%,100.0%,2.33,A (4.00),
56027,"Guerrero, Raul M.",VIS 106A - Painting: Image Making (A),SU07,16,14,92.9%,92.9%,7.21,A- (3.79),
56028,"Mangolte, Babette",VIS 194S - Fantasy In Film (A),SU07,80,57,74.1%,26.4%,4.27,B (3.21),
56029,"Holland, Nicole Murphy",VIS 22 - Formations of Modern Art (A),SU07,40,33,100.0%,96.7%,4.32,B+ (3.62),


In [14]:
# Persist the table to disk. With the default index=True the row index is
# written too, as a leading column with an empty header.
df.to_csv("capes.csv")

In [30]:
# Frequency of each "Rcmnd Instr" percentage once converted to a float.
# regex=False strips the "%" as a literal character on every pandas version;
# the default for `regex` has changed between releases and the implicit form
# raises a FutureWarning on older ones.
(
    df["Rcmnd Instr"]
    .str.replace("%", "", regex=False)
    .astype("float")
    .value_counts()
)

100.0    21397
80.0       860
75.0       836
83.3       821
87.5       794
         ...  
36.1         1
58.9         1
21.3         1
9.9          1
32.2         1
Name: Rcmnd Instr, Length: 786, dtype: int64