In [5]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

In [14]:
def get_raw_cape_dataframe():
    """Scrape the full CAPE results table into a DataFrame using a live browser.

    Opens Firefox through Selenium (Firefox must be installed), loads the CAPE
    results page, waits up to 60 seconds for the page title to contain the
    CAPE title (i.e. until the user has entered their SSO credentials), then
    loads the ``Name=%2C`` query and parses the first HTML table on that page.

    Returns:
        pandas.DataFrame: the first table found in the results page source.

    Raises:
        selenium.common.exceptions.TimeoutException: if the expected page
            title does not appear within 60 seconds.
    """
    from io import StringIO

    # launch browser using Selenium, need to have Firefox installed
    print('Opening a browser window...')
    s = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(service=s)

    # From here on the browser exists, so always quit it -- even when the
    # login wait times out or parsing fails -- to avoid leaking the process.
    try:
        print('Browser window open, loading the page...')

        # get the page that lists all the data, first try
        driver.get('https://cape.ucsd.edu/responses/Results.aspx')
        print('Please enter credentials...')

        # wait until SSO credentials are entered
        wait = WebDriverWait(driver, 60)
        wait.until(expected_conditions.title_contains('Course And Professor Evaluations (CAPE)'))

        # get the page that lists all the data
        # (%2C is a URL-encoded comma; searching for it returns every row,
        # since every professor name contains a comma)
        driver.get('https://cape.ucsd.edu/responses/Results.aspx?Name=%2C')

        # read in the dataset from the page source; wrap the markup in StringIO
        # because passing a literal HTML string to read_html is deprecated
        df = pd.read_html(StringIO(driver.page_source))[0]
        print('Dataset parsed, closing browser window.')
    finally:
        # destroy driver instance
        driver.quit()

    return df

In [16]:
def get_raw_html():
    """Fetch the full CAPE results page and return it as a BeautifulSoup object.

    Opens Firefox through Selenium (Firefox must be installed), loads the CAPE
    results page, waits up to 60 seconds for the page title to contain the
    CAPE title (i.e. until the user has entered their SSO credentials), then
    loads the ``Name=%2C`` query and parses the page source.

    Returns:
        bs4.BeautifulSoup: parsed markup of the results page.

    Raises:
        selenium.common.exceptions.TimeoutException: if the expected page
            title does not appear within 60 seconds.
    """
    # launch browser using Selenium, need to have Firefox installed
    print('Opening a browser window...')
    s = Service(GeckoDriverManager().install())
    driver = webdriver.Firefox(service=s)

    # From here on the browser exists, so always quit it -- even when the
    # login wait times out -- to avoid leaking the browser process.
    try:
        print('Browser window open, loading the page...')

        # get the page that lists all the data, first try
        driver.get('https://cape.ucsd.edu/responses/Results.aspx')
        print('Please enter credentials...')

        # wait until SSO credentials are entered
        wait = WebDriverWait(driver, 60)
        wait.until(expected_conditions.title_contains('Course And Professor Evaluations (CAPE)'))

        # get the page that lists all the data
        # (%2C is a URL-encoded comma; searching for it returns every row,
        # since every professor name contains a comma)
        driver.get('https://cape.ucsd.edu/responses/Results.aspx?Name=%2C')

        # creating soup object
        # NOTE(review): no parser is named, so bs4 picks whichever is installed;
        # consider passing one explicitly for reproducible parsing.
        soup = BeautifulSoup(driver.page_source)
        print('Dataset parsed, closing browser window.')
    finally:
        # destroy driver instance
        driver.quit()

    return soup

In [6]:
# Parse the locally saved CAPE results page. The context manager closes the
# file handle once parsing is done instead of leaving it open in the kernel.
with open("Course And Professor Evaluations (CAPE).html", "r") as cape_file:
    soup = BeautifulSoup(cape_file)

In [7]:
# Peek at the href of the first "view report" anchor to check the selector.
soup.find_all(
    "a",
    {
        "href": True,
        "id": re.compile("ContentPlaceHolder1_gvCAPEs_hlViewReport_.*"),
    },
)[0].get("href")

'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052181'

In [8]:
def get_courseID(soup):
    """Collect the report URL of every CAPE "view report" link in ``soup``.

    Args:
        soup: parsed page exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns:
        list: the ``href`` value of each ``<a>`` tag that has an ``href`` and
        whose ``id`` matches ``ContentPlaceHolder1_gvCAPEs_hlViewReport_.*``,
        in the order ``find_all`` returns them.
    """
    report_link_id = re.compile("ContentPlaceHolder1_gvCAPEs_hlViewReport_.*")
    anchors = soup.find_all("a", {"href": True, "id": report_link_id})
    return [anchor.get("href") for anchor in anchors]

In [9]:
# Display every report URL extracted from the saved results page.
get_courseID(soup)

['https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052181',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052184',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=068855',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052567',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052569',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052570',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=059666',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052571',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052572',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052573',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052574',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052575',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052576',
 'https://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=052577',
 'https://cape.ucsd.edu/responses/

In [12]:
# Load the first HTML table of the locally saved CAPE results page.
df = pd.read_html("Course And Professor Evaluations (CAPE).html")[0]

In [52]:
# Extract the numeric GPA from strings such as "D (1.21)".
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
# expand=False returns a Series rather than a one-column DataFrame.
df["Avg GPA"] = (
    df["Avg Grade Received"]
    .str.extract(r"(\d{1}\.\d{2})", expand=False)
    .astype("float")
)

In [55]:
# Extract the numeric GPA from strings such as "B- (2.83)".
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
# expand=False returns a Series rather than a one-column DataFrame.
df["Avg GPA Expected"] = (
    df["Avg Grade Expected"]
    .str.extract(r"(\d{1}\.\d{2})", expand=False)
    .astype("float")
)

In [56]:
# Row(s) with the lowest received GPA. Series.min() skips NaN, whereas the
# builtin min() is order-dependent with NaN and can return NaN (matching no row).
df[df["Avg GPA"] == df["Avg GPA"].min()]

Unnamed: 0,Instructor,Course,Term,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received,Avg GPA,Avg GPA Expected
10869,"Bowers, Adam R.",MATH 10B - Calculus II (A),S219,63,18,77.8%,83.3%,8.28,B- (2.83),D (1.21),1.21,2.83


In [57]:
# Absolute gap between the expected and the received GPA.
df["Reality vs Expectation"] = (df["Avg GPA Expected"] - df["Avg GPA"]).abs()

In [62]:
# Ten rows with the largest expected-vs-received GPA gap.
(
    df
    .sort_values(by="Reality vs Expectation", ascending=False)
    .head(10)
)

Unnamed: 0,Instructor,Course,Term,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received,Avg GPA,Avg GPA Expected,Reality vs Expectation
47857,"Cole, Michael Scott",CAT 124 - Sixth College Practicum (C),FA09,22,20,80.0%,95.0%,5.3,A (4.00),C (2.00),2.0,4.0,2.0
15197,"Harel, Guershon",MATH 102 - Applied Linear Algebra (A),S218,76,11,100.0%,90.0%,9.95,B+ (3.40),C- (1.71),1.71,3.4,1.69
11324,"Eastin, Schuyler E.",AWP 2B - Analytical Writing B (0),SP19,25,23,65.2%,95.5%,6.76,B+ (3.67),C- (2.00),2.0,3.67,1.67
18118,"West, Summer Noel",AWP 2A - Analytical Writing A (0),FA17,51,32,73.3%,86.7%,7.97,A- (3.82),C (2.15),2.15,3.82,1.67
10869,"Bowers, Adam R.",MATH 10B - Calculus II (A),S219,63,18,77.8%,83.3%,8.28,B- (2.83),D (1.21),1.21,2.83,1.62
19628,"Lytle, Cecil William",MUS 8GS - American Music (A),S117,21,3,50.0%,50.0%,4.5,C (2.00),B+ (3.60),3.6,2.0,1.6
41348,"Ebrahimi-Fardooee, Mohammad Ali",MATH 3C - Precalculus (A),S211,22,7,85.7%,85.7%,12.5,B+ (3.50),C- (1.96),1.96,3.5,1.54
13301,"Harel, Guershon",MATH 18 - Linear Algebra (D),WI19,171,78,86.1%,69.4%,6.89,B (3.18),D+ (1.65),1.65,3.18,1.53
17853,"Loh, Kenneth J. H.",SE 130A - Structural Analysis I (A),WI18,22,17,93.8%,87.5%,9.75,B- (2.81),D (1.28),1.28,2.81,1.53
1196,"Shafir, Gershon",SOCI 168G - Populism: Then and Now (A),FA21,26,5,75.0%,75.0%,6.0,A (4.00),C+ (2.48),2.48,4.0,1.52


In [65]:
# COGS 107A offerings, largest expectation gap first.
# regex=False matches the course code literally; na=False keeps the mask
# boolean even if a Course value is missing.
df[df["Course"].str.contains("COGS 107A", regex=False, na=False)].sort_values(
    "Reality vs Expectation", ascending=False
)

Unnamed: 0,Instructor,Course,Term,Enroll,Evals Made,Rcmnd Class,Rcmnd Instr,Study Hrs/wk,Avg Grade Expected,Avg Grade Received,Avg GPA,Avg GPA Expected,Reality vs Expectation
27179,"Boyle, Mary E. T.",COGS 107A - Neuroanatomy and Physiology (A),S115,54,14,85.7%,92.9%,9.88,B (3.07),B+ (3.68),3.68,3.07,0.61
30686,"Boyle, Mary E. T.",COGS 107A - Neuroanatomy and Physiology (A),S114,25,3,100.0%,100.0%,7.83,B+ (3.67),B (3.11),3.11,3.67,0.56
54999,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA07,150,91,92.0%,95.5%,5.26,B (3.28),B- (2.82),2.82,3.28,0.46
51550,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA08,187,142,93.4%,95.6%,6.21,B+ (3.31),B- (2.86),2.86,3.31,0.45
26097,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA15,217,139,92.6%,89.0%,5.55,B+ (3.42),B- (2.99),2.99,3.42,0.43
29651,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA14,189,75,84.9%,87.7%,6.25,B (3.10),B- (2.76),2.76,3.1,0.34
40348,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA11,233,101,94.1%,94.1%,6.8,B (3.29),B- (2.95),2.95,3.29,0.34
47946,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA09,168,112,90.9%,89.3%,5.4,B (3.16),B- (2.83),2.83,3.16,0.33
44222,"Pineda, Jaime A",COGS 107A - Neuroanatomy and Physiology (A),FA10,206,164,90.2%,90.1%,6.76,B (3.10),B- (2.79),2.79,3.1,0.31
19506,"Boyle, Mary E. T.",COGS 107A - Neuroanatomy and Physiology (A),S117,57,48,87.5%,91.7%,11.01,B (3.23),B+ (3.53),3.53,3.23,0.3


In [14]:
# Persist the table to disk (the index is written as the first CSV column).
# NOTE(review): the execution count (14) is lower than the cells above, so this
# was presumably run before the derived GPA columns existed -- confirm on a clean run.
df.to_csv("capes.csv")

In [30]:
# Frequency of each "recommend instructor" percentage, as floats.
(
    df["Rcmnd Instr"]
    .str.replace("%", "", regex=False)
    .astype(float)
    .value_counts()
)

100.0    21397
80.0       860
75.0       836
83.3       821
87.5       794
         ...  
36.1         1
58.9         1
21.3         1
9.9          1
32.2         1
Name: Rcmnd Instr, Length: 786, dtype: int64