In [1]:
import pandas as pd
import numpy as np
import ast

Here is a list of questions I'd like to solve during this project.


1. Is there a trend behind the courses with little to no ratings?

2. What department seems to use uwflow the most?

3. Does the usaage of uwflow become less as you look into higher year courses? i.e 100 vs 200 and so on

4. Are users more likely to use uwflow to rate a professor/course they found liked/useful or is it the opposite?

5. What department thinks they have the most useful/hardest/liked courses based on the data?

6. Is there a correlation between the sentiment of courses and professors within the same faculty?

First things first, we'll have to clean up the data and transform it so it's usable. Most of the questions I want to answer are faculty based, so we'll focus on arranging the data in such a manner

In [2]:
course_df = pd.read_csv("./Course_data.csv")
prof_df = pd.read_csv("./Prof_data.csv")

In [3]:
prof_df.head()

Unnamed: 0,Professor_Name,Course,Liked_%,Professor_Reviews
0,Lori Michelle Case,CS 115,77%,"['clear prof', 'Lori is an awesome professor. ..."
1,Naomi Nishimura,CS 115,70%,"['Not really engaging but helpful.', 'Definite..."
2,Troy Vasiga,CS 115,96%,"['One of the best CS teacher, his lecture is v..."
3,Victoria Sakhnini,CS 115,85%,['Great prof. Explains concepts very well and ...
4,Sandy Graham,CS 115,74%,"['Clear in her explanations.', ""Her clicker so..."


In [4]:
course_df

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,,22%,11%,25%,"['A bird course, easy to get 90+, but it is us...","['87/90', '83/90', '88/90', '68/90', '88/90', ..."
1,MATH 135,Algebra for Honours Mathematics,1555,338 comments,84%,43%,78%,"['Very easy and interesting course, no concept...","['55/60', '54/60', '52/60', '59/60', '52/60', ..."
2,ECON 101,Introduction to Microeconomics,1398,264 comments,63%,70%,45%,['you can just google everything but its just ...,"['252/387', '364/387', '184/220', '189/220', '..."
3,PSYCH 101,Introductory Psychology,1084,229 comments,73%,66%,78%,"[""Super fun, engaging prof and the exams are l...","['418/446', '430/446', '212/285', '142/160', '..."
4,MATH 137,Calculus 1 for Honours Mathematics,1036,211 comments,84%,56%,68%,"['Easy course', 'The course itself is somewhat...","['92/120', '93/120', '111/120', '104/120', '92..."
...,...,...,...,...,...,...,...,...,...
8910,GEOG 662,Transforming Canadian Resource Management,0,0,0,0,0,['No reviews'],['no data']
8911,ERS 625,Qualitative Methods in Geography,0,0,0,0,0,['No reviews'],['no data']
8912,HLTH 725,Sociology of Health,0,0,0,0,0,['No reviews'],['no data']
8913,ME 655,Advanced Building Energy Analysis,0,0,0,0,0,['No reviews'],['no data']


We have 6 faculties in total. Health, Math, Arts, Engineering, Environment and Science. But how many unique course codes do we have?

In [5]:
codes = course_df['Course_Code'].copy()
codes = codes.apply(lambda x: x.split(" ")[0])
codes.nunique()

188

In [6]:
Keys = ["Math", "Engineering", "Science", "Health", "Environment", "Arts", "Coop"]

eng = ["AE", "ARCH", "BME", "CHE", "CIVE", "CE" , "ECE", "ENVE", "GEOE", "GENE", "MSE", "ME", "MTE", "NE", "SE", "SYDE", "NANO"]

math = ["AFM", "ACTSC", "AMATH", "CO", "COMM", "CS", "CFM", "ECON", "PHYS", "PMATH", "STAT", "MATBUS", "MATH", "MTHEL", "ACC", "BE", "CM"]

sci = ["AVIA", "BIOL", "CHEM", "COGSCI", "EARTH", "GERON", "HUMSC", "MNS", "PHYS", "PLAN", "PSCI", "SCI", "SCBUS", "SOC", "SFM", "MSCI"]

arts = ["ANTH", "APPLS", "ARABIC", "ARTS", "ARBUS", "BLKST", "BASE", "BET", "BUS", "CDNST", "CHINA", "CMW", "CLAS", "COMMST", "CROAT", "CI", "DAC", "DUTCH", "EASIA", "ENGL", "EMLS", "FINE", "FR",
        "GSJ", "GER", "GBDA", "GRK", "HIST", "HRM", "HRTS", "INDENT", "INDG", "INNOV", "INDEV", "INTST", "ITAL", "ITALST", "JAPAN", "JS", "INTEG", "KOREA", "LAT",
       "LS", "MGMT", "MEDVL", "MENN", "MOHAWK", "MUSIC", "PACS", "PHIL", "PORT", "PSYCH", "RS", "RUSS", "REES", "SMF", "SDS", "SWREN", "SOCWK", "STV", "SPAN", "SI", "THPERF",
       "AHS", "VCULT", "ESL", "WS", "ASL", "DRAMA"]

env = ["ENVS", "ENBUS", "ERS", "GEOG"]

coop = ["COOP", "PD", "PDARCH", "PDPHRM", "WKRPT", "SPCOM"]

health = ["HEALTH", "HHUM", "KIN", "OPTOM", "PHARM", "HLTH", "REC", "PHS"]

department = dict(zip(Keys, [math, eng, sci, health, env, arts, coop]))
length = len(eng) + len(math) + len(sci) + len(arts) + len(env) + len(coop) + len(health)
length

137

We're missing some course codes, let's see what they are

In [7]:
all_course_codes = eng + math + sci + arts + env + coop + health
missing_courses = []
for i in range(len(codes.unique())):
    code = codes.unique()[i]
    if code not in all_course_codes:
        missing_courses.append(code)
missing_courses    

['SEQ',
 'UNIV',
 'ACINTY',
 'ISS',
 'NATST',
 'SVENT',
 'KPE',
 'ADMGT',
 'ARCHL',
 'COMST',
 'CT',
 'DEI',
 'DM',
 'EFAS',
 'EVSY',
 'FILM',
 'GEMCC',
 'GGOV',
 'GLOBAL',
 'HSG',
 'INTTS',
 'IS',
 'LED',
 'MI',
 'NES',
 'PDENG',
 'POLSH',
 'PS',
 'QIC',
 'RELC',
 'SOCIN',
 'SPD',
 'SUSM',
 'SWK',
 'TAX',
 'TN',
 'TOUR',
 'TS',
 'UN',
 'WATER',
 'ELPE',
 'RSCH',
 'TPM',
 'AB',
 'UU',
 'ASTRN',
 'CULT',
 'WHMIS',
 'ECDEV',
 'DATSC',
 'FCIT',
 'WIL',
 'UCR']

Since these are not present in the latest list of courses in the undergraduate calendar, I will assume that they have been discontinued, and so they will not be added to the dictionary

In [8]:
for course in missing_courses:
    course_df = course_df[course_df["Course_Code"].str.contains(course)==False]
course_df

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,,22%,11%,25%,"['A bird course, easy to get 90+, but it is us...","['87/90', '83/90', '88/90', '68/90', '88/90', ..."
1,MATH 135,Algebra for Honours Mathematics,1555,338 comments,84%,43%,78%,"['Very easy and interesting course, no concept...","['55/60', '54/60', '52/60', '59/60', '52/60', ..."
2,ECON 101,Introduction to Microeconomics,1398,264 comments,63%,70%,45%,['you can just google everything but its just ...,"['252/387', '364/387', '184/220', '189/220', '..."
4,MATH 137,Calculus 1 for Honours Mathematics,1036,211 comments,84%,56%,68%,"['Easy course', 'The course itself is somewhat...","['92/120', '93/120', '111/120', '104/120', '92..."
5,PD 1,Career Fundamentals,1000,189 comments,19%,80%,7%,['The only effect of this course is to add pre...,"['1956/3500', '21/1000', '475/3500', '32/1000'..."
...,...,...,...,...,...,...,...,...,...
8910,GEOG 662,Transforming Canadian Resource Management,0,0,0,0,0,['No reviews'],['no data']
8911,ERS 625,Qualitative Methods in Geography,0,0,0,0,0,['No reviews'],['no data']
8912,HLTH 725,Sociology of Health,0,0,0,0,0,['No reviews'],['no data']
8913,ME 655,Advanced Building Energy Analysis,0,0,0,0,0,['No reviews'],['no data']


In [9]:
course_df[course_df.isnull().any(axis=1)]

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,,22%,11%,25%,"['A bird course, easy to get 90+, but it is us...","['87/90', '83/90', '88/90', '68/90', '88/90', ..."
383,SE 490,Design Project 1,55,,64%,74%,62%,"[""The purpose of the course is just to show yo...","['85/100', '32/40', '30/30']"
1257,BUS 352W,Introduction to Marketing Management (WLU),10,0 comments,,,40%,['No reviews'],['no data']
1275,ENVE 115,Linear Algebra,9,0 comments,,,22%,['No reviews'],"['80/85', '80/85', '80/85', '51/85', '26/42', ..."
1317,AE 104,Mechanics 1,9,0 comments,,,78%,['No reviews'],"['100/100', '52/50', '48/50', '100/100', '87/1..."
...,...,...,...,...,...,...,...,...,...
3303,ASL 201R,American Sign Language 3,1,0 comments,,,100%,['No reviews'],['9/25']
3310,PHARM 326,Institutional Pharmacy Practice,1,0 comments,,,0%,['No reviews'],['114/120']
3312,KIN 492,Exercise Management for Chronic Conditions,1,0 comments,,,100%,['No reviews'],['7/8']
3315,BE 660,Negotiations,1,0 comments,,,100%,['No reviews'],"['7/35', '0/70']"


After some inspection, the null values in the useful, easy and liked column is due to a different selector being used when the value is N/A, so we can fill those values with 0

In [10]:
course_df[['Useful', 'Easy', 'Liked']] = course_df[['Useful', 'Easy', 'Liked']].fillna('0%')
course_df.reset_index(drop=True, inplace=True)
course_df[course_df.isnull().any(axis=1)]

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,,22%,11%,25%,"['A bird course, easy to get 90+, but it is us...","['87/90', '83/90', '88/90', '68/90', '88/90', ..."
366,SE 490,Design Project 1,55,,64%,74%,62%,"[""The purpose of the course is just to show yo...","['85/100', '32/40', '30/30']"


Now we can fix the number of comments by replacing it with the length of the list in course_reviews. Since it was saved as a csv, we have to use ast to convert it back to a list.

In [11]:
course_df['Course_Reviews'] = course_df['Course_Reviews'].apply(ast.literal_eval)
course_df['Course_Enrollment'] = course_df['Course_Enrollment'].apply(ast.literal_eval)

course_df.loc[course_df['Course_Code'] == 'CS 115', 'Number_of_Comments'] = len(course_df.loc[course_df['Course_Code'] == 'CS 115', 'Course_Reviews'].values[0])
course_df.loc[course_df['Course_Code'] == 'SE 490', 'Number_of_Comments'] = len(course_df.loc[course_df['Course_Code'] == 'SE 490', 'Course_Reviews'].values[0])
course_df.iloc[[0,366]]

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,114,22%,11%,25%,"[A bird course, easy to get 90+, but it is use...","[87/90, 83/90, 88/90, 68/90, 88/90, 56/84, 88/..."
366,SE 490,Design Project 1,55,7,64%,74%,62%,[The purpose of the course is just to show you...,"[85/100, 32/40, 30/30]"


Replace the course enrollments with the number of students that have taken the course, we don't really need the course cap.

In [12]:
def add_enrollments(list):
    students = 0
    for lecture in list:
        if lecture != 'no data':
            students += int(lecture.split("/")[0])
    return students

course_df["Course_Enrollment"] = course_df["Course_Enrollment"].apply(add_enrollments)
course_df

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,114,22%,11%,25%,"[A bird course, easy to get 90+, but it is use...",4359
1,MATH 135,Algebra for Honours Mathematics,1555,338 comments,84%,43%,78%,"[Very easy and interesting course, no concepts...",7597
2,ECON 101,Introduction to Microeconomics,1398,264 comments,63%,70%,45%,[you can just google everything but its just f...,6247
3,MATH 137,Calculus 1 for Honours Mathematics,1036,211 comments,84%,56%,68%,"[Easy course, The course itself is somewhat ea...",8237
4,PD 1,Career Fundamentals,1000,189 comments,19%,80%,7%,[The only effect of this course is to add pres...,5790
...,...,...,...,...,...,...,...,...,...
7437,GEOG 662,Transforming Canadian Resource Management,0,0,0,0,0,[No reviews],0
7438,ERS 625,Qualitative Methods in Geography,0,0,0,0,0,[No reviews],0
7439,HLTH 725,Sociology of Health,0,0,0,0,0,[No reviews],0
7440,ME 655,Advanced Building Energy Analysis,0,0,0,0,0,[No reviews],0


Now we replace the percentages with their actual values

In [13]:
course_df['Useful'] = course_df['Useful'].apply(lambda x: x.replace('%', ''))
course_df['Easy'] = course_df['Easy'].apply(lambda x: x.replace('%', ''))
course_df['Liked'] = course_df['Liked'].apply(lambda x: x.replace('%', ''))

for index, row in course_df.iterrows():
    course_df.loc[index, 'Useful'] = round((int(row['Useful'])/100) * int(row['Number_of_Ratings']))
    course_df.loc[index, 'Easy'] = round((int(row['Easy'])/100) * int(row['Number_of_Ratings']))
    course_df.loc[index, 'Liked'] = round((int(row['Liked'])/100) * int(row['Number_of_Ratings']))
     
course_df

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,114,485,243,552,"[A bird course, easy to get 90+, but it is use...",4359
1,MATH 135,Algebra for Honours Mathematics,1555,338 comments,1306,669,1213,"[Very easy and interesting course, no concepts...",7597
2,ECON 101,Introduction to Microeconomics,1398,264 comments,881,979,629,[you can just google everything but its just f...,6247
3,MATH 137,Calculus 1 for Honours Mathematics,1036,211 comments,870,580,704,"[Easy course, The course itself is somewhat ea...",8237
4,PD 1,Career Fundamentals,1000,189 comments,190,800,70,[The only effect of this course is to add pres...,5790
...,...,...,...,...,...,...,...,...,...
7437,GEOG 662,Transforming Canadian Resource Management,0,0,0,0,0,[No reviews],0
7438,ERS 625,Qualitative Methods in Geography,0,0,0,0,0,[No reviews],0
7439,HLTH 725,Sociology of Health,0,0,0,0,0,[No reviews],0
7440,ME 655,Advanced Building Energy Analysis,0,0,0,0,0,[No reviews],0


In [14]:
course_df['Number_of_Comments'] = course_df['Number_of_Comments'].apply(lambda x: str(x).replace(' comments', ''))
course_df

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment
0,CS 115,Introduction to Computer Science 1,2206,114,485,243,552,"[A bird course, easy to get 90+, but it is use...",4359
1,MATH 135,Algebra for Honours Mathematics,1555,338,1306,669,1213,"[Very easy and interesting course, no concepts...",7597
2,ECON 101,Introduction to Microeconomics,1398,264,881,979,629,[you can just google everything but its just f...,6247
3,MATH 137,Calculus 1 for Honours Mathematics,1036,211,870,580,704,"[Easy course, The course itself is somewhat ea...",8237
4,PD 1,Career Fundamentals,1000,189,190,800,70,[The only effect of this course is to add pres...,5790
...,...,...,...,...,...,...,...,...,...
7437,GEOG 662,Transforming Canadian Resource Management,0,0,0,0,0,[No reviews],0
7438,ERS 625,Qualitative Methods in Geography,0,0,0,0,0,[No reviews],0
7439,HLTH 725,Sociology of Health,0,0,0,0,0,[No reviews],0
7440,ME 655,Advanced Building Energy Analysis,0,0,0,0,0,[No reviews],0


Seems clean enough, let's move onto the professor table

In [15]:
prof_df.head()

Unnamed: 0,Professor_Name,Course,Liked_%,Professor_Reviews
0,Lori Michelle Case,CS 115,77%,"['clear prof', 'Lori is an awesome professor. ..."
1,Naomi Nishimura,CS 115,70%,"['Not really engaging but helpful.', 'Definite..."
2,Troy Vasiga,CS 115,96%,"['One of the best CS teacher, his lecture is v..."
3,Victoria Sakhnini,CS 115,85%,['Great prof. Explains concepts very well and ...
4,Sandy Graham,CS 115,74%,"['Clear in her explanations.', ""Her clicker so..."


In [16]:
prof_df.shape

(9100, 4)

In [17]:
prof_df['Professor_Reviews'] = prof_df['Professor_Reviews'].apply(ast.literal_eval)
prof_df['Professor_Name'].nunique()

3575

In [18]:
# profs = pd.DataFrame(prof_df['Professor_Name'].unique().copy(), columns=['Professor_Name'])
# profs['Courses_Taught'] = pd.Series(dtype=object)
# profs['Professor_Reviews'] = pd.Series(dtype=object)
# profs['Liked'] = pd.Series(dtype=str)

# for index, prof in profs.iterrows():
#     course_list = []
#     reviews_list = []
#     name = profs.loc[index, 'Professor_Name']
    
#     for i in range(len(prof_df.loc[prof_df['Professor_Name'] == name])):
#         course = prof_df.loc[prof_df['Professor_Name'] == name, 'Course'].values[i]
#         reviews = prof_df.loc[prof_df['Professor_Name'] == name, 'Professor_Reviews'].values[i]
#         reviews_list.append(reviews)
#         course_list.append(course)
        
#     profs.loc[index, 'Courses_Taught'] = course_list
#     profs.loc[index, 'Professor_Reviews'] = reviews_list
#     profs.loc[index, 'Liked'] = prof_df.loc[prof_df['Professor_Name'] == name, 'Liked_%'].values[0]
# profs

Above code works, but it's a brute force method and takes longer than I'd want it to. Below is an implementation of the same functionality but using .groupby()

In [19]:
new_prof_df = prof_df.groupby('Professor_Name').agg(
{
    'Course':lambda x: list(x),
    'Professor_Reviews':lambda x: list(x),
    'Liked_%': 'first'
}).reset_index().copy()

new_prof_df = new_prof_df.rename(columns={'Professor_Name':'Professor', 'Course':'Courses_Taught'})
# Ok wow this is way faster lol

In [20]:
new_prof_df['Clear'] = pd.Series(dtype=str)
new_prof_df['Engaging']=pd.Series(dtype=str)
new_prof_df['Number_of_Comments']=pd.Series(dtype=str)
new_prof_df['Number_of_Ratings']= pd.Series(dtype=str)

So I noticed some data missing such as number of ratings and some more that might be useful. Will be adding that in using selenium

In [21]:
new_prof_df[new_prof_df.isnull().any(axis=1)]

Unnamed: 0,Professor,Courses_Taught,Professor_Reviews,Liked_%,Clear,Engaging,Number_of_Comments,Number_of_Ratings
0,Aakar Gupta,[CS 230],"[[TA was more clear and engaging, Doesn't real...",20%,,,,
1,Aaron Daniel Ettinger,[PSCI 387],[[He knows his material and is not a strict pr...,91%,,,,
2,Aaron Hutchinson,[MATH 115],"[[There aren't any lectures this term, so I ca...",90%,,,,
3,Aaron Kay,"[PSYCH 253, PSYCH 395]",[[By far the best prof ive ever had. He is a g...,,,,,
4,Aaron Smith,"[MATH 115, MATH 211, PMATH 467]",[[I believe our class was the first class he t...,80%,,,,
...,...,...,...,...,...,...,...,...
3570,rebecca shantz,[MUSIC 270],[[She clearly outlines expectations before ass...,100%,,,,
3571,shahla Aliakbari,"[AMATH 250, AMATH 351]",[[Assume everyone knows basic physics rules an...,47%,,,,
3572,shalah aliakbari,[ECE 205],[[The professor makes very little effort in en...,31%,,,,
3573,tsen,[CHEM 120],[[Very good prof considering I think he's new ...,100%,,,,


The new data we get from selenium will probably replace some of these null values, will see if any further refining will be needed after obtaining the data

In [22]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import re, string
from Helper_functions import *
import itertools

options = webdriver.ChromeOptions()
options.binary_location = './chrome-win64/chrome.exe'
driver = webdriver.Chrome(options=options)
pattern = re.compile('[^a-zA-Z]')

In [23]:
def generate_urls(name):
    chars = list('abcdefghijklmnopqrstuvwxyz_')
    name = name.lower()
    # print(name)
    special_char = [char for char in name if char not in chars]
    # print(special_char)
    
    combinations = []
    for i in range(len(special_char) + 1):
        combinations.extend(itertools.combinations(special_char, i))
    
    combos = [list(sublist) for sublist in combinations]
    # print(combos)
    
    urls = []
    for combo in combos:
        urls.append(''.join([char for char in name if char not in combo]))
    # print(urls)
    if "-" in name:
        urls.append(name.replace("-", "_"))
    return urls

In [None]:
for index, name in enumerate(new_prof_df['Professor']):
    # if index == 5:
    #     break
    try:
        _name = name.replace(" ", "_")
        #_name = pattern.sub('', _name)
        #_name = "Amit_M._Mehta"
        url = f'https://uwflow.com/professor/{_name}'
        driver.get(url)
        # weird looking xpath to make it case insensitive
        wait_for_element(driver, f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{name.lower().split(' ')[0]}') or contains(text(), \"That professor doesn't exist!\")]")

        if driver.find_elements(By.XPATH, "//*[contains(text(), \"That professor doesn't exist!\")]"):
            name_reversed_list = name.split(" ")
            name_reversed_list.reverse()
            reverse_name = ""
            for i, name in enumerate(name_reversed_list):
                reverse_name += name
                if i != len(name_reversed_list) - 1:
                    reverse_name += " "

            possible_urls = generate_urls(_name)
            possible_urls.append(pattern.sub('_', name))
            possible_urls.extend(generate_urls(reverse_name.replace(" ", "_")))
            possible_urls.append(pattern.sub('_', reverse_name))
            # print(name)
            for combination in possible_urls:
                url = f'https://uwflow.com/professor/{combination}'
                # print(url)
                # print(index)
                driver.get(url)
                wait_for_element(driver, f"//*[contains(text(), '{name}') or contains(text(), \"That professor doesn't exist!\")]")

                if not driver.find_elements(By.XPATH, "//*[contains(text(), \"That professor doesn't exist!\")]"):
                    break

        wait_for_element(driver, "(//div[contains(text(), 'Clear')]//following::div[contains(text(), '%') or contains(text(), 'N/A')][1])[1]")
        clear = driver.find_elements(By.XPATH, "(//div[contains(text(), 'Clear')]//following::div[contains(text(), '%') or contains(text(), 'N/A')][1])[1]")[0].text
        engaging = driver.find_elements(By.XPATH, "(//div[contains(text(), 'Engaging')]//following::div[contains(text(), '%') or contains(text(), 'N/A')][1])[1]")[0].text
        liked = driver.find_elements(By.XPATH, "(//div[contains(text(), 'liked')]//preceding::div[contains(text(), '%') or contains(text(), 'N/A')][1])[1]")[0].text

        wait_for_element(driver, "//a[contains(normalize-space(),'comments') or contains(normalize-space(), comment)]")
        if driver.find_elements(By.XPATH, "//a[contains(normalize-space(),'comments') or contains(normalize-space(), 'comment')]"):
            number_of_comments = driver.find_elements(By.XPATH, "//a[contains(normalize-space(),'comments') or contains(normalize-space(), 'comment')]")[0].text
        else:
            number_of_comments = 0
        wait_for_element(driver, "(//a[contains(normalize-space(),'comments') or contains(normalize-space(), 'comment')]//following::*[1])[1]")
        if driver.find_elements(By.XPATH, "(//a[contains(normalize-space(),'comments') or contains(normalize-space(), 'comment')]//following::*[1])[1]"):
            number_of_ratings = driver.find_elements(By.XPATH, "(//a[contains(normalize-space(),'comments') or contains(normalize-space(), 'comment')]//following::*[1])[1]")[0].text
        else:
            number_of_ratings = 0

        new_prof_df.loc[index, 'Clear'] = clear
        new_prof_df.loc[index, 'Engaging'] = engaging
        new_prof_df.loc[index, 'liked'] = liked
        new_prof_df.loc[index, 'Number_of_Ratings'] = number_of_ratings
        new_prof_df.loc[index, 'Number_of_Comments'] = number_of_comments

        time.sleep(.5)
    except Exception as e:
        # print(e)
        continue

driver.quit()
new_prof_df

In [29]:
new_prof_df.to_csv("cleaned_prof_data.csv", index=False)

In [33]:
course_df['Department'] = pd.Series(dtype=str)

def generate_department(row, department_dict):
    course_code = row["Course_Code"].split(" ")[0]
    
    for key in department_dict.keys():
        value = department_dict[f"{key}"]
        if course_code in value:
            return key
    return 'empty'

course_df['Department'] = course_df['Department'].apply(generate_department, department_dict=department, axis=1)
course_df

Unnamed: 0,Course_Code,Course_Name,Number_of_Ratings,Number_of_Comments,Useful,Easy,Liked,Course_Reviews,Course_Enrollment,Department
0,CS 115,Introduction to Computer Science 1,2206,114,485,243,552,"[A bird course, easy to get 90+, but it is use...",4359,Math
1,MATH 135,Algebra for Honours Mathematics,1555,338,1306,669,1213,"[Very easy and interesting course, no concepts...",7597,Math
2,ECON 101,Introduction to Microeconomics,1398,264,881,979,629,[you can just google everything but its just f...,6247,Math
3,MATH 137,Calculus 1 for Honours Mathematics,1036,211,870,580,704,"[Easy course, The course itself is somewhat ea...",8237,Math
4,PD 1,Career Fundamentals,1000,189,190,800,70,[The only effect of this course is to add pres...,5790,Coop
...,...,...,...,...,...,...,...,...,...,...
7437,GEOG 662,Transforming Canadian Resource Management,0,0,0,0,0,[No reviews],0,Environment
7438,ERS 625,Qualitative Methods in Geography,0,0,0,0,0,[No reviews],0,Environment
7439,HLTH 725,Sociology of Health,0,0,0,0,0,[No reviews],0,Health
7440,ME 655,Advanced Building Energy Analysis,0,0,0,0,0,[No reviews],0,"Engineering,"


In [36]:
new_prof_df.head()

Unnamed: 0,Professor,Courses_Taught,Professor_Reviews,Liked_%,Clear,Engaging,Number_of_Comments,Number_of_Ratings,liked
0,Aakar Gupta,[CS 230],"[[TA was more clear and engaging, Doesn't real...",20%,50%,0%,2 comments,2 ratings,20%
1,Aaron Daniel Ettinger,[PSCI 387],[[He knows his material and is not a strict pr...,91%,,,,,
2,Aaron Hutchinson,[MATH 115],"[[There aren't any lectures this term, so I ca...",90%,100%,75%,1 comment,2 ratings,90%
3,Aaron Kay,"[PSYCH 253, PSYCH 395]",[[By far the best prof ive ever had. He is a g...,,,,9 comments,0 ratings,
4,Aaron Smith,"[MATH 115, MATH 211, PMATH 467]",[[I believe our class was the first class he t...,80%,67%,100%,4 comments,3 ratings,80%


In [38]:
def generate_departments_prof(row, departments_dict):
    course_codes = set([course.split(" ")[0] for course in row['Courses_Taught']])
    
    departments = [dept for dept, codes in departments_dict.items() if course_codes & set(codes)]
    return set(departments)

new_prof_df['Department'] = new_prof_df.apply(generate_departments_prof, departments_dict=department, axis=1)
new_prof_df

Unnamed: 0,Professor,Courses_Taught,Professor_Reviews,Liked_%,Clear,Engaging,Number_of_Comments,Number_of_Ratings,liked,Department
0,Aakar Gupta,[CS 230],"[[TA was more clear and engaging, Doesn't real...",20%,50%,0%,2 comments,2 ratings,20%,{Math}
1,Aaron Daniel Ettinger,[PSCI 387],[[He knows his material and is not a strict pr...,91%,,,,,,{Science}
2,Aaron Hutchinson,[MATH 115],"[[There aren't any lectures this term, so I ca...",90%,100%,75%,1 comment,2 ratings,90%,{Math}
3,Aaron Kay,"[PSYCH 253, PSYCH 395]",[[By far the best prof ive ever had. He is a g...,,,,9 comments,0 ratings,,{Arts}
4,Aaron Smith,"[MATH 115, MATH 211, PMATH 467]",[[I believe our class was the first class he t...,80%,67%,100%,4 comments,3 ratings,80%,{Math}
...,...,...,...,...,...,...,...,...,...,...
3570,rebecca shantz,[MUSIC 270],[[She clearly outlines expectations before ass...,100%,100%,100%,2 comments,3 ratings,100%,{Arts}
3571,shahla Aliakbari,"[AMATH 250, AMATH 351]",[[Assume everyone knows basic physics rules an...,47%,55%,50%,6 comments,11 ratings,47%,{Math}
3572,shalah aliakbari,[ECE 205],[[The professor makes very little effort in en...,31%,43%,0%,1 comment,7 ratings,31%,"{Engineering,}"
3573,tsen,[CHEM 120],[[Very good prof considering I think he's new ...,100%,100%,100%,2 comments,3 ratings,100%,{Science}
