In [1]:
# scrapes faculty directory for prof names

import requests
from bs4 import BeautifulSoup
import pandas as pd


faculty_name = []

# manually got the max num of pages in directory
for i in range(1, 34):
    URL = 'https://faculty-directory.dartmouth.edu/department?dept=All&page=' + str(i)

    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")

    result = soup.find_all('span', 'class' == 'field-content')
    faculty_name += [r.text for r in result if r.text != '']

In [2]:
# df of all faculty at Dartmouth
faculty_name = set(faculty_name)
prof_df = pd.DataFrame(faculty_name, columns=['ProfessorName'])
prof_df

Unnamed: 0,ProfessorName
0,Margaretha Kramer-Hajos
1,Sabrina Joan Billings
2,Prasad Jayanti
3,Hsien-Chih Chang
4,James E. Dobson
...,...
811,Sandra S. Mefoude Obiono
812,G. Eric Schaller
813,Thomas P. Jack
814,James Goodwin Rice


In [3]:
# reads in course reviews for professor names
import pandas as pd
course_review_db = pd.read_csv('../data/course_review_db.csv', index_col = 0)
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review
0,AAAS007: First Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate..."
1,AAAS007: First Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...
2,AAAS007: First Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...
3,AAAS007: First Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...
4,AAAS007: First Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...
...,...,...,...,...,...
56922,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia Monroe,I enjoyed the course and learned a lot about h...
56923,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia Monroe,I think of and read history in a new way now. ...
56924,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia Monroe,It made me like studying history a bit more.
56925,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia Monroe,It made me more optimistic. I found the Profes...


In [4]:
# Sample professor who has multiple names
course_review_db[course_review_db['Prof'] == 'Sebastiaan Joosten']

Unnamed: 0,Course,Department,Term,Prof,Review
6267,COSC030: Discrete Mathematics in Computer Science,COSC,21W,Sebastiaan Joosten,(Reminder for future readers: this term was du...
6274,COSC030: Discrete Mathematics in Computer Science,COSC,20X,Sebastiaan Joosten,"This class is normally bad, but Joosten made i..."
6275,COSC030: Discrete Mathematics in Computer Science,COSC,20X,Sebastiaan Joosten,I would definitely not recommend taking this c...
6281,COSC030: Discrete Mathematics in Computer Science,COSC,20S,Sebastiaan Joosten,Quality of teaching improved dramatically as t...
6562,COSC069.14: Functional Programming in Haskell,COSC.,20W,Sebastiaan Joosten,Joosten is not the best at explaining concepts...


In [5]:
course_review_db[course_review_db['Prof'] == 'Sebastiaan Joze Joosten']

Unnamed: 0,Course,Department,Term,Prof,Review
6535,COSC060: Computer Networks,COSC,21F,Sebastiaan Joze Joosten,Honestly pretty good class. Really cool and re...
6536,COSC060: Computer Networks,COSC,21F,Sebastiaan Joze Joosten,Very lecture heavy. Some lectures were made by...
6537,COSC060: Computer Networks,COSC,21F,Sebastiaan Joze Joosten,This class was all over the map. Joosten isn't...
20717,COSC 030: Discrete Math Computer Sci,COSC,20X,Sebastiaan Joze Joosten,"2 Midterms + Final Exam Drills due M, W, F Wee..."
20718,COSC 030: Discrete Math Computer Sci,COSC,20X,Sebastiaan Joze Joosten,A little too much work with daily drills and all
...,...,...,...,...,...
22385,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan Joze Joosten,Learned a lot about Flask.
22386,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan Joze Joosten,Loved this class! we made a project that we ar...
22387,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan Joze Joosten,This really was a culminating experience! I go...
22388,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan Joze Joosten,it was the culminating course for my major


In [6]:
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np

# sets up linking column between directory and review dataframe
dataframecolumn = pd.DataFrame(prof_df['ProfessorName'])
dataframecolumn.columns = ['directory_name']
compare = pd.DataFrame(set(course_review_db['Prof']))
compare.columns = ['fuzzy_match_name']

# merges names into combined dataframe
dataframecolumn['Key'] = 1
compare['Key'] = 1
combined_dataframe = dataframecolumn.merge(compare,on="Key",how="left")
combined_dataframe = combined_dataframe[~(combined_dataframe.directory_name==combined_dataframe.fuzzy_match_name)]

def partial_match(x,y):
    return(fuzz.ratio(x,y))
partial_match_vector = np.vectorize(partial_match)

# looks for names that are likely to match on directory name, ignore exact match
combined_dataframe['score']=partial_match_vector(combined_dataframe['directory_name'],combined_dataframe['fuzzy_match_name'])
combined_dataframe = combined_dataframe[combined_dataframe.score>=85]
combined_dataframe = combined_dataframe[combined_dataframe.score!=100]
combined_dataframe



Unnamed: 0,directory_name,Key,fuzzy_match_name,score
467,Margaretha Kramer-Hajos,1,Margaretha Kramer,85
2070,Sabrina Joan Billings,1,Sabrina Billings,86
6759,Hsien-Chih Chang,1,Hsien-chih Chang,94
8748,James E. Dobson,1,James Dobson,89
11017,Miles P. Blencowe,1,Miles Blencowe,90
...,...,...,...,...
1424015,Nurit Ben-Yehuda,1,Nurit Ben-yehuda,91
1431102,Hussein N. Kadhim,1,Hussein Kadhim,90
1433488,Sandra S. Mefoude Obiono,1,Sandra Mefoude Obiono,93
1438318,Thomas P. Jack,1,Thomas Jack,88


In [7]:
# sample name that fuzzy matches to two different names with same directory name
combined_dataframe[combined_dataframe['directory_name'] == 'Sebastiaan J. C. Joosten'].sort_values('score')

Unnamed: 0,directory_name,Key,fuzzy_match_name,score
645253,Sebastiaan J. C. Joosten,1,Sebastiaan Joze Joosten,85
646285,Sebastiaan J. C. Joosten,1,Sebastiaan Joosten,86


In [8]:
# adjusts professor name based on fuzzy matching results
def fuzzmatcher(row):
    if combined_dataframe['fuzzy_match_name'].str.startswith(row['Prof']).any():
        return list(combined_dataframe[combined_dataframe['fuzzy_match_name'].str.startswith(row['Prof'])]['directory_name'])[0]
    else:
        return row['Prof']
    
course_review_db['Prof'] = course_review_db.apply(fuzzmatcher, axis = 1)
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review
0,AAAS007: First Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate..."
1,AAAS007: First Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...
2,AAAS007: First Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...
3,AAAS007: First Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...
4,AAAS007: First Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...
...,...,...,...,...,...
56922,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I enjoyed the course and learned a lot about h...
56923,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I think of and read history in a new way now. ...
56924,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me like studying history a bit more.
56925,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me more optimistic. I found the Profes...


In [9]:
# newly merged professor name
course_review_db[course_review_db['Prof'] == 'Sebastiaan J. C. Joosten']

Unnamed: 0,Course,Department,Term,Prof,Review
6267,COSC030: Discrete Mathematics in Computer Science,COSC,21W,Sebastiaan J. C. Joosten,(Reminder for future readers: this term was du...
6274,COSC030: Discrete Mathematics in Computer Science,COSC,20X,Sebastiaan J. C. Joosten,"This class is normally bad, but Joosten made i..."
6275,COSC030: Discrete Mathematics in Computer Science,COSC,20X,Sebastiaan J. C. Joosten,I would definitely not recommend taking this c...
6281,COSC030: Discrete Mathematics in Computer Science,COSC,20S,Sebastiaan J. C. Joosten,Quality of teaching improved dramatically as t...
6535,COSC060: Computer Networks,COSC,21F,Sebastiaan J. C. Joosten,Honestly pretty good class. Really cool and re...
...,...,...,...,...,...
22385,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan J. C. Joosten,Learned a lot about Flask.
22386,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan J. C. Joosten,Loved this class! we made a project that we ar...
22387,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan J. C. Joosten,This really was a culminating experience! I go...
22388,COSC 98.02: Sr. Design&Impl Project II,COSC,21S,Sebastiaan J. C. Joosten,it was the culminating course for my major


In [10]:
prof_df.to_csv('../data/professor_directory.csv')

In [11]:
# Example of mismatch course number/name
course_review_db[course_review_db['Course'].str.contains('Oceania')]

Unnamed: 0,Course,Department,Term,Prof,Review
1213,ANTH050.34: Peoples of Oceania,ANTH.,21W,Meredith Ferguson,Take this class!!! Professor Ferguson is the m...
1214,ANTH050.34: Peoples of Oceania,ANTH.,21F,Meredith Ferguson,The easiest class I have took at Dartmouth. Ba...
1215,ANTH050.34: Peoples of Oceania,ANTH.,21F,Meredith Ferguson,By far the easiest class I have ever taken. I ...
1216,ANTH050.34: Peoples of Oceania,ANTH.,21F,Meredith Ferguson,The least work I've ever had for a Dartmouth c...
1217,ANTH050.34: Peoples of Oceania,ANTH.,20W,Meredith Ferguson,10/10 would recommend this class. Professor Fe...
...,...,...,...,...,...
2557,ANTH 50.34: Peoples of Oceania,ANTH,21W,Meredith Ferguson,Made me want to take more classes with her
2558,ANTH 50.34: Peoples of Oceania,ANTH,21W,Meredith Ferguson,Reaffirmed my desire to be an anthro major and...
2559,ANTH 50.34: Peoples of Oceania,ANTH,21W,Meredith Ferguson,This was the first anthropology class I've tak...
2560,ANTH 50.34: Peoples of Oceania,ANTH,21W,Meredith Ferguson,Was able to branch out into new subjects


In [12]:
course_db = pd.read_csv('../data/course_db.csv')
course_db

Unnamed: 0,Course,Distribs,Quality,Reviews
0,AAAS007: First Year Seminar,,1.0,8.0
1,AAAS009: Introduction to AAAS Diaspora Studies,"CI, INT",1.0,2.0
2,AAAS010: Introduction to African-American Studies,"SOC, CI",6.0,14.0
3,AAAS011: Introduction to African Studies,"SOC, NW",7.0,9.0
4,AAAS012: Race and Slavery in U.S. History,"SOC, W",6.0,9.0
...,...,...,...,...
1800,WRIT007.33: Controv Sci and Pub Opinion,SOC,0.0,1.0
1801,WRIT007: First-year Seminar in Writing,,4.0,10.0
1802,WRIT008: Writing with Media,ART,1.0,5.0
1803,WRIT042: The Art of Science Writing,,1.0,1.0


In [13]:
from fuzzywuzzy import fuzz
import pandas as pd
import numpy as np

# sets up linking column between directory and review dataframe
dataframecolumn = pd.DataFrame(course_db['Course'])
dataframecolumn.columns = ['directory_name']
compare = pd.DataFrame(set(course_review_db['Course']))
compare.columns = ['fuzzy_match_name']

# merges names into combined dataframe
dataframecolumn['Key'] = 1
compare['Key'] = 1
combined_dataframe = dataframecolumn.merge(compare,on="Key",how="left")
combined_dataframe = combined_dataframe[~(combined_dataframe.directory_name==combined_dataframe.fuzzy_match_name)]

def partial_match(x,y):
    return(fuzz.ratio(x,y))
partial_match_vector = np.vectorize(partial_match)

# looks for names that are likely to match on directory name, ignore exact match
combined_dataframe['score']=partial_match_vector(combined_dataframe['directory_name'],combined_dataframe['fuzzy_match_name'])
combined_dataframe = combined_dataframe[combined_dataframe.score>=85]
combined_dataframe = combined_dataframe[combined_dataframe.score!=100]
combined_dataframe

Unnamed: 0,directory_name,Key,fuzzy_match_name,score
2195,AAAS007: First Year Seminar,1,MUS007: First-Year Seminar,87
5812,AAAS010: Introduction to African-American Studies,1,AAAS011: Introduction to African Studies,88
9832,AAAS011: Introduction to African Studies,1,AAAS010: Introduction to African-American Studies,88
12378,AAAS012: Race and Slavery in U.S. History,1,HIST016: Race and Slavery in US History,88
16654,AAAS015: History of Africa since 1800,1,HIST066: History of Africa since 1800,86
...,...,...,...,...
4784251,WRIT003: Composition and Research: II,1,WRIT002: Composition and Research: I,96
4788684,WRIT007.14: Learning from Dartmouth,1,WRIT 07.14: Learning from Dartmouth,97
4794514,WRIT007.22: Technology and Sport,1,WRIT 07.22: Technology and Sport,97
4803507,WRIT007.27: Philosophy of Science,1,PHIL027: Philosophy of Science,86


In [14]:
# adjusts professor name based on fuzzy matching results
def fuzzmatcher2(row):
    if combined_dataframe['fuzzy_match_name'].str.startswith(row['Course']).any():
        return list(combined_dataframe[combined_dataframe['fuzzy_match_name'].str.startswith(row['Course'])]['directory_name'])[0]
    else:
        return row['Course']
    
course_review_db['Course'] = course_review_db.apply(fuzzmatcher2, axis = 1)
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review
0,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate..."
1,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...
2,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...
3,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...
4,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...
...,...,...,...,...,...
56922,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I enjoyed the course and learned a lot about h...
56923,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I think of and read history in a new way now. ...
56924,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me like studying history a bit more.
56925,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me more optimistic. I found the Profes...


In [15]:
course_review_db.to_csv('../data/course_review_db_prof_course_name_cleaned.csv')