In [1]:
# reads in course reviews for professor names
import pandas as pd
course_review_db = pd.read_csv('../data/course_review_db_prof_course_name_cleaned.csv', index_col = 0)
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review
0,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate..."
1,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...
2,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...
3,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...
4,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...
...,...,...,...,...,...
56922,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I enjoyed the course and learned a lot about h...
56923,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I think of and read history in a new way now. ...
56924,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me like studying history a bit more.
56925,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me more optimistic. I found the Profes...


In [2]:
import re

def words_in_string(word_list, a_string):
    return len(set(word_list).intersection(a_string.split())) > 0

# assigns professor gender based on the pronouns used within review
def determine_prof_gender(row):
    raw_text = row['Review'].lower()
    
    male_words = ['he', 'his', 'himself']
    female_words = ['she', 'her', 'herself']
    
    if words_in_string(male_words, raw_text):
        return 'Male'
    elif words_in_string(female_words, raw_text):
        return 'Female'
    else:
        return None
    
course_review_db['Prof Gender'] = course_review_db.apply(determine_prof_gender, axis = 1)
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review,Prof Gender
0,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate...",Male
1,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...,
2,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...,Male
3,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...,
4,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...,Female
...,...,...,...,...,...,...
56922,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I enjoyed the course and learned a lot about h...,
56923,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I think of and read history in a new way now. ...,
56924,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me like studying history a bit more.,
56925,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me more optimistic. I found the Profes...,


In [3]:
# counts how many times a professor was tagged as for each gender 
prof_directory = course_review_db.groupby(['Prof', 'Prof Gender']).count().reset_index()
prof_directory = prof_directory.groupby('Prof', group_keys=False).apply(lambda x: x.loc[x['Course'].idxmax()])
prof_directory = prof_directory.drop(columns=['Prof']).reset_index()
prof_directory

Unnamed: 0,Prof,Prof Gender,Course,Department,Term,Review
0,A. Kevin,Male,26,26,26,26
1,A. Kevin Reinhart,Male,4,4,4,4
2,Aaron Spink,Male,1,1,1,1
3,Abigail A.,Female,33,33,33,33
4,Ada Cohen,Female,10,10,10,10
...,...,...,...,...,...,...
1508,Zaneta M. Thayer,Female,13,13,13,13
1509,Zenghong Chen,Female,4,4,4,4
1510,Zenovia Toloudi,Female,3,3,3,3
1511,Zi Chen,Male,6,6,6,6


In [4]:
# sample professor that was classified as both female and male, but majority female
prof_directory = course_review_db.groupby(['Prof', 'Prof Gender']).count().reset_index()
prof_directory[prof_directory.Prof == 'Lynda Boose']

Unnamed: 0,Prof,Prof Gender,Course,Department,Term,Review
980,Lynda Boose,Female,25,25,25,25
981,Lynda Boose,Male,1,1,1,1


In [5]:
# reassigns professors with gender count that is the highest
course_review_db = pd.merge(course_review_db,prof_directory[['Prof','Prof Gender']],on='Prof', how='left')
course_review_db = course_review_db.drop(columns=['Prof Gender_x'])
course_review_db = course_review_db.rename(columns={"Prof Gender_y": "Prof Gender"})
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review,Prof Gender
0,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate...",Male
1,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...,Male
2,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...,Male
3,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...,Female
4,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...,Female
...,...,...,...,...,...,...
107093,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I enjoyed the course and learned a lot about h...,Female
107094,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I think of and read history in a new way now. ...,Female
107095,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me like studying history a bit more.,Female
107096,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me more optimistic. I found the Profes...,Female


In [6]:
course_review_db[(course_review_db['Prof Gender'] != 'Male') & (course_review_db['Prof Gender'] != 'Female')]

Unnamed: 0,Course,Department,Term,Prof,Review,Prof Gender
101,REL054: African American Religion and Culture ...,AAAS.,17F,Vaughn A. Booker,Booker is a good prof and the class is well or...,
102,REL061: Religion and the Civil Rights Movement,AAAS,22W,Vaughn A. Booker,"Very heavy readings; however, I believe you ca...",
151,AAAS034: Early Black American Literature,AAAS,18S,Chante Mouton Kinyon,"Really interesting class. A ton of reading, bu...",
232,"AAAS088.14: Cities, Subjects & Sonic Africa",AAAS.,17F,Francesca Inglese,"If this course is offered, it's a must-take. G...",
272,AMES004: Introduction to Arabic Culture,AMES,07S,Min S.,but she's a visiting prof that's not going to ...,
...,...,...,...,...,...,...
102118,QSS030.01: Sports Analytics,QSS,19F,Philip Hanlon,"This was a very interesting course, and I'd li...",
102119,QSS030.01: Sports Analytics,QSS,19F,Philip Hanlon,Was awesome and reinforced my decision to be a...,
102120,QSS030.01: Sports Analytics,QSS,19F,Philip Hanlon,didn't expect the proofs,
102121,QSS030.01: Sports Analytics,QSS,19F,Philip Hanlon,g,


In [7]:
import random
from nltk.corpus import names
import nltk

def gender_features(word):
    return {'last_letter':word[-1]}

# adds already tagged male professors to data set
males = set([x.split(' ')[0] for x in course_review_db[course_review_db['Prof Gender'] == 'Male']['Prof']] )
males = [m for m in males if len(m) > 1 ]

# adds already tagged female professors to data set
females = set([x.split(' ')[0] for x in course_review_db[course_review_db['Prof Gender'] == 'Female']['Prof']] )
females = [f for f in females if len(f) > 1 ]

# uses names from nltk corpus to create classifier for gender 
labeled_names = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
Dartmouth_labeled_names =  ([(name, 'male') for name in males] + [(name, 'female') for name in females])
labeled_names = labeled_names + Dartmouth_labeled_names
random.shuffle(labeled_names)

# creates training and testing set for names 
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [8]:
# uses Naive Bayes model to clasify professor gender

def determine_prof_gender2(row):
    try:
        prof_first_name = row['Prof'].split(' ')[0]
        return classifier.classify(gender_features(prof_first_name)).capitalize()
    except:
        return None
    
course_review_db['Prof Gender'] = course_review_db.apply(lambda row: determine_prof_gender2(row) if (row['Prof Gender'] != 'Male') & (row['Prof Gender'] != 'Female') else row['Prof Gender'], axis=1)
course_review_db

Unnamed: 0,Course,Department,Term,Prof,Review,Prof Gender
0,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,"Never having studied African history, the mate...",Male
1,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,I personally am not that interested by the top...,Male
2,MUS007: First-Year Seminar,AAAS,08W,Lindsay F.,the professor was uninspiring and was easily i...,Male
3,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,The pace of this course is a little hectic. It...,Female
4,MUS007: First-Year Seminar,AAAS,07W,Ayo Coly,This class definitely had a lot of reading to ...,Female
...,...,...,...,...,...,...
107093,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I enjoyed the course and learned a lot about h...,Female
107094,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,I think of and read history in a new way now. ...,Female
107095,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me like studying history a bit more.,Female
107096,WRIT 07.24: Past Imperfect,WRIT,19S,Cynthia P. Monroe,It made me more optimistic. I found the Profes...,Female


In [9]:
course_review_db[(course_review_db['Prof Gender'] != 'Male') & (course_review_db['Prof Gender'] != 'Female')]

Unnamed: 0,Course,Department,Term,Prof,Review,Prof Gender


In [10]:
course_review_db.to_csv('../data/course_db_cleaned_final.csv')