In [15]:
from datascience import *
import pandas as pd
import numpy as np
import nltk
import gensim
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import datasets
from sklearn import metrics
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#Get data
rmp = pd.read_csv("rmf-with-gender.csv")

#Drop rows with missing values
rmp = rmp.dropna()

#Convert dataframe to table 
rmp = Table.from_df(rmp)

In [6]:
#Descriptive statistics 

#Sample size 
N = rmp.num_rows
unique_instructors = rmp.group("Prof_Name").num_rows 

#Gender distribution 
num_f = sum ([1 for i in rmp.column("Prof_Gender") if i == "F"])
prop_f = num_f / N

num_m = sum ([1 for i in rmp.column("Prof_Gender") if i == "M"])
prop_m = num_m / N

#Rating distribution across gender
cross_tab = rmp.group(["Prof_Gender", "Rating_Type"])
cross_tab = cross_tab.with_column("Proportion", cross_tab.column(2)/N).sort("Proportion", descending = True)

cross_tab_f = cross_tab.where("Prof_Gender", "F").drop("Proportion")
cross_tab_f = cross_tab_f.with_column("Proportion", cross_tab_f.column("count")/num_f)

cross_tab_m = cross_tab.where("Prof_Gender", "M").drop("Proportion")
cross_tab_m = cross_tab_m.with_column("Proportion", cross_tab_m.column("count")/num_m)

In [7]:
#Preprocessing function

def clean (text): 
    from nltk.corpus import stopwords
    from string import punctuation
    from nltk.tokenize import word_tokenize
    
    stopwords = stopwords.words("english")
    punctuation = list(punctuation)
    gender_pronouns = ["he", "him", "his", "she", "her", "hers", "man", "woman", "himself", "herself"]
    
    #Tokenize
    tokens = [word_tokenize(i) for i in text]
    
    #Convert all words to lowercase
    lower_case = [[i.lower() for i in list] for list in tokens]
    
    #Remove stop words, punctuation, and gender indicators 
    no_stopwords = [[i for i in list if i not in stopwords] for list in lower_case]
    no_punctuation = [[i for i in list if i not in punctuation] for list in no_stopwords]
    no_gender = [[i for i in list if i not in gender_pronouns] for list in no_punctuation]
    
    #Stem
    snow = nltk.stem.SnowballStemmer('english')
    stem = [[snow.stem(i) for i in list] for list in no_gender]
    
    #Combine words into single comment
    cleanText = [" ".join(i) for i in stem] 
    
    return cleanText

In [8]:
#Clean and isolate evaluations and ratings
evaluations = clean(rmp.column("Comment"))
ratings = rmp.column("Rating_Type")

#Shuffle data
evaluations, ratings = shuffle(evaluations, ratings, random_state=1)

#Split data into training and testing
evalTrain, evalTest, rateTrain, rateTest = train_test_split(evaluations, ratings, test_size=0.3, random_state=50)

In [9]:
#Build the classifier

#Tfidf values
tfidf = TfidfVectorizer()
tfidf.fit(evaluations)
evalTrain = tfidf.transform(evalTrain)
evalTest = tfidf.transform(evalTest)

In [10]:
#Logit model
logit = LogisticRegression()
logit.fit(evalTrain, rateTrain)
logit.score(evalTest, rateTest)

0.59363636363636363

In [None]:
#Fairness outcomes: logit model

In [13]:
#K-nearest neighbor model 
KNN = KNeighborsClassifier()
KNN.fit(evalTrain, rateTrain)
KNN.score(evalTest, rateTest)

0.53454545454545455

In [None]:
#Fairness outcomes: KNN model

In [17]:
#Support vector machine model 
svm = svm.SVC()
svm.fit(evalTrain, rateTrain)
svm.score(evalTest, rateTest)

0.53272727272727272

In [None]:
#Fairness outcomes: SVM model