In [1]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import common_texts, get_tmpfile, datapath
import numpy as np
import pickle
#Library for WEAT experiments:
from responsibly.we import calc_single_weat
from responsibly.we.data import WEAT_DATA

Enter the filepaths for the trained models below:

In [2]:
allredd2014path = 'Reddit/All comments/2014/word2vec-2014-all.model'
allredd2019path = 'Reddit/All comments/2019/word2vec-2019-all.model'
newsredd2014path = 'Reddit/News comments/2014/word2vec-2014-news.model'
newsredd2019path = 'Reddit/News comments/2019/word2vec-2019-news.model'
CNNpath = 'News/CNN/word2vec-cnn.model'
DailyMailpath = 'News/Daily Mail/word2vec-dm.model'

Load the models here:

In [3]:
allredd2014 = Word2Vec.load(allredd2014path)
allredd2019 = Word2Vec.load(allredd2019path)
newsredd2014 = Word2Vec.load(newsredd2014path)
newsredd2019 = Word2Vec.load(newsredd2019path)
cnn = Word2Vec.load(CNNpath)
dm = Word2Vec.load(DailyMailpath)
models = [allredd2014, allredd2019, newsredd2014, newsredd2019, cnn, dm]

Load the target and attribute words

Gender Bias Set 1:

In [4]:
malenames = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
femalenames = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']
career = ['executive', 'management', 'professional', 'corporation', 'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family', 'cousins', 'marriage', 'wedding', 'relatives']
#Dictionary form for Responsibly Library use:
m1 = {"name": "Male Names", "words": malenames}
f1 = {"name": "Female Names", "words": femalenames}
c = {"name": "Career Words", "words": career}
f = {"name": "Family Words", "words": family}

Gender Bias Set 2:

In [5]:
math = ['math', 'algebra', 'geometry', 'calculus', 'equations', 'fraction', 'numbers', 'addition']
arts = ['poetry', 'art', 'dance', 'literature', 'novel', 'symphony', 'drama', 'sculpture']
malepronouns = ['male', 'man', 'boy', 'brother', 'he', 'him', 'his', 'son']
femalepronouns = ['female', 'woman', 'girl', 'sister', 'she', 'her', 'hers', 'daughter']
#Dictionary form for Responsibly Library use:
m = {"name": "Math Words", "words": math}
ar = {"name": "Art Words", "words": arts}
m2 = {"name": "Male Pronouns", "words": malepronouns}
f2 = {"name": "Female Pronouns", "words": femalepronouns}

Gender Bias Set 3:

In [6]:
science = ['science', 'technology', 'physics', 'chemistry', 'einstein', 'nasa', 'experiment', 'astronomy']
arts2 = ['poetry', 'art', 'shakespeare', 'dance', 'literature', 'novel', 'symphony', 'drama']
malepronouns2 = ['brother', 'father', 'uncle', 'grandfather', 'son', 'he', 'his', 'him']
femalepronouns2 = ['sister', 'mother', 'aunt', 'grandmother', 'daughter', 'she', 'hers', 'her']
#Dictionary form for Responsibly Library use:
s = {"name": "Science Words", "words": science}
ar2 = {"name": "Art Words 2", "words": arts2}
m3 = {"name": "Male Pronouns", "words": malepronouns2}
f3 = {"name": "Female Pronouns", "words": femalepronouns2}

Race Bias Word Sets:

In [7]:
whitenames = ['harris', 'nelson', 'robinson', 'thompson', 'moore', 'wright', 'anderson', 'clark', 'jackson'] 
hispanicnames = ['cruz', 'castro', 'garcia', 'torres', 'martinez', 'gonzalez', 'sanchez', 'lopez', 'rodriguez']
asiannames = ['cho', 'wong', 'tang', 'hong', 'kim', 'chen', 'ng', 'wu', 'liu']
occupationsH = ['housekeeper', 'artist', 'janitor', 'dancer', 'mechanic', 'photographer', 'baker', 'cashier'] 
occupationsA = ['professor', 'official', 'secretary', 'conductor', 'physicist', 'scientist', 'chemist', 'accountant'] 
occupationsW = ['smith', 'blacksmith', 'surveyor', 'sheriff', 'weaver', 'administrator', 'statistician', 'clergy'] 
#Dictionary form for Responsibly Library use:
h = {"name": "Hispanic Names", "words": hispanicnames}
a = {"name": "Asian Names", "words": asiannames}
w = {"name": "White Names", "words": whitenames}
oh = {"name": "Rated Hispanic Occupations", "words": occupationsH}
oa = {"name": "Rated Asian Occupations", "words": occupationsA}
ow = {"name": "Rated White Occupations", "words": occupationsW}

Perfrom the WEAT trials, then load the results into arrays

In [8]:
wa = []
for i in models:
    tmp1 = calc_single_weat(i.wv, ow, oa, w, a, with_pvalue=True)
    tmp2 = [tmp1['s']] + [tmp1['d']] + [tmp1['p']]
    wa = wa + [tmp2]

In [9]:
wh = []
for i in models:
    tmp1 = calc_single_weat(i.wv, ow, oh, w, h, with_pvalue=True)
    tmp2 = [tmp1['s']] + [tmp1['d']] + [tmp1['p']]
    wh = wh + [tmp2]

In [10]:
ha = []
for i in models:
    tmp1 = calc_single_weat(i.wv, oh, oa, h, a, with_pvalue=True)
    tmp2 = [tmp1['s']] + [tmp1['d']] + [tmp1['p']]
    ha = ha + [tmp2]

In [11]:
gb1 = []
for i in models:
    tmp1 = calc_single_weat(i.wv, c, f, m1, f1, with_pvalue=True)
    tmp2 = [tmp1['s']] + [tmp1['d']] + [tmp1['p']]
    gb1 = gb1 + [tmp2]

In [12]:
gb2 = []
for i in models:
    tmp1 = calc_single_weat(i.wv, m, ar, m2, f2, with_pvalue=True)
    tmp2 = [tmp1['s']] + [tmp1['d']] + [tmp1['p']]
    gb2 = gb2 + [tmp2]

In [13]:
gb3 = []
for i in models:
    tmp1 = calc_single_weat(i.wv, s, ar2, m2, f2, with_pvalue=True)
    tmp2 = [tmp1['s']] + [tmp1['d']] + [tmp1['p']]
    gb3 = gb3 + [tmp2]

Organize the results into tables

In [14]:
import pandas as pd

In [15]:
data1 = np.array([['Gender Bias 1','Test Statistic','Effect Size', 'P-value'],
                ['All-reddit 2014',gb1[0][0],gb1[0][1],gb1[0][2]],
                ['All-reddit 2019',gb1[1][0],gb1[1][1],gb1[1][2]],
                ['News reddit 2014',gb1[2][0],gb1[2][1],gb1[2][2]],
                ['News reddit 2019',gb1[3][0],gb1[3][1],gb1[3][2]],
                ['CNN',gb1[4][0],gb1[4][1],gb1[4][2]],
                ['Daily Mail',gb1[5][0],gb1[5][1],gb1[5][2]]])
table1 = pd.DataFrame(data=data1[1:,1:],index=data1[1:,0],columns=data1[0,1:])
table1.style
#table1.to_csv(r'gb1.csv', index = False)

Unnamed: 0,Test Statistic,Effect Size,P-value
All-reddit 2014,1.2413517162203789,1.8197216,0.0
All-reddit 2019,1.0437299720942974,1.9015805,0.0
News reddit 2014,0.8281081307213753,1.6173909,0.0001554001554001
News reddit 2019,0.5410954803228378,1.1161308,0.0126651126651126
CNN,1.9621651098132131,1.848821,0.0
Daily Mail,1.8259299732744692,1.8966514,0.0


In [16]:
data2 = np.array([['Gender Bias 2','Test Statistic','Effect Size', 'P-value'],
                ['All-reddit 2014',gb2[0][0],gb2[0][1],gb2[0][2]],
                ['All-reddit 2019',gb2[1][0],gb2[1][1],gb2[1][2]],
                ['News reddit 2014',gb2[2][0],gb2[2][1],gb2[2][2]],
                ['News reddit 2019',gb2[3][0],gb2[3][1],gb2[3][2]],
                ['CNN',gb2[4][0],gb2[4][1],gb2[4][2]],
                ['Daily Mail',gb2[5][0],gb2[5][1],gb2[5][2]]])
table2 = pd.DataFrame(data=data2[1:,1:],index=data2[1:,0],columns=data2[0,1:])
table2.style

Unnamed: 0,Test Statistic,Effect Size,P-value
All-reddit 2014,0.0810539536178112,0.29211283,0.2972804972804972
All-reddit 2019,0.3316378146409988,1.0507628,0.0194250194250194
News reddit 2014,-0.1732005362864583,-0.8445381,0.9497280497280496
News reddit 2019,-0.0123310266062617,-0.057133097,0.5384615384615384
CNN,0.0645099114626646,0.383465,0.2351204351204351
Daily Mail,0.1606983952224254,1.0351095,0.0192696192696192


In [17]:
data3 = np.array([['Gender Bias 3','Test Statistic','Effect Size', 'P-value'],
                ['All-reddit 2014',gb3[0][0],gb3[0][1],gb3[0][2]],
                ['All-reddit 2019',gb3[1][0],gb3[1][1],gb3[1][2]],
                ['News reddit 2014',gb3[2][0],gb3[2][1],gb3[2][2]],
                ['News reddit 2019',gb3[3][0],gb3[3][1],gb3[3][2]],
                ['CNN',gb3[4][0],gb3[4][1],gb3[4][2]],
                ['Daily Mail',gb3[5][0],gb3[5][1],gb3[5][2]]])
table3 = pd.DataFrame(data=data3[1:,1:],index=data3[1:,0],columns=data3[0,1:])
table3.style

Unnamed: 0,Test Statistic,Effect Size,P-value
All-reddit 2014,0.2222072072327137,0.69061834,0.0937062937062937
All-reddit 2019,0.2859889259561896,0.7450984,0.0778554778554778
News reddit 2014,-0.0423005174379795,-0.15516394,0.6121989121989122
News reddit 2019,-0.2506593745201826,-0.83299977,0.9445998445998446
CNN,0.1472798865288496,0.82023704,0.0561771561771561
Daily Mail,0.1344876158982515,0.6536375,0.1062937062937063


In [18]:
data4 = np.array([['','Test Statistic','Effect Size', 'P-value'],
                ['All-reddit 2014',wa[0][0],wa[0][1],wa[0][2]],
                ['All-reddit 2019',wa[1][0],wa[1][1],wa[1][2]],
                ['News reddit 2014',wa[2][0],wa[2][1],wa[2][2]],
                ['News reddit 2019',wa[3][0],wa[3][1],wa[3][2]],
                ['CNN',wa[4][0],wa[4][1],wa[4][2]],
                ['Daily Mail',wa[5][0],wa[5][1],wa[5][2]]])
table4 = pd.DataFrame(data=data4[1:,1:],index=data4[1:,0],columns=data4[0,1:])
table4.style

Unnamed: 0,Test Statistic,Effect Size,P-value
All-reddit 2014,-0.304106991738081,-0.31037858,0.7160062160062161
All-reddit 2019,-0.1615535989403724,-0.17222533,0.6204351204351204
News reddit 2014,0.2060131132602691,0.3275757,0.2702408702408702
News reddit 2019,0.2685485314577818,0.40114316,0.2243978243978244
CNN,0.8372728452086449,1.1302943,0.0096348096348096
Daily Mail,0.7077541351318359,0.96916765,0.0214452214452214


In [19]:
data5 = np.array([['','Test Statistic','Effect Size', 'P-value'],
                ['All-reddit 2014',wh[0][0],wh[0][1],wh[0][2]],
                ['All-reddit 2019',wh[1][0],wh[1][1],wh[1][2]],
                ['News reddit 2014',wh[2][0],wh[2][1],wh[2][2]],
                ['News reddit 2019',wh[3][0],wh[3][1],wh[3][2]],
                ['CNN',wh[4][0],wh[4][1],wh[4][2]],
                ['Daily Mail',wh[5][0],wh[5][1],wh[5][2]]])
table5 = pd.DataFrame(data=data5[1:,1:],index=data5[1:,0],columns=data5[0,1:])
table5.style

Unnamed: 0,Test Statistic,Effect Size,P-value
All-reddit 2014,-0.3708099760115146,-1.0345546,0.9785547785547786
All-reddit 2019,-0.0655192788690328,-0.14851746,0.5968919968919969
News reddit 2014,-0.3455642946064472,-0.6926632,0.9069153069153068
News reddit 2019,0.0194605886936187,0.041539636,0.4708624708624708
CNN,0.3032882437109947,0.75652456,0.0729603729603729
Daily Mail,0.5792701914906502,0.77291626,0.0702408702408702


In [20]:
data6 = np.array([['','Test Statistic','Effect Size', 'P-value'],
                ['All-reddit 2014',ha[0][0],ha[0][1],ha[0][2]],
                ['All-reddit 2019',ha[1][0],ha[1][1],ha[1][2]],
                ['News reddit 2014',ha[2][0],ha[2][1],ha[2][2]],
                ['News reddit 2019',ha[3][0],ha[3][1],ha[3][2]],
                ['CNN',ha[4][0],ha[4][1],ha[4][2]],
                ['Daily Mail',ha[5][0],ha[5][1],ha[5][2]]])
table6 = pd.DataFrame(data=data6[1:,1:],index=data6[1:,0],columns=data6[0,1:])
table6.style

Unnamed: 0,Test Statistic,Effect Size,P-value
All-reddit 2014,-0.1276808269321918,-0.26442394,0.6862470862470863
All-reddit 2019,-0.0107746738940477,-0.027974308,0.5201243201243201
News reddit 2014,0.094555165618658,0.19057792,0.3608391608391608
News reddit 2019,-0.0299824178218841,-0.108522706,0.5807303807303807
CNN,0.5796383991837502,0.90987927,0.0383838383838383
Daily Mail,0.0644894130527973,0.1623503,0.3799533799533799
