# Removing Grammatical Gender From Word Embeddings

In [1]:
import io
import numpy as np
import fasttext
import fasttext.util

In [2]:
def load_vec(emb_path, nmax=200000):
    """Load word vectors from a text ``.vec`` embedding file.

    The first line of the file is treated as a header and skipped. Every
    following non-empty line is parsed as ``word v1 v2 ... vn``.

    Args:
        emb_path: Path of the embedding file. It is read as UTF-8 and
            undecodable bytes are ignored.
        nmax: Reading stops as soon as this many words have been loaded.

    Returns:
        A tuple ``(embeddings, id2word, word2id)``: a 2-D array with one row
        per word, a dict mapping row index to word, and the inverse dict.

    Raises:
        AssertionError: If the same word appears twice in the file.
        ValueError: If the file contains no vectors, or a non-empty line has
            no vector part.
    """
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the header line; the default avoids StopIteration on an empty file.
        next(f, None)
        for line in f:
            line = line.rstrip()
            if not line:
                # Blank lines (typically a trailing one) carry no vector.
                continue
            word, vect = line.split(' ', 1)
            # Raised explicitly so the check also runs under ``python -O``.
            if word in word2id:
                raise AssertionError('word found twice')
            vectors.append(np.fromstring(vect, sep=' '))
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    if not vectors:
        raise ValueError('no vectors found in ' + str(emb_path))
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [3]:
def save_vec(embeddings, id2word, emb_path):
    """Write embeddings to ``emb_path`` in the text ``.vec`` layout.

    The header line holds the actual number of rows and the vector dimension
    (it used to be hard-coded to ``200000 300``). Each following line is the
    word, a space, and every component followed by a single space.

    Args:
        embeddings: Sequence of vectors (e.g. a 2-D array), one per word.
        id2word: Mapping from row index to the word stored in that row.
        emb_path: Destination path; the file is written as UTF-8.
    """
    n_rows = len(embeddings)
    dim = len(embeddings[0]) if n_rows else 0
    with io.open(emb_path, 'w', encoding='utf-8', newline='\n', errors='ignore') as f:
        f.write('{} {}\n'.format(n_rows, dim))
        for i in range(n_rows):
            # Keep the original per-line format: every number is followed by a space.
            values = ''.join(str(number) + ' ' for number in embeddings[i])
            f.write(id2word[i] + ' ' + values + '\n')

In [4]:
# Language code that selects the embedding file, the stimulus lists and the
# noun files used below; the stimulus cell handles "fr", "es", "de", "pl",
# "it" and "en".
lang = "de"

Loading Word Embeddings

In [5]:
# Upper bound on the number of words read from the embedding file, and the
# location of the vectors for the selected language.
nmax = 1000000
my_path = "./data/cc." + lang + ".300.vec"

In [6]:
# Read at most nmax vectors: my_embeddings holds one row per word, while
# my_id2word / my_word2id map row index -> word and word -> row index.
my_embeddings, my_id2word, my_word2id = load_vec(my_path, nmax)

Stimuli List for GenS, GenC, GG-WEAT, FloI, InsW WEAT. FR, ES, DE, PL and IT require approximately 27, 28, 19, 21 and 30 iterations of grammatical gender removal.

In [7]:
# Stimulus word lists for the selected language. Every branch defines the
# paired sets sci/hum, man/wom, car/fam, boy/girl, flo/ins, instr/wep and
# plez/unplez. Every branch except "en" also defines masc/fem (the noun pair
# used for GG-WEAT) and num_iter (number of gender-removal iterations).
# If lang matches none of the branches, nothing is defined here.
if lang == "fr":
    
    num_iter = 27

    # French
    masc = ['argent', 'individu', 'essai', 'combat', 'courage', 'succès', 'désir','lien','contraire','complot']
    fem = ['monnaie', 'personne', 'tentative', 'bagarre', 'bravoure', 'réussite','envie','attache','opposition', 'conspiration']
    man = ['garçon', 'père', 'masculin', 'mari', 'fils', 'oncle', 'homme']
    wom = ['demoiselle', 'féminin', 'tante', 'fille', 'femme', 'mère', 'épouse']

    sci = ["astronomie", "mathématiques", "chimie", "physique", "biologie", "géologie", "ingénierie", "statistiques", "bioingénierie", "biophysique", "biochimie", "écologie", "microbiologie","algèbre","géométrie","télécommunications","ordinateur","astrophysique"]
    hum = ['philosophie', 'humanités', 'art', 'latin', 'littérature', 'musique', 'histoire', "psychologie", "sociologie", "géographie", "anthropologie", "théologie", "linguistique", "journalisme", "archéologie", "danse", "dessin", "peinture"]

    car = ['carrière', 'corporation', 'salaire', 'bureau', 'professionnel', 'gestion', 'entreprise']
    fam = ['mariage', 'domicile', 'parents', 'proches', 'famille', 'maison', 'enfants']
    boy = ['Nicolas', 'Alexandre', 'Guillaume', 'Mathieu', 'Thomas', 'Pierre', 'Emmanuel', 'Jean', 'François']
    girl = ['Céline', 'Marie', 'Sandrine', 'Sophie', 'Caroline', 'Julie', 'Hélène', 'Camille', 'Emilie']
    
    flo = ['trèfle', 'orchidée', 'rose', 'lilas', 'tulipe', 'marguerite', 'lys', 'violette', 'magnolia']
    ins = ['fourmi', 'puces', 'araignée','mouche', 'tarentule', 'abeille', 'cafard', 'moustique', 'frelon']

    # NOTE: 'violon' appears twice in instr.
    instr = ['guitare','trombone','banjo','clarinette','harmonica',
                      'mandoline','trompette','tambour','harpe','hautbois','tuba','cloche','violon',
                      'clavecin','piano','alto','bongo','flûte','cor','saxophone','violon']
    wep = ['flèche','club','pistolet','missile','lance','hache','poignard','harpon','épée',
                      'lame','dynamite','hachette','fusil','réservoir','bombe','couteau',
                      'canon','grenade','masse','fronde','fouet']


    # NOTE: 'paradis' (plez) and 'accident' / 'prison' (unplez) appear twice.
    plez = ['caresse', 'liberté', 'santé', 'amour', 'paix', 'acclamation', 'ami', 'paradis', 'fidèle', 'plaisir',
                    'diamant', 'doux', 'honnête', 'chanceux','diplôme', 'cadeau', 'honneur', 'miracle',
                    'famille', 'heureux', 'rire', 'paradis', 'vacances']
    unplez = ['abus', 'accident', 'crasse', 'meurtre', 'maladie', 'accident', 'mort', 'chagrin', 'poison', 'puanteur', 'agression',
                      'désastre','polluer', 'tragédie', 'divorce', 'prison', 'pauvreté', 'laid', 'cancer','pourri', 'vomir', 'agonie', 'prison']

elif lang == "es":
# Spanish
    
    num_iter = 28
    
    masc = ['cordel', 'bosque', 'cojín', 'instrumento','cordero','negocio','triunfo', 'caos','portón','peligro']
    fem = ['cuerda', 'selva', 'almohada', 'herramienta','oveja','empresa','victoria', 'anarquía','puerta','amenaza']
    man = ['hombre', 'niño', 'padre', 'masculino', 'abuelo', 'esposo', 'hijo', 'tio']
    wom = ['niña', 'femenina', 'tía', 'hija', 'esposa', 'mujer', 'madre', 'abuela']

    sci = ["astronomía", "matemáticas", "química", "física", "biología", "geología", "ingeniería", "estadística", "bioingeniería", "biofísica", "bioquímica", "ecología", "microbiología","álgebra","geometría","telecomunicaciones","computadora","astrofísica"]
    hum = ['filosofía', 'humanidades', 'arte', 'literatura', 'música', 'historia', "psicología", "sociología", "geografía", "antropología", "teología", "lingüística", "periodismo", "arqueología", "baile", "dibujo", "pintura", "periodismo"]  # TODO: 'periodismo' is listed twice; one more distinct term still has to be added


    car = ['carrera', 'corporación', 'salario', 'oficina', 'profesional', 'gestión'] 
    fam = ['boda', 'matrimonio', 'familiares', 'hogar', 'niños', 'familia']
    boy = ['Francisco', 'Antonio', 'José', 'Manuel', 'Lucas', 'Hugo', 'Martín', 'Pablo', 'Alejandro']
    girl = ['María', 'Ana', 'Carmen', 'Dolores', 'Lucía', 'Sofía', 'Martina', 'Paula', 'Valeria' ]



    flo= ['orquídea', 'rosa', 'narciso', 'lila', 'tulipán', 'margarita', 'lirio', 'violeta', 'magnolia']
    ins = ['hormiga', 'pulga', 'araña', 'ácaro', 'mosca', 'tarántula', 'abeja', 'cucaracha', 'mosquito']


    # NOTE: 'violín' (instr), 'pistola' (wep) and 'accidente' (unplez) appear twice.
    instr = ['gaita', 'guitarra', 'laúd', 'trombón', 'banjo', 'clarinete', 'armónica', 'mandolina',
    'trompeta', 'fagot', 'tambor','campana', 'violín', 'clavicémbalo', 'piano', 'viola', 'bongo',
    'flauta', 'cuerno', 'saxofón', 'violín']
    wep = ['flecha', 'club', 'pistola', 'misil', 'lanza', 'hacha', 'daga', 'arpón', 'pistola', 'espada',
    'dinamita','rifle', 'tanque', 'bomba','cuchillo', 'escopeta','cañón', 'granada',
    'maza', 'tirachinas', 'látigo']

    plez = ['caricia', 'libertad', 'salud', 'amor', 'paz', 'alegría', 'amigo', 'cielo', 'leal', 'placer', 'diamante',
    'suave', 'honesto', 'afortunado','diploma', 'regalo', 'honor', 'milagro', 'amanecer', 'familia',
    'feliz', 'risa', 'paraíso']
    unplez = ['abuso', 'accidente', 'asesinato', 'inmundicia', 'enfermedad', 'accidente', 'muerte', 'pena',
    'asalto', 'desastre', 'odio', 'contaminar', 'tragedia', 'divorcio', 'cárcel', 'pobreza', 'feo', 'cáncer', 'matar', 'podrido',
    'vómito', 'agonía', 'prisión']

elif lang == "de":

# German
    num_iter = 19
    
    masc = ['Zorn','Streit', 'Widersacher','Rand', 'Vertrag', 'Tod', 'Globus', 'Auftrag', 'Winter', 'Mund']
    # NOTE(review): 'Gegener' looks like a misspelling of 'Gegner' — confirm before changing, since it affects the vocabulary lookup.
    fem =  ['Wut','Auseinandersetzung', 'Gegener','Grenze', 'Vereinbarung', 'Tragödie', 'Welt','Aufgabe', 'Jahreszeit', 'Lippe']
    man = ['Mann', 'Junge', 'Vater', 'Männlich', 'Großvater', 'Ehemann', 'Sohn', 'Onkel']
    wom = ['Mädchen', 'Weiblich', 'Tante', 'Tochter', 'Ehefrau', 'Frau', 'Mutter', 'Großmutter']

    sci = ["Astronomie", "Mathematik", "Chemie", "Physik", "Biologie", "Geologie", "Ingenieurswissenschaften", "Statistik", "Bioingenieurwesen", "Biophysik", "Biochemie", "Ökologie", "Mikrobiologie","Algebra","Geometrie","Telekommunikation","Computer","Astrophysik"] #Ingenieurswissenschaften?
    hum = ['Philosophie', 'Kunst', 'Geschichte','Musik', "Geisteswissenschaften", "Psychologie", "Soziologie", "Geographie", "Anthropologie", "Theologie", "Linguistik", "Journalismus", "Archäologie", "Tanz", "Zeichnung", "Malerei", "Sprachwissenschaften", "Literaturwissenschaften"] #Literaturwissenschaften, Sprachwissenschaften?

    car =  ['Verwaltung', 'Berufstätigkeit', 'Unternehmen', 'Gehalt', 'Büro', 'Verdienst', 'Karriere']
    fam = ['Zuhause', 'Eltern', 'Kinder', 'Familie', 'Hochzeit', 'Ehe', 'Verwandte']

    boy = ['Johannes', 'Lukas', 'Daniel', 'Paul', 'Thomas']
    girl = ['Julia', 'Michaela', 'Anna', 'Laura', 'Sofie']


    flo = ['Orchidee', 'Rose', 'Narzisse', 'Flieder', 'Tulpe', 'Gänseblümchen', 'Lilie', 'Veilchen', 'Magnolie']
    ins=  ['Ameise', 'Floh', 'Spinne', 'Wanze', 'Fliege', 'Tarantel', 'Biene', 'Kakerlake', 'Mücke']

    instr = ['Cello', 'Gitarre', 'Laute', 'Posaune', 'Banjo', 'Klarinette', 'Mundharmonika',
                      'Mandoline', 'Trompete','Fagott', 'Trommel', 'Harfe','Glocke', 'Geige', 'Cembalo', 'Klavier',
                      'Bratsche', 'Flöte', 'Horn','Saxophon', 'Violine']
    wep =  ['Keule', 'Waffe', 'Rakete', 'Speer', 'Axt', 'Dolch', 'Harpune', 'Pistole',
                      'Dynamit', 'Beil','Gewehr', 'Panzer', 'Bombe', 'Schusswaffe', 'Messer', 'Schrotflinte', 'Tränengas', 'Kanone', 'Granate', 'Schleuder', 'Peitsche']

    plez = ['Liebkosung', 'Freiheit', 'Gesundheit', 'Liebe', 'Frieden', 'Jubel', 'Freund', 'Himmel', 'Treue',
                    'Vergnügen', 'Diamant', 'sanft', 'ehrlich','Regenbogen', 'Diplom', 'Geschenk', 'Ehre',
                    'Wunder', 'Sonnenaufgang','Familie', 'glücklich', 'Lachen', 'Paradies']
    unplez = ['Missbrauch', 'Absturz', 'Schmutz', 'Mord', 'Krankheit', 'Unfall', 'Tod', 'Trauer', 'Gift',
                      'Gestank', 'Angriff', 'Katastrophe', 'Hass', 'Umweltverschmutzung', 'Tragödie', 'Scheidung',
                      'Gefängnis', 'Armut', 'hässlich', 'Krebs','töten', 'faul', 'Erbrechen']
    
elif lang == "pl":

# Polish
    num_iter = 21
    masc  = ['duet', 'obszar', 'domek', 'biznes', 'dostatek', 'ruch', 'strumień', 'rozmiar', 'wypadek', 'chaos']
    fem  = ['para', 'strefa', 'chata', 'firma', 'obfitość', 'aktywność', 'rzeka', 'wielkość', 'katastrofa', 'anarchia']
    man = ['mężczyzna', 'chłopiec', 'ojciec', 'nastolatek', 'dziadek', 'mąż', 'syn', 'wujek']
    wom = ['dziewczyna', 'kobieta', 'ciocia', 'córka', 'żona', 'nastolatka', 'matka', 'babcia']

    sci = ["astronomia", "matematyka", "chemia", "fizyka", "biologia", "geologia", "inżynieria", "statystyka", "bioinżynieria", "biofizyka", "biochemia", "ekologia", "mikrobiologia", "algebra", "geometria", "telekomunikacja", "astrofizyka", "komputerowa"]
    hum = ['filozofia', 'polski', 'sztuka', 'łacina','muzyka', 'historia',"literatura", "psychologia", "socjologia", "geografia", "antropologia", "teologia", "językoznawstwo", "dziennikarstwo", "archeologia", "taniec", "rysunek", "malarstwo"]

    car = ['kariera', 'korporacja', 'wynagrodzenie', 'biuro', 'specjalista', 'zarządzanie', 'biznes']
    fam = ['ślub', 'małżeństwo', 'rodzice', 'krewni', 'rodzina', 'dom', 'dzieci']
    boy = ['Jakub', 'Mateusz', 'Michał', 'Patryk', 'Dawid', 'Kamil', 'Piotr', 'Szymon', 'Paweł']
    girl = ['Natalia', 'Aleksandra', 'Wiktoria', 'Julia', 'Weronika', 'Karolina', 'Paulina', 'Patrycja', 'Katarzyna']


    flo = ['orchidea', 'róża', 'narcyz', 'liliowy', 'tulipan', 'stokrotka', 'lilia', 'fiołek', 'magnolia']
    ins= ['pchła','pająk','pluskwa','latać','tarantula','pszczoła','karaluch','komar','szerszeń']

    instr = ['wiolonczela','gitara','flet','lutnia','puzon','banjo','klarnet','harmonijka',
                      'mandolina','trąbka','fagot','bęben','harfa','obój','tuba','dzwon','skrzypce','klawesyn',
                      'fortepian','altówka','bongo']

    wep = ['strzałka','buława','strzelba','pocisk','włócznia','topór','harpun','pistolet',
                      'miecz','nóż','dynamit','toporek','karabin','czołg','bomba','ostrze',
                      'armata','granat','buzdygan','proca','bat']

    plez = ['swoboda','zdrowie','miłość','dyplom','pokój','przyjemność','dopingować',
                    'przyjaciel','niebiosa','wierny','diament','delikatny','uczciwy','tęcza',
                    'podarunek','honor','cud','rodzina','szczęśliwy','śmiech','raj','wakacje','świt']

    unplez = ['nadużycie' , 'wypadek' , 'brud' , 'zabójstwo' , 'choroba' , 'awaria' , 'śmierć' ,
                      'smutek' , 'trucizna' , 'smród' , 'atak' , 'katastrofa' , 'nienawiść' , 'zanieczyszczać' ,
                      'tragedia' , 'rozwód' , 'więzienie' , 'bieda' , 'brzydki' , 'rak' , 'zgniły' , 'wymiociny' ,
                      'agonia']

elif lang == "it":
    
    num_iter = 30
    
    # Italian
    masc = ['confine', 'modo','lido','appartamento','paio','vagone','carbone','viaggio','addome','dolore']
    fem = ['frontiera', 'maniera','spiaggia','casa','coppia','carrozza','carbonella','gita','pancia','agonia']
    man = ['uomo', 'padre', 'maschio', 'nonno', 'marito', 'zio']
    wom = ['femmina', 'zia', 'moglie', 'donna', 'madre', 'nonna']

    sci = ["astronomia", "matematica", "chimica", "fisica", "biologia", "geologia", "ingegneria", "statistica", "bioingegneria", "biofisica", "biochimica", "ecologia", "microbiologia","algebra","geometria","telecomunicazioni","computer","astrofisica"]
    hum = ['filosofia', 'umanesimo', 'arte', 'letteratura', 'italiano', 'musica', 'storia', "psicologia", "sociologia", "geografia", "antropologia", "teologia", "linguistica", "giornalismo", "archeologia", "danza", "disegno", "pittura"]


    car = ['carriera', 'società', 'stipendio', 'ufficio', 'professionale', 'gestione']
    fam = ['matrimonio', 'genitori', 'parenti', 'famiglia', 'casa', 'figli'] 
    boy = ['Marco', 'Alessandro', 'Giuseppe', 'Giovanni', 'Roberto', 'Stefano', 'Francesco', 'Mario', 'Luigi'] 
    girl = ['Anna', 'Maria', 'Sara', 'Laura', 'Giulia', 'Rosa','Angela', 'Sofia', 'Stella']

    flo = ['orchidea', 'rosa', 'narciso', 'lilla', 'tulipano', 'margherita', 'giglio', 'viola', 'magnolia']
    ins = ['pulce', 'ragno', 'cimice', 'mosca', 'tarantola', 'ape', 'scarafaggio', 'zanzara', 'calabrone']

    # NOTE: 'violino' appears twice in instr.
    instr = ['trombone', 'banjo', 'clarinetto', 'armonica',
                      'mandolino', 'tromba', 'fagotto', 'tamburo', 'arpa', 'oboe', 'tuba', 'campana', 'violino', 'clavicembalo',
                      'pianoforte', 'viola', 'bongo', 'flauto', 'corno', 'sassofono', 'violino']
    # NOTE(review): 'lancia' appears five times and 'pistola' twice in wep — presumably translation collisions; confirm.
    wep = ['ascia', 'bastone', 'lancia', 'lancia', 'fucile', 'lancia', 'lancia', 'lancia', 'missile', 'pugnale',
                      'pistola', 'dinamite', 'spada', 'serbatoio', 'bomba', 'pistola', 'cannone', 'granata', 'mazza', 'fionda', 'frusta']

    plez = ['libertà', 'salute', 'amore', 'pace', 'allegria', 'amico', 'cielo', 'leale', 'piacere', 'diamante',
    'gentile', 'onesto', 'fortunato', 'arcobaleno', 'diploma', 'dono', 'onore', 'miracolo', 'alba', 'famiglia',
    'felice', 'risate', 'paradiso']
    unplez = ['abuso', 'crash', 'sporcizia', 'omicidio', 'malattia', 'incidente', 'morte', 'dolore', 'veleno',
    'assalto', 'disastro', 'odio', 'inquinare', 'tragedia', 'divorzio', 'carcere', 'povertà', 'brutto', 'cancro', 'uccidere', 'marcio',
    'vomito', 'agonia']

elif lang == "en":


#English
    # NOTE: this branch defines neither masc/fem nor num_iter, so code that
    # reads them afterwards (the size printout, GG-WEAT, the removal loop)
    # raises NameError when lang == "en".

    sci = ["astronomy", "math", "chemistry", "physics", "biology", "geology", "engineering", "statistics", "bioengineering", "biophysics", "biochemistry", "ecology", "microbiology", "algebra", "geometry","telecommunications", "computer", "astrophysics"]
    hum = ["history", "arts", "humanities", "english", "philosophy", "music", "literature", "psychology", "sociology", "geography", "anthropology", "theology", "linguistics", "journalism","archaeology","dancing","drawing", "painting"]

    man = ["man", "son", "father", "boy", "uncle", "grandpa", "husband", "male"]
    wom = ["mother", "wife", "aunt", "woman", "girl", "female", "grandma", "daughter"]

    car = ['career', 'corporation', 'salary', 'office', 'professional', 'management', 'business'] 
    fam = ['wedding', 'marriage', 'parents', 'relatives', 'family', 'home', 'children']
    boy = ['Ben', 'Paul', 'Daniel', 'John', 'Jeffrey']
    girl = ['Rebecca', 'Michelle', 'Emily', 'Julia', 'Anna']

    flo = ['clover', 'orchid', 'rose','lilac', 'tulip', 'daisy', 'lily', 'violet', 'magnolia']
    ins = ['ant', 'flea', 'spider','fly', 'tarantula', 'bee', 'cockroach', 'mosquito', 'hornet']

    instr = ['guitar', 'lute', 'trombone', 'banjo', 'clarinet', 'harmonica', 'mandolin', 'trumpet',
                      'bassoon', 'drum','harp','bell', 'fiddle', 'harpsichord', 'piano', 'viola', 'bongo', 'flute',
                      'horn', 'saxophone', 'violin']
    wep =['arrow', 'club', 'gun', 'missile', 'spear', 'axe', 'dagger', 'harpoon', 'pistol', 'sword','dynamite',
                      'rifle','tank', 'bomb', 'firearm', 'knife', 'teargas', 'cannon', 'grenade','slingshot', 'whip']

    plez  = ['freedom', 'health', 'love', 'peace', 'cheer', 'friend', 'heaven', 'loyal', 'pleasure', 'diamond',
                     'gentle', 'honest','lucky', 'rainbow', 'diploma', 'gift', 'honor', 'miracle','family', 'happy', 'laughter',
                     'paradise', 'vacation']
    unplez = ['abuse','filth' , 'murder' , 'sickness' ,'death', 'grief', 'poison', 'stink', 'assault',
                      'disaster', 'hatred','pollute', 'tragedy', 'divorce', 'jail', 'poverty', 'ugly', 'cancer', 'kill', 'rotten',
                      'vomit', 'agony', 'prison']

# Sanity check: print the sizes of each paired stimulus set, with a separator
# line between consecutive pairs, so unequal set sizes are easy to spot.
_stimulus_pairs = ((masc, fem), (man, wom), (sci, hum), (car, fam), (boy, girl))
for _position, (_first_set, _second_set) in enumerate(_stimulus_pairs):
    if _position > 0:
        print("***************")
    print(len(_first_set))
    print(len(_second_set))

10
10
***************
8
8
***************
18
18
***************
7
7
***************
5
5


# Original WEAT

In [8]:
def perform_weat(target1, target2, attribute1, attribute2, WEAT_gender_removed_FR_embeddings_2, my_word2id):
    """Run one WEAT through ``operations`` and print its p-value and effect size.

    Args:
        target1, target2: The two target word lists.
        attribute1, attribute2: The two attribute word lists.
        WEAT_gender_removed_FR_embeddings_2: Embedding matrix handed to
            ``operations`` (original or gender-removed).
        my_word2id: Word -> row index mapping for that matrix.

    Returns:
        Whatever ``getPValueAndEffect()`` returns; index 0 is printed as the
        p-value and index 1 as the effect size.
    """
    # NOTE(review): 100000 presumably is the permutation count (it matches the
    # "Number of permutations" line in the output); the meaning of ``True`` and
    # 'normal' is defined by utils.operations — confirm there.
    weat = operations(
        target1,
        target2,
        attribute1,
        attribute2,
        True,
        100000,
        WEAT_gender_removed_FR_embeddings_2,
        my_word2id,
        'normal',
    )
    outcome = weat.getPValueAndEffect()
    print("p-value: ", outcome[0], "  ---  effectSize: ", outcome[1])
    return outcome

def cos_sim(emb1, emb2):
    """Return the cosine similarity of two vectors.

    Both vectors are scaled to unit length first and the dot product of the
    unit vectors is returned.
    """
    unit_first = emb1 / np.linalg.norm(emb1)
    unit_second = emb2 / np.linalg.norm(emb2)
    return unit_first.dot(unit_second)


In [9]:
from utils import operations

# Histories filled in by the cells below, one entry per measurement:
# effect sizes (d_*) and p-values (p_*) for the three WEATs, the classifier
# accuracy, and the average similarities for same-/different-gender noun pairs.
d_gg, p_gg = [], []
d_gens, p_gens = [], []
d_genc, p_genc = [], []
accur = []
gonen_same, gonen_diff = [], []

# WEATs on the unmodified embeddings: flowers vs. insects, then instruments
# vs. weapons, each against pleasant/unpleasant attributes. Only the printed
# output is used; the second call overwrites ``result``.
result = perform_weat(flo, ins, plez, unplez, my_embeddings, my_word2id)
result = perform_weat(instr, wep, plez, unplez, my_embeddings, my_word2id)

The difference of means is  0.05616007159273485
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0003873441252125742   ---  effectSize:  1.5851024443105546
The difference of means is  0.05727969089286318
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  5.6323323782692114e-08   ---  effectSize:  1.6363436629881238


In [10]:
# Baseline scores on the unmodified embeddings; they become entry 0 of each
# history list. perform_weat returns the p-value at index 0 and the effect
# size at index 1.

# GG-WEAT: masculine/feminine nouns against male/female terms.
gg_result = perform_weat(masc, fem, man, wom, my_embeddings, my_word2id)
p_gg.append(gg_result[0])
d_gg.append(gg_result[1])
print("**************")

# GenS: science/humanities against male/female terms.
gens_result = perform_weat(sci, hum, man, wom, my_embeddings, my_word2id)
p_gens.append(gens_result[0])
d_gens.append(gens_result[1])
print("**************")

# GenC: career/family against male/female first names.
genc_result = perform_weat(car, fam, boy, girl, my_embeddings, my_word2id)
p_genc.append(genc_result[0])
d_genc.append(genc_result[1])

The difference of means is  0.08089643125223576
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0003152570268201149   ---  effectSize:  1.5284768797273471
**************
The difference of means is  0.013421155976923223
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.06842652533483218   ---  effectSize:  0.49749811008198386
**************
The difference of means is  0.06201302705508041
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.008447225756614385   ---  effectSize:  1.2754780237621695


In [11]:
def get_cos_sim(word1, src_emb, src_id2word, word2, tgt_emb, tgt_id2word):
    """Cosine similarity between ``word1`` in one space and ``word2`` in another.

    Each ``*_id2word`` dict (row index -> word) is inverted to find the row
    of the requested word in the matching embedding matrix.

    Raises:
        KeyError: If a word is missing from its ``id2word`` mapping.
    """
    src_rows = {token: row for row, token in src_id2word.items()}
    src_vec = src_emb[src_rows[word1]]
    tgt_rows = {token: row for row, token in tgt_id2word.items()}
    tgt_vec = tgt_emb[tgt_rows[word2]]
    src_unit = src_vec / np.linalg.norm(src_vec)
    tgt_unit = tgt_vec / np.linalg.norm(tgt_vec)
    return src_unit.dot(tgt_unit)

def get_cos_sim_neutral(word, emb, id2word):
    """Cosine similarity between ``word`` and the module-level ``neutral_embedding``.

    ``id2word`` (row index -> word) is inverted to locate the row of ``word``
    in ``emb``.
    """
    rows = {token: row for row, token in id2word.items()}
    vec = emb[rows[word]]
    # NOTE(review): neutral_embedding is a global that is not defined in the
    # visible part of the notebook; it must exist before this is called.
    neutral_unit = neutral_embedding / np.linalg.norm(neutral_embedding)
    return neutral_unit.dot(vec / np.linalg.norm(vec))

Loading 5,000 feminine and masculine nouns. German is case-sensitive, so German nouns are first looked up with a capitalised initial letter and then as written.

In [12]:
def _load_known_nouns(path, vocab, try_capitalized):
    """Return the nouns listed in ``path`` that occur in ``vocab``, each once.

    Args:
        path: Text file with one noun per line (read as UTF-8).
        vocab: Container of known words (the word -> id mapping).
        try_capitalized: When true, the spelling with an upper-cased first
            letter is preferred and the raw spelling is the fallback; when
            false only the raw spelling is looked up.

    Returns:
        The accepted spellings in file order, without duplicates.
    """
    kept = []
    seen = set()
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            word_raw = line.strip()
            if not word_raw:
                # A blank line would otherwise fail on word_raw[0].
                continue
            word = word_raw[0].upper() + word_raw[1:]
            if try_capitalized and word in vocab:
                token = word
            elif word_raw in vocab:
                token = word_raw
            else:
                continue
            # De-duplicate on the spelling that is actually stored. The old
            # "a not in list or b not in list" test let repeats through.
            if token not in seen:
                seen.add(token)
                kept.append(token)
    return kept


count = 0
# German nouns are capitalised, so the capitalised spelling is tried first.
_is_german = lang == "de"

pth1 = "./data/nouns/"+lang+"-masc-v2.txt"
FR_expanded_m_nouns = _load_known_nouns(pth1, my_word2id, _is_german)
print(len(FR_expanded_m_nouns))

pth1 = "./data/nouns/"+lang+"-fem-v2.txt"
FR_expanded_f_nouns = _load_known_nouns(pth1, my_word2id, _is_german)
print(len(FR_expanded_f_nouns))

# Pair the i-th feminine noun with the i-th masculine noun; zip stops at the
# shorter list, so surplus nouns of the longer list are dropped.
FR_grammar_pair_expanded = [
    [fem_noun, masc_noun]
    for fem_noun, masc_noun in zip(FR_expanded_f_nouns, FR_expanded_m_nouns)
]

len(FR_grammar_pair_expanded)

14066
16036


14066

In [13]:
from sklearn.svm import LinearSVC

In [14]:
len(FR_grammar_pair_expanded)

14066

Training SVC to learn the difference between the feminine and masculine grammatical gender.

In [15]:
from sklearn.model_selection import cross_val_score
# Default-parameter classifier; not referenced again in the visible part of
# the notebook.
clf_kfold = LinearSVC()
# The first 3000 [feminine, masculine] noun pairs (fewer if the list is
# shorter); these are the training data for the gender classifier.
shortened_3000 = FR_grammar_pair_expanded[:3000]

The SVC can predict grammatical gender with high accuracy. The grammatical gender direction is given by the (normalised) coefficients of the SVC.

In [16]:
from sklearn.preprocessing import normalize #machine learning algorithm library

# Build the training matrix from the noun pairs. Sizes are derived from the
# data instead of the hard-coded 6000 x 300 / 3000, so a shorter pair list or
# a different embedding dimension no longer leaves all-zero rows or
# mismatched labels.
n_pairs_3000 = len(shortened_3000)
# Rows alternate feminine (pair[0]) and masculine (pair[1]) nouns.
rows_3000 = [my_word2id[noun] for pair in shortened_3000 for noun in (pair[0], pair[1])]
X_3000 = np.array(my_embeddings[rows_3000], dtype=float)

# axis=0 rescales every embedding dimension (column) to unit L2 norm across
# the training nouns, not every noun vector.
# NOTE(review): the iterative removal loop fits on un-normalised rows —
# confirm the difference is intended.
X_3000=normalize(X_3000,axis=0)

# Labels follow the row order: 1 = feminine noun, 2 = masculine noun.
y_3000 = np.tile([1,2],n_pairs_3000)
clf_3000 = LinearSVC(C = 10)
clf_3000.fit(X_3000, y_3000)
# Accuracy on the training nouns themselves (no held-out split).
acc = clf_3000.score(X_3000,y_3000)
print("Initial classification accuracy is", acc)
accur.append(acc)
coef = clf_3000.coef_
# Unit-length normal of the separating hyperplane = grammatical gender direction.
FR_grammar_gender_direction_3000 = np.reshape(coef/np.linalg.norm(coef), (-1,))

# Independent copy of the initial direction; the loop below overwrites the
# variable above with re-fitted directions.
gg = np.reshape(coef/np.linalg.norm(coef), (-1,))

Initial classification accuracy is 0.9373333333333334


Using LDA to find the grammatical gender direction.

In [17]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# clf_LDA = LinearDiscriminantAnalysis(n_components = 1)

# scores_3000 = cross_val_score(clf_LDA, X_3000, y_3000, cv=5)
# print("Accuracy: %0.4f (+/- %0.5f)" % (scores_3000.mean(), scores_3000.std() * 2))


# clf_LDA.fit(X_3000, y_3000)

# coef = clf_LDA.coef_
# # FR_grammar_gender_direction_LDA_3000 = np.reshape(coef/np.linalg.norm(coef), (300,))
# FR_grammar_gender_direction_LDA_3000 = np.reshape(coef, (300,))

# Projecting Out Grammatical Gender

In [18]:
import numpy as np
from numpy import linalg as LA


def drop(u, v):
    """Remove from ``u`` its component along ``v``.

    Returns ``u`` minus the projection of ``u`` onto ``v``; ``v`` does not
    need to be unit length.
    """
    projection = v * u.dot(v) / v.dot(v)
    return u - projection

In [19]:
import ValNorm as valnorm
import calcValNorm as calcValNorm
import os

def prep_input(word2id, embedding):
    """Build a ``word -> vector`` dict from an index mapping and a matrix.

    Args:
        word2id: Mapping from word to its row index in ``embedding``.
        embedding: Indexable collection of vectors.

    Returns:
        A dict mapping every word of ``word2id`` to ``embedding[word2id[word]]``.
    """
    return {word: embedding[word2id[word]] for word in word2id}

def read_vocab(file_name):
    """Return the second comma-separated field of every data line in a file.

    The first line is treated as a header and skipped, and blank lines are
    ignored. The line terminator is removed before splitting, so the value
    is clean even when the second field is the last one on the line.

    Args:
        file_name: Path of the comma-separated text file (read as UTF-8).

    Returns:
        List with the second field of each non-blank data line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    my_list = []
    # The context manager closes the file even if a malformed line raises.
    with open(file_name, "r", encoding="utf-8") as f:
        f.readline()
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue
            words = line.split(",")
            my_list.append(words[1])
    return my_list

# Gonen et al's Grammatical Gender Neutralization Metric

In [20]:
def read_data(file_name):
    """Read a file of comma-separated noun pairs into two aligned lists.

    Only the line terminator is stripped from each line. The previous
    ``line[:-1]`` removed the last character of the final word when the file
    did not end with a newline. Blank lines are skipped.

    Args:
        file_name: Path of the pair file (read as UTF-8), one ``noun1,noun2``
            pair per line.

    Returns:
        ``(nouns_1, nouns_2)``: the first and second field of every non-blank
        line, in file order.

    Raises:
        IndexError: If a non-blank line contains no comma.
    """
    nouns_1 = []
    nouns_2 = []
    with io.open(file_name, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line.strip():
                continue
            words = line.split(',')
            nouns_1.append(words[0])
            nouns_2.append(words[1])
    return nouns_1, nouns_2


def avg_sim(nouns_1, nouns_2, my_embeddings, word2id=None):
    """Average cosine similarity over aligned noun pairs.

    A noun is looked up as written first and then with an upper-cased first
    letter. A pair counts only if both nouns are found; for every other pair
    ``"not found "`` and the pair index are printed. The number of counted
    pairs is printed before returning.

    Args:
        nouns_1: First noun of every pair.
        nouns_2: Second noun of every pair, aligned with ``nouns_1``.
        my_embeddings: Embedding matrix indexed by the ids of the vocabulary.
        word2id: Word -> row index mapping; defaults to the module-level
            ``my_word2id`` (the behaviour before this parameter existed).

    Returns:
        The mean cosine similarity over the counted pairs.

    Raises:
        ZeroDivisionError: If no pair could be found in the vocabulary.
    """
    vocab = my_word2id if word2id is None else word2id

    def _row_of(noun):
        # Exact spelling wins; otherwise try the capitalised variant.
        if noun in vocab:
            return vocab[noun]
        if noun:  # an empty string has no first letter to capitalise
            capitalized = noun[0].upper() + noun[1:]
            if capitalized in vocab:
                return vocab[capitalized]
        return None

    total = 0
    count = 0
    for i, w1 in enumerate(nouns_1):
        row1 = _row_of(w1)
        row2 = _row_of(nouns_2[i])
        if row1 is None or row2 is None:
            print("not found ", i)
            continue
        count += 1
        total += cos_sim(my_embeddings[row1], my_embeddings[row2])
    print(count)
    if count == 0:
        raise ZeroDivisionError("no noun pair was found in the vocabulary")
    return (total/count)
    

Loading one file that contains pairs of nouns with the same gender, and another file in which the nouns of each pair have differing genders.

In [21]:
# Baseline for the Gonen et al. metric on the unmodified embeddings: average
# similarity of same-gender pairs and of different-gender pairs.
_gonen_dir = "data/nouns/gonen-test/"

pth1 = _gonen_dir + lang + "-same.txt"
nouns_3, nouns_4 = read_data(pth1)
avg_it_same = avg_sim(nouns_3, nouns_4, my_embeddings)

pth1 = _gonen_dir + lang + "-diff.txt"
nouns_3, nouns_4 = read_data(pth1)
avg_it_diff = avg_sim(nouns_3, nouns_4, my_embeddings)

print(avg_it_same )
print(avg_it_diff)
gonen_same.append(avg_it_same)
gonen_diff.append(avg_it_diff)

# Pair lists kept for the removal loop: nouns_3/nouns_4 hold the same-gender
# pairs again, nouns_5/nouns_6 the different-gender pairs.
pth1 = _gonen_dir + lang + "-same.txt"
nouns_3, nouns_4 = read_data(pth1)

# NOTE(review): this file is read from "new-data/v2/" while the baseline above
# used "data/nouns/" — confirm that both contain the same pairs.
pth1 = "new-data/v2/gonen-test/"+lang+"-diff.txt"
nouns_5, nouns_6 = read_data(pth1)

124
404
0.4759957738647015
0.42397417806738413


# Iterative Grammatical Gender Removal

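Testing how much removing the grammatical gender direction affects the WEAT results and the performance of the SVC in predicting grammatical gender. In each iteration the current gender direction is projected out of every word embedding in the vocabulary, and the SVC is then re-trained on the 3000 inanimate noun pairs to obtain the next direction.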

In [22]:
# Float working copy of the embeddings, so my_embeddings itself stays
# untouched. The shape follows the data instead of a hard-coded 300 columns.
WEAT_gender_removed_FR_embeddings_2 = np.array(my_embeddings, dtype=float)

# Rows of the training nouns, alternating feminine (pair[0]) and masculine
# (pair[1]); derived once because the pairs do not change between iterations.
_n_pairs_after = len(shortened_3000)
_rows_after = [my_word2id[noun] for pair in shortened_3000 for noun in (pair[0], pair[1])]

for j in range(num_iter):
    print("iteration number ", j+1, "\n")

    #projecting out grammatical gender
    # Word ids are exactly the row indices, so walking the rows visits every
    # vocabulary word once.
    for _row in range(len(WEAT_gender_removed_FR_embeddings_2)):
        WEAT_gender_removed_FR_embeddings_2[_row] = drop(
            u=WEAT_gender_removed_FR_embeddings_2[_row],
            v=FR_grammar_gender_direction_3000,
        )

    #obtaining new embeddings for inanimate nouns
    # Fancy indexing copies the rows; sizes follow the pair list instead of
    # the former hard-coded 6000 x 300 / 3000.
    X_3000_after = WEAT_gender_removed_FR_embeddings_2[_rows_after]
    # Labels follow the row order: 1 = feminine noun, 2 = masculine noun.
    y_3000_after = np.tile([1,2],_n_pairs_after)

    #training SVC to learn grammatical gender hyperplane
    # NOTE(review): unlike the initial fit, these rows are not passed through
    # normalize(..., axis=0) — confirm this is intended.
    clf_3000_after = LinearSVC(C = 10)
    clf_3000_after.fit(X_3000_after, y_3000_after)

    # Accuracy on the training nouns themselves (no held-out split).
    accuracy = clf_3000_after.score(X_3000_after,y_3000_after)
    accur.append(accuracy)
    print("accuracy after gender removal is", accuracy)
    coef_after = clf_3000_after.coef_

    #obtaining the new hyperplane
    # Its unit normal is the direction removed in the next iteration.
    FR_grammar_gender_direction_3000= np.reshape(coef_after/np.linalg.norm(coef_after), (-1,))

    #gonen et al. computations
    avg_it_same = avg_sim(nouns_3, nouns_4, WEAT_gender_removed_FR_embeddings_2)
    avg_it_diff = avg_sim(nouns_5, nouns_6, WEAT_gender_removed_FR_embeddings_2)
    gonen_same.append(avg_it_same)
    gonen_diff.append(avg_it_diff)

    #gg-weat computation
    gg_result = perform_weat(masc, fem, man, wom, WEAT_gender_removed_FR_embeddings_2, my_word2id)
    d_gg.append(gg_result[1])
    p_gg.append(gg_result[0])

    #genS computation
    gens_result = perform_weat(sci, hum, man, wom, WEAT_gender_removed_FR_embeddings_2, my_word2id)
    d_gens.append(gens_result[1])
    p_gens.append(gens_result[0])

    #genC computation
    genc_result = perform_weat(car, fam, boy, girl, WEAT_gender_removed_FR_embeddings_2, my_word2id)
    d_genc.append(genc_result[1])
    p_genc.append(genc_result[0])

    print("*********************")

iteration number  1 





accuracy after gender removal is 0.813
124
404
The difference of means is  0.028766127947167312
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.019029686925982392   ---  effectSize:  0.9268111476737503
The difference of means is  0.012355300319734976
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.013526353507255617   ---  effectSize:  0.7364592317492646
The difference of means is  0.058906743978085166
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005472304013062312   ---  effectSize:  1.3609290093020583
*********************
iteration number  2 





accuracy after gender removal is 0.7446666666666667
124
404
The difference of means is  0.014903688695940111
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.11047769969410115   ---  effectSize:  0.5472566872669732
The difference of means is  0.011490799953522073
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.010871826014438257   ---  effectSize:  0.7667552098055803
The difference of means is  0.058637335110358825
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.00542696244728913   ---  effectSize:  1.3651604439795404
*********************
iteration number  3 





accuracy after gender removal is 0.7035
124
404
The difference of means is  0.009629150368530962
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.20207481220607104   ---  effectSize:  0.3722771901828324
The difference of means is  0.011354592402217693
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.009766085217833953   ---  effectSize:  0.7779945852433037
The difference of means is  0.05720022380710686
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005281286457618939   ---  effectSize:  1.3660774714999047
*********************
iteration number  4 





accuracy after gender removal is 0.6765
124
404
The difference of means is  0.007565731908565293
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.25506909287876234   ---  effectSize:  0.2949989265280334
The difference of means is  0.011141436903930262
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.009796503762990905   ---  effectSize:  0.7793618217774235
The difference of means is  0.05631133458686592
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005328606686704651   ---  effectSize:  1.3721308045606626
*********************
iteration number  5 





accuracy after gender removal is 0.6443333333333333
124
404
The difference of means is  0.005193899024110901
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.33029944723222804   ---  effectSize:  0.1973249111121341
The difference of means is  0.011005142234135781
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.009131637703401085   ---  effectSize:  0.7867202352797165
The difference of means is  0.05636508985258985
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005093537723098063   ---  effectSize:  1.3735998883354743
*********************
iteration number  6 





accuracy after gender removal is 0.6281666666666667
124
404
The difference of means is  0.003179837817053128
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.39428178690235394   ---  effectSize:  0.11952361233667877
The difference of means is  0.01176219288295804
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.006919951007544056   ---  effectSize:  0.8249310828987915
The difference of means is  0.0566158971998438
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.004943292646669151   ---  effectSize:  1.3796802664029382
*********************
iteration number  7 





accuracy after gender removal is 0.606
124
404
The difference of means is  0.0021334226254885857
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.4285064223565771   ---  effectSize:  0.07928527995846239
The difference of means is  0.012187894541969453
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005675990842616163   ---  effectSize:  0.8419772383437013
The difference of means is  0.056895861509258815
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.004791267041401093   ---  effectSize:  1.380991758618423
*********************
iteration number  8 





accuracy after gender removal is 0.5976666666666667
124
404
The difference of means is  0.003464581154287851
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.3868079442250082   ---  effectSize:  0.12949659063722416
The difference of means is  0.011859926187113103
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.006356048625986488   ---  effectSize:  0.8277360589974977
The difference of means is  0.057209563742980286
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.004852578134907581   ---  effectSize:  1.3818651179038342
*********************
iteration number  9 





accuracy after gender removal is 0.5805
124
404
The difference of means is  0.003517586579932855
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.3865681269682778   ---  effectSize:  0.13041027858504806
The difference of means is  0.011899442690601765
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.006620595550754671   ---  effectSize:  0.8249529384815293
The difference of means is  0.0571484369672914
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0049362649165497485   ---  effectSize:  1.3825786801673257
*********************
iteration number  10 





accuracy after gender removal is 0.5853333333333334
124
404
The difference of means is  0.003367865898746495
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.3939321393931795   ---  effectSize:  0.12324257235387512
The difference of means is  0.011999368839625008
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.006636600757683797   ---  effectSize:  0.8238397710214367
The difference of means is  0.056295438571022635
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005344728091979722   ---  effectSize:  1.365004129540242
*********************
iteration number  11 





accuracy after gender removal is 0.5795
124
404
The difference of means is  0.003388865165104943
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.3914636934807608   ---  effectSize:  0.12211325372035281
The difference of means is  0.01211374685829217
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.007054162423266619   ---  effectSize:  0.8196768735069356
The difference of means is  0.05606057824925054
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.00567361222779772   ---  effectSize:  1.3489850236386594
*********************
iteration number  12 





accuracy after gender removal is 0.5618333333333333
124
404
The difference of means is  0.0033918151012390565
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.38995231702575983   ---  effectSize:  0.12190705362828262
The difference of means is  0.012132738466448816
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.006943774800791225   ---  effectSize:  0.8189758850193833
The difference of means is  0.05603032441211484
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005917506739514078   ---  effectSize:  1.3463622403672888
*********************
iteration number  13 





accuracy after gender removal is 0.5608333333333333
124
404
The difference of means is  0.0034093437051373066
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.3935405685030462   ---  effectSize:  0.12044578602025095
The difference of means is  0.012279325511086367
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0072222209570789175   ---  effectSize:  0.8146025537182112
The difference of means is  0.05650139203792911
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0060744884691419365   ---  effectSize:  1.3405351672240522
*********************
iteration number  14 





accuracy after gender removal is 0.5666666666666667
124
404
The difference of means is  0.003224252276834325
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.3983290747008863   ---  effectSize:  0.11407050097959308
The difference of means is  0.011887862539701404
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.008877990516956369   ---  effectSize:  0.7918256243973013
The difference of means is  0.0584693819202583
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0052727400019489945   ---  effectSize:  1.3616172896472065
*********************
iteration number  15 





accuracy after gender removal is 0.555
124
404
The difference of means is  0.003134647717840601
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.40163326901898644   ---  effectSize:  0.11099917830613758
The difference of means is  0.01144885573706677
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.011217039130356765   ---  effectSize:  0.7615561956283821
The difference of means is  0.06040929358317902
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005114117039754951   ---  effectSize:  1.3727416030055093
*********************
iteration number  16 





accuracy after gender removal is 0.5453333333333333
124
404
The difference of means is  0.003199547946186177
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.4025497656978727   ---  effectSize:  0.11066185151735743
The difference of means is  0.011426573721173526
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.012951282903065686   ---  effectSize:  0.742896769263705
The difference of means is  0.06106313782779798
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.0057358126387714314   ---  effectSize:  1.356288157015284
*********************
iteration number  17 





accuracy after gender removal is 0.5378333333333334
124
404
The difference of means is  0.0029616641628953076
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.40985022089677015   ---  effectSize:  0.10175688819096318
The difference of means is  0.010955127507959093
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.01833463772735755   ---  effectSize:  0.696445791732212
The difference of means is  0.060993997085117324
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005603687108212352   ---  effectSize:  1.3590231773601853
*********************
iteration number  18 





accuracy after gender removal is 0.5215
124
404
The difference of means is  0.002306030183047734
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.42913538961677966   ---  effectSize:  0.0799923338786896
The difference of means is  0.010098429691342151
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.028915218777031848   ---  effectSize:  0.6329143382007165
The difference of means is  0.060236424982647786
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005653404632656178   ---  effectSize:  1.3498087193816701
*********************
iteration number  19 





accuracy after gender removal is 0.5058333333333334
124
404
The difference of means is  0.0032388394240550535
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.4020579546973977   ---  effectSize:  0.11031868238168052
The difference of means is  0.011226373722360612
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.018123364995379254   ---  effectSize:  0.6972137921089494
The difference of means is  0.060296769111376915
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.006053168574165491   ---  effectSize:  1.3435808191266143
*********************


In [23]:
# One comma-separated summary row per entry of gonen_diff; the other four
# lists are read at the same position.
for row in range(len(gonen_diff)):
    print(d_gens[row], p_gens[row], accur[row], gonen_same[row], gonen_diff[row], sep=" , ")

0.49749811008198386 , 0.06842652533483218 , 0.9373333333333334 , 0.4759957738647015 , 0.42397417806738413
0.7364592317492646 , 0.013526353507255617 , 0.813 , 0.46946428890156494 , 0.43036314143690013
0.7667552098055803 , 0.010871826014438257 , 0.7446666666666667 , 0.46838336303233624 , 0.4316912926880681
0.7779945852433037 , 0.009766085217833953 , 0.7035 , 0.46791312160514303 , 0.43224735713516166
0.7793618217774235 , 0.009796503762990905 , 0.6765 , 0.46753772400099436 , 0.43239481219510656
0.7867202352797165 , 0.009131637703401085 , 0.6443333333333333 , 0.4673704119158574 , 0.43248143814531975
0.8249310828987915 , 0.006919951007544056 , 0.6281666666666667 , 0.4670503117847891 , 0.43245717878962586
0.8419772383437013 , 0.005675990842616163 , 0.606 , 0.4660770123731255 , 0.4315014002780017
0.8277360589974977 , 0.006356048625986488 , 0.5976666666666667 , 0.46518719550562504 , 0.43058320696218216
0.8249529384815293 , 0.006620595550754671 , 0.5805 , 0.4635846226192044 , 0.4292473268621692


In [24]:
# WEAT on the gender-removed embeddings with the plez/unplez attribute sets:
# first for the flo vs. ins targets, then for the instr vs. wep targets.
# NOTE(review): presumably the FloI and InsW tests named in the stimuli
# list - confirm against the stimuli definitions.
result = perform_weat(flo, ins, plez, unplez, WEAT_gender_removed_FR_embeddings_2, my_word2id)

# `result` is rebound here, so only the second call's return value is kept.
result = perform_weat(instr, wep, plez, unplez, WEAT_gender_removed_FR_embeddings_2, my_word2id)

The difference of means is  0.06730817739790962
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.00035638268290638653   ---  effectSize:  1.6020288085363177
The difference of means is  0.0652660405423842
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  3.776905344032855e-08   ---  effectSize:  1.6530537618084034


In [25]:
# WEAT on the gender-removed embeddings: sci vs. hum targets with the man/wom
# attribute sets, then car vs. fam targets with the boy/girl attribute sets.
# NOTE(review): presumably the GenS and GenC tests named in the stimuli
# list - confirm against the stimuli definitions.
result = perform_weat(sci, hum, man, wom, WEAT_gender_removed_FR_embeddings_2, my_word2id)

# `result` is rebound here, so only the second call's return value is kept.
result = perform_weat(car, fam, boy, girl, WEAT_gender_removed_FR_embeddings_2, my_word2id)

The difference of means is  0.011226373722360612
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.01848432833779967   ---  effectSize:  0.6972137921089494
The difference of means is  0.060296769111376915
Generating null distribution...
Number of permutations  100000
Getting the entire distribution
p-value:  0.005855268095038668   ---  effectSize:  1.3435808191266143


# Single-category GG-WEAT

In [26]:
# Attribute sets used by the single-category test below: `man` is the first
# set and `wom` the second.
attributesFirstSet = man
attributesSecondSet = wom

In [27]:
# Collect at most the first 1,000 pairs from shortened_3000 for the
# single-category GG-WEAT: element 0 of each pair goes to fem_n and
# element 1 to masc_n.
fem_n, masc_n = [], []
counter = 0  # stays 0 when shortened_3000 is empty
for counter, pair in enumerate(shortened_3000, start=1):
    fem_n.append(pair[0])
    masc_n.append(pair[1])
    if counter == 1000:
        break

def write_wefat_results(myFileName, d_before, d_after):
    """Write paired values to a tab-separated text file, one pair per line.

    Line ``i`` of the output is ``str(d_before[i])``, a tab, then
    ``str(d_after[i])``, followed by a newline.

    Args:
        myFileName: Path of the file to create (an existing file is
            truncated).
        d_before: Values for the first column.
        d_after: Values for the second column, indexed by the position of
            the matching ``d_before`` entry.

    Raises:
        IndexError: If ``d_after`` is a list-like shorter than ``d_before``.
    """
    # The context manager closes the handle even if a write or lookup fails
    # part-way through; the original left it open in that case.
    with open(myFileName, "w+") as f:
        for i, before in enumerate(d_before):
            f.write(str(before) + "\t" + str(d_after[i]) + "\n")
    
    
import wefat_utils
from statistics import mean

Single-category GG-WEAT for masculine inanimate nouns

In [28]:
# Masculine nouns, with one "masc" label per noun.
nouns = masc_n
genderList = len(nouns) * ["masc"]

# The same word2id mapping is passed for both embedding matrices: the
# original ones and the gender-removed ones.
results = wefat_utils.mean_concept_stereotype(
    attributesFirstSet,
    attributesSecondSet,
    nouns,
    my_embeddings,
    my_word2id,
    WEAT_gender_removed_FR_embeddings_2,
    my_word2id,
)

# Entries 0-1 are kept as the "before" values and entries 2-3 as the
# "after" values.
meanConceptStereotype1, meanConceptStereotype2 = results[0], results[1]
meanConceptStereotype1_after, meanConceptStereotype2_after = results[2], results[3]
bothStereotypes = attributesFirstSet + attributesSecondSet

In [29]:
# Null matrices for the nouns against the combined attribute list, computed
# once on the original embeddings and once on the gender-removed ones.
conceptNullMatrix = wefat_utils.computeNullMatrix(
    nouns, bothStereotypes, my_embeddings, my_word2id
)
conceptNullMatrix_after = wefat_utils.computeNullMatrix(
    nouns, bothStereotypes, WEAT_gender_removed_FR_embeddings_2, my_word2id
)

# NOTE(review): the trailing 10000 is presumably the number of permutations
# per noun - confirm against wefat_utils.wefat.
p_before, p_after, d_before, d_after = wefat_utils.wefat(
    nouns,
    conceptNullMatrix,
    conceptNullMatrix_after,
    my_embeddings,
    my_word2id,
    WEAT_gender_removed_FR_embeddings_2,
    my_word2id,
    bothStereotypes,
    meanConceptStereotype1,
    meanConceptStereotype2,
    meanConceptStereotype1_after,
    meanConceptStereotype2_after,
    attributesFirstSet,
    attributesSecondSet,
    genderList,
    10000,
)

WEFAT for word    Kurs gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.7120437778086647     after:  -0.36708771056764017
p val:       before:  0.07694582129677614      after:  0.7687791918528611
0


WEFAT for word    Konkurs gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.4197355007960686     after:  -0.059381405326884455
p val:       before:  0.19146154741551846      after:  0.5502739187078057
1


WEFAT for word    Mittwoch gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  1.0519868122068061     after:  -0.23300472528482002
p val:       before:  0.01638144776945915      after:  0.686045470846549
2


WEFAT for word    us-dollar gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  -0.28940898544416477     after:  -1.0248683273344557
p val:       before:  0.7205591276536846     

Number of permutations  10000
effect size:     before:  0.8152174560788301     after:  -0.1838821705249034
p val:       before:  0.0541107981839659      after:  0.6401794898845656
32


WEFAT for word    Ausgang gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  1.2402938597981392     after:  0.09727297777307424
p val:       before:  0.0067487843489914155      after:  0.4221890012901207
33


WEFAT for word    Fernsehsender gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  1.0673683435112167     after:  0.3154726526141818
p val:       before:  0.01693954034394097      after:  0.2670500819670217
34


WEFAT for word    Kanälen gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  -0.00651139000725907     after:  -0.635947856117536
p val:       before:  0.5074547208532059      after:  0.8996083335588256
35


WEFAT for word    Motorsport gende

Number of permutations  10000
effect size:     before:  0.45434514450184454     after:  -0.6056121368502826
p val:       before:  0.18268322647911905      after:  0.8845126935726126
64


WEFAT for word    Markt gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.6666834315063352     after:  -0.34299282804551573
p val:       before:  0.09170097342427397      after:  0.7511834510437864
65


WEFAT for word    Feiertagen gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.13085243190466087     after:  -0.3811565642512682
p val:       before:  0.3928562365920182      after:  0.7748268513854916
66


WEFAT for word    Service gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.999612945183502     after:  0.03448582992991328
p val:       before:  0.02368484776437675      after:  0.46406947350428374
67


WEFAT for word    Softwaretests gende

Number of permutations  10000
effect size:     before:  1.3639182095701938     after:  -0.012959107808959714
p val:       before:  0.003158147194057417      after:  0.5127075938385659
96


WEFAT for word    Mitbenutzer gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  1.0230050021603034     after:  0.2816050682241033
p val:       before:  0.020976086100264548      after:  0.28555130713759863
97


WEFAT for word    Pfennig gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.7592957071555729     after:  -0.25289013806688954
p val:       before:  0.06482474914396584      after:  0.6958862938532935
98


WEFAT for word    August gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  1.3818933425473023     after:  0.13072259368890327
p val:       before:  0.0030500626812797116      after:  0.4054726198701554
99


WEFAT for word    Kopierschutz

Number of permutations  10000
effect size:     before:  1.16378573435408     after:  0.3836697619810614
p val:       before:  0.010088606406460716      after:  0.22170939058925304
128


WEFAT for word    Rückschlägen gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.4798432943048096     after:  -0.1876632740494291
p val:       before:  0.1683458647775059      after:  0.6477697089605141
129


WEFAT for word    Anfang gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  1.124848739873927     after:  0.2780198849574738
p val:       before:  0.012190567989732101      after:  0.28470243383695326
130


WEFAT for word    Leistungsumfang gender:    masc
Number of permutations  10000
Number of permutations  10000
effect size:     before:  0.9079007770702541     after:  -0.2379493904625451
p val:       before:  0.034971271218774996      after:  0.6868398194799372
131


WEFAT for word    Beträge

KeyboardInterrupt: 

In [None]:
def check_wefat(gender, d_before, d_after):
    """Print how often the value moved in the direction expected for ``gender``.

    Position ``i`` counts as correct when ``d_before[i] < d_after[i]`` for
    ``gender == "fem"``, or when ``d_before[i] > d_after[i]`` for
    ``gender == "masc"``. Any other ``gender`` value counts nothing as
    correct.

    Args:
        gender: ``"fem"`` or ``"masc"``.
        d_before: Values before the change, indexable by position.
        d_after: Values after the change; its length sets how many positions
            are compared.

    Returns:
        None. The count, the total and the ratio are printed to stdout.
    """
    total = len(d_after)
    if gender == "fem":
        correct = sum(1 for i in range(total) if d_before[i] < d_after[i])
    elif gender == "masc":
        correct = sum(1 for i in range(total) if d_before[i] > d_after[i])
    else:
        correct = 0

    # An empty result list used to raise ZeroDivisionError; report 0.0 instead.
    acc = correct / total if total else 0.0
    print("accuracy is ", correct ,"/", total, " or ", acc)
    
            
                
# Report the direction-of-change accuracy for the masculine nouns, then save
# the before/after values to "<lang>_wefat_masc.txt".
# NOTE(review): the recorded run of the preceding wefat cell ended in a
# KeyboardInterrupt, so d_before/d_after may be undefined or stale here.
check_wefat("masc", d_before, d_after)    

write_wefat_results(lang+ "_wefat_masc.txt", d_before, d_after)




In [None]:
# Arithmetic mean (statistics.mean) of the values before and after.
print(mean(d_before))
print(mean(d_after))

In [None]:
# Feminine nouns, with one "fem" label per noun. The label count follows
# len(nouns), as in the masculine cell, instead of a hard-coded 1000, so the
# two lists stay aligned when fewer than 1,000 pairs were collected.
nouns = fem_n
genderList = ["fem"] * len(nouns)

# The same word2id mapping is passed for both embedding matrices: the
# original ones and the gender-removed ones.
results = wefat_utils.mean_concept_stereotype(attributesFirstSet, attributesSecondSet, nouns,
                                                my_embeddings, my_word2id, WEAT_gender_removed_FR_embeddings_2, my_word2id)

# Entries 0-1 are kept as the "before" values and entries 2-3 as the
# "after" values.
meanConceptStereotype1 = results[0]
meanConceptStereotype2 = results[1]
meanConceptStereotype1_after = results[2]
meanConceptStereotype2_after = results[3]
bothStereotypes = attributesFirstSet + attributesSecondSet

# Null matrices on the original and on the gender-removed embeddings.
conceptNullMatrix = wefat_utils.computeNullMatrix(nouns, bothStereotypes, my_embeddings, my_word2id)
conceptNullMatrix_after = wefat_utils.computeNullMatrix(nouns, bothStereotypes, WEAT_gender_removed_FR_embeddings_2, my_word2id)
# NOTE(review): the trailing 10000 is presumably the number of permutations
# per noun - confirm against wefat_utils.wefat.
p_before, p_after, d_before, d_after = wefat_utils.wefat(nouns,conceptNullMatrix, conceptNullMatrix_after,my_embeddings, my_word2id,WEAT_gender_removed_FR_embeddings_2, my_word2id,bothStereotypes, meanConceptStereotype1, meanConceptStereotype2,meanConceptStereotype1_after,meanConceptStereotype2_after,attributesFirstSet,attributesSecondSet, genderList, 10000)


In [None]:
# Report the direction-of-change accuracy for the feminine nouns, then save
# the before/after values to "<lang>_wefat_fem.txt".
check_wefat("fem", d_before, d_after) 
write_wefat_results(lang+ "_wefat_fem.txt", d_before, d_after)

In [None]:
# Arithmetic mean (statistics.mean) of the values before and after.
print(mean(d_before))
print(mean(d_after))

# Writing tsv files for Embedding Visualization in tensorflow projector

In [None]:
def generateTSVfile(myFileName, wv, word2index,word_list):
    """Write the embedding of each listed word as one tab-separated row.

    Every component of the vector is formatted with ``%f``. The original
    loop stopped at ``len(row) - 1``, so the last component was never
    written and each row ended in a stray tab; all components are now
    written, separated by single tabs.

    Args:
        myFileName: Path of the TSV file to create (an existing file is
            truncated).
        wv: Indexable collection of vectors, looked up by word index.
        word2index: Mapping from word to its index in ``wv``.
        word_list: Words to export, in output row order.

    Raises:
        KeyError: If a word in ``word_list`` is missing from a dict-like
            ``word2index``.
    """
    # The context manager closes the handle even if a lookup fails part-way.
    with open(myFileName, "w+") as f:
        for word in word_list:
            row = wv[word2index[word]]
            f.write("\t".join("%f" % value for value in row) + "\n")
    
def generateLabelFile_words(labelFileName, word_list):
    """Write one label per line, replacing empty strings with ``NULL``.

    Args:
        labelFileName: Path of the label file to create (an existing file is
            truncated).
        word_list: Labels to write, in order.

    Returns:
        None. Prints ``files written`` once the file has been written.
    """
    # The context manager closes the handle even if a write fails part-way;
    # the original left it open in that case.
    with open(labelFileName, "w+") as f:
        for word in word_list:
            label = 'NULL' if word == '' else word
            f.write("%s\n" % label)

    print ("files written")
    
def generateLabelFile_color(labelFileName, word_list_1, word_list_2):
    """Write a group label per word: ``1`` for the first list, ``2`` for the second.

    One line ``"1 "`` is written per element of ``word_list_1``, followed by
    one line ``"2 "`` per element of ``word_list_2``. The original iterated
    ``word_list_1`` in both loops, so the number of ``2`` labels was wrong
    whenever the two lists differed in length.

    Args:
        labelFileName: Path of the label file to create (an existing file is
            truncated).
        word_list_1: Words of the first group; only their count is used.
        word_list_2: Words of the second group; only their count is used.

    Returns:
        None. Prints ``files written`` once the file has been written.
    """
    # The context manager closes the handle even if a write fails part-way.
    with open(labelFileName, "w+") as f:
        f.writelines("1 \n" for _ in word_list_1)
        f.writelines("2 \n" for _ in word_list_2)

    print ("files written")

In [None]:
# both_nouns = FR_expanded_f_nouns[:2990] + FR_expanded_m_nouns[:2990]
# generateTSVfile("emb-pl.tsv", WEAT_gender_removed_FR_embeddings_2, my_word2id, both_nouns)
# generateLabelFile_color("label-pl.tsv", FR_expanded_f_nouns[:2990] , FR_expanded_m_nouns[:2990] )

In [None]:
# generateTSVfile("emb-pl.tsv", my_embeddings, my_word2id, both_nouns)
# generateLabelFile_color("label-pl.tsv", FR_expanded_f_nouns[:2990] , FR_expanded_m_nouns[:2990] )

# ValNorm Computations

In [None]:
# ValNorm run on the original embeddings (my_embeddings).
# The vocabulary is read from the per-language CSV "data/my_<lang>.csv".
val_file = "data/my_" + lang +".csv"
valnorm_word_list = read_vocab(val_file)
semanticModel = prep_input(my_word2id, my_embeddings)
WEFAT_Results_1 = valnorm.WordEmbeddingFactualAssociationTestVocab(semanticModel,valnorm_word_list, lang)
# Intermediate results go to "valnorm.csv", which the next cell reads.
WEFAT_Results_1.to_csv("valnorm.csv")

In [None]:
# Score the intermediate "valnorm.csv" against val_file, then delete the
# intermediate file.
# NOTE(review): `os` is not among the imports at the top of the notebook -
# confirm it is imported in an earlier cell.
calcValNorm.calculate_valNorm("valnorm.csv",val_file)
os.remove("valnorm.csv") 

In [None]:
# Same ValNorm run, this time on the gender-removed embeddings; reuses
# valnorm_word_list and val_file from the cell for the original embeddings.
semanticModel = prep_input(my_word2id, WEAT_gender_removed_FR_embeddings_2)
WEFAT_Results_1 = valnorm.WordEmbeddingFactualAssociationTestVocab(semanticModel,valnorm_word_list, lang)
WEFAT_Results_1.to_csv("valnorm.csv")

# Score the intermediate file and delete it afterwards.
calcValNorm.calculate_valNorm("valnorm.csv",val_file)
os.remove("valnorm.csv") 

Removing grammatical gender from all words used in ValNorm

# Analogy Task

In [None]:
def solve_analogy(word_1, word_2, word_3, word_4, src_emb, src_id2word, word2id, K=10):
    """Answer the analogy ``word_1 : word_2 :: word_3 : ?`` by cosine similarity.

    The query vector is ``(emb(word_2) - emb(word_1)) + emb(word_3)``. The
    ``K`` rows of ``src_emb`` most similar to it are scanned from best to
    worst, and the first one whose word contains none of the three input
    words as a case-insensitive substring is returned.

    Args:
        word_1: First word of the known pair.
        word_2: Second word of the known pair.
        word_3: First word of the pair to complete.
        word_4: Expected answer; only its presence in ``word2id`` is checked.
        src_emb: 2-D array of embeddings, one row per word id.
        src_id2word: Mapping from row index to word.
        word2id: Mapping from word to row index.
        K: Number of nearest rows to consider.

    Returns:
        The selected word, ``"NULL"`` if any of the four words is missing
        from ``word2id``, or ``""`` if every one of the top ``K`` candidates
        was rejected.
    """
    for query in (word_1, word_2, word_3, word_4):
        if query not in word2id:
            return "NULL"

    base, target, probe = (src_emb[word2id[w]] for w in (word_1, word_2, word_3))
    word_emb = (target - base) + probe

    # Cosine similarity: normalise every row and the query, then take dot products.
    row_norms = np.linalg.norm(src_emb, 2, 1)[:, None]
    unit_rows = src_emb / row_norms
    unit_query = word_emb / np.linalg.norm(word_emb)
    scores = unit_rows.dot(unit_query)

    # Best-first scan that skips candidates echoing one of the input words.
    banned = (word_1.lower(), word_2.lower(), word_3.lower())
    for idx in scores.argsort()[-K:][::-1]:
        candidate = src_id2word[idx]
        lowered = candidate.lower()
        if not any(token in lowered for token in banned):
            return candidate
    return ""
    
def read_test(file_name, my_embeddings, my_id2word, my_word2id):
    """Evaluate the embeddings on an analogy file and return the accuracy.

    Each non-blank line holds four space-separated words. The first three
    are passed to ``solve_analogy`` and its answer is compared with the
    fourth. Lines for which ``solve_analogy`` returns ``"NULL"`` are counted
    as out of vocabulary and excluded from the accuracy.

    Args:
        file_name: Path of the UTF-8 analogy file.
        my_embeddings: Embedding matrix passed on to ``solve_analogy``.
        my_id2word: Row-index-to-word mapping passed on to ``solve_analogy``.
        my_word2id: Word-to-row-index mapping passed on to ``solve_analogy``.

    Returns:
        ``correct / in_vocab``, or ``0.0`` when no line was in vocabulary
        (the original raised ZeroDivisionError in that case).

    Raises:
        ValueError: If a non-blank line has fewer than four fields.
    """
    correct = 0
    in_vocab = 0
    not_in_vocab = 0
    with io.open(file_name, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        for i, line in enumerate(f):
            stripped = line.rstrip()
            if not stripped:
                # A blank line (e.g. at the end of the file) carries no
                # analogy and would otherwise break the 4-way unpack below.
                continue
            word_1, word_2, word_3, word_4 = stripped.split(' ', 3)
            answer = solve_analogy(word_1, word_2, word_3, word_4, my_embeddings, my_id2word, my_word2id)
            print(i, word_1, word_2, word_3, word_4, answer)
            if answer == "NULL":
                not_in_vocab += 1
            else:
                in_vocab += 1
                if answer == word_4:
                    print("here")
                    correct += 1
    print("not in vocab: ", not_in_vocab)
    print("in vocab: ", in_vocab)
    print("correct: ", correct)
    if in_vocab == 0:
        # Nothing could be scored (empty file or no line in vocabulary).
        return 0.0
    return correct/(in_vocab)
             
    
    

In [None]:
# Analogy accuracy of the original embeddings (my_embeddings) on
# "data/eval/<lang>-test.txt".
accuracy_2 = read_test("data/eval/"+lang+"-test.txt",my_embeddings, my_id2word, my_word2id)
print(accuracy_2)