In [None]:
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx
import numpy as np

import data
import model as modelScript
import pickle
import json

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn import metrics
from sklearn.cluster import KMeans

In [None]:
cuda = True  # request the GPU; the device below falls back to CPU when none is present
seed = 20190328

# Set the random seeds manually for reproducibility.
torch.manual_seed(seed)
# scikit-learn estimators without an explicit random_state draw from NumPy's global RNG.
np.random.seed(seed)

if torch.cuda.is_available():
    if not cuda:
        print("WARNING: You have a CUDA device, so you should probably set cuda = True")
elif cuda:
    print("WARNING: cuda = True but no CUDA device is available; falling back to CPU")

# Only select the GPU when it was requested AND actually exists, so that later
# `.to(device)` calls do not fail on CPU-only machines.
device = torch.device("cuda" if cuda and torch.cuda.is_available() else "cpu")

In [None]:
# Directory handed to data.Corpus; the path is relative to the notebook's working directory.
dataDir = './cda_wikitext/cda/'
# Later cells only use corpus.dictionary.word2idx (word -> vocabulary index).
# NOTE(review): presumably Corpus tokenizes the files under dataDir to build that vocabulary - confirm in data.py.
corpus = data.Corpus(dataDir)

In [None]:
# Male reference terms (nouns and pronouns). Their embeddings serve as the
# "male" anchor set when measuring distances to the bias words further down.
MALE_NOUNS = [
    'gentleman', 'man', 'men', 'gentlemen', 'male', 'males',
    'boy', 'boyfriend', 'actor', 'prince', 'king', 'kings',
    'uncle', 'groom', 'widower', 'grandson', 'grandfather', 'monk',
    'boys', 'he', 'his', 'him', 'husband', 'husbands',
    'son', 'sons', 'brother', 'brothers', 'himself',
]

# Female reference terms (nouns and pronouns). Their embeddings serve as the
# "female" anchor set when measuring distances to the bias words further down.
FEMALE_NOUNS = [
    'woman', 'women', 'ladies', 'female', 'females', 'girl',
    'girlfriend', 'actress', 'princess', 'queen', 'queens', 'aunt',
    'bride', 'blonde', 'widow', 'granddaughter', 'grandmother', 'granny',
    'girlfriends', 'girls', 'her', 'hers', 'lady', 'she',
    'wife', 'wives', 'daughter', 'daughters', 'sister', 'sisters',
    'herself',
]

In [None]:
# Unused alternative bias-word lists, left commented out for reference:
# femaleBiased = ['petite', 'mums', 'bra', 'breastfeeding', 'sassy']
# maleBiased = ['rookie', 'burly', 'hero', 'training camp', 'journeyman']

In [None]:
# Female-biased evaluation words, written as one whitespace-separated string
# and split straight into a list of tokens.
femaleBiasWords = (
    "commissioning eggs pregnancy beautiful alien rape damaged clothing dancing badly wear baby "
    "singers artist fired sound dress marry wedding dressed sex Mary independent artists chorus assisted chicks"
).split()
len(femaleBiasWords)

In [None]:
# Male-biased evaluation words, written as one whitespace-separated string
# and split straight into a list of tokens.
maleBiasWords = (
    "travel teammate squad senior secured salary reign promotion policies officer leadership "
    "hosts hat guitar governors goals generals driver doctoral defend defeat deputy chief conservative colleagues "
    "chair bull battalion bat authority appointment ally Museum Academy match succeeded speech retirement selected "
    "heir kingdom camp commander University financial rival castle crown studies manager informed religion arrested"
).split()
len(maleBiasWords)

In [None]:
# Evaluation vocabulary: all female-biased words first, then all male-biased words.
words = [*femaleBiasWords, *maleBiasWords]
# Ground-truth labels parallel to `words`: 0 = female-biased, 1 = male-biased.
labels = [0] * len(femaleBiasWords) + [1] * len(maleBiasWords)

In [None]:
def clusterPerformance():
    """Score how well 2-means clustering of the embeddings recovers the bias labels.

    Looks up every entry of the notebook-level ``words`` list in
    ``corpus.dictionary.word2idx``, passes the indices through
    ``model.encoder``, clusters the resulting vectors into two groups with
    KMeans and compares the cluster assignment with the notebook-level
    ``labels``.

    Relies on these notebook globals being defined before the call:
    ``corpus``, ``model``, ``device``, ``words``, ``labels`` and ``seed``.

    Returns:
        float: fraction of words whose cluster id agrees with their label,
        taken under the better of the two possible cluster-to-label
        mappings (cluster ids are arbitrary), so the value is >= 0.5.

    Raises:
        KeyError: if a word is missing from ``corpus.dictionary.word2idx``.
    """
    word_indexes = torch.tensor([corpus.dictionary.word2idx[w] for w in words]).to(device)
    # Pure inference: no need to record an autograd graph for the lookup.
    with torch.no_grad():
        vectors = model.encoder(word_indexes)
    vectors = vectors.cpu().detach().numpy()
    print(vectors.shape)
    # Fix random_state so the score is reproducible from run to run, and pin
    # n_init because its default differs between scikit-learn versions.
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=seed)
    kmeans.fit(vectors)
    p = float(np.mean(kmeans.labels_ == np.asarray(labels)))
    # KMeans may name the clusters either way round; report the better match.
    return max(p, 1 - p)

In [None]:
modelDir = ''  # directory holding the saved *.pt checkpoints; '' means the current directory
# os.listdir('') raises FileNotFoundError, so list '.' when modelDir is empty.
# The names are sorted because os.listdir returns them in arbitrary order and
# a later cell picks a checkpoint by its position in this list.
modelFiles = sorted(m for m in os.listdir(modelDir or '.') if m.endswith('.pt'))
modelFiles

In [None]:
# Path of the checkpoint to analyse (the entry at index 3 of the listing).
# os.path.join works whether or not modelDir ends with a path separator,
# and yields the bare file name when modelDir is empty.
save = os.path.join(modelDir, modelFiles[3])

In [None]:
# NOTE: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
with open(save, 'rb') as f:
    # map_location moves the stored tensors onto `device`, so a checkpoint
    # saved on a GPU can still be opened on a CPU-only machine (and vice versa).
    model = torch.load(f, map_location=device)
    # after load the rnn params are not a continuous chunk of memory
    # this makes them a continuous chunk, and will speed up forward pass
    model.rnn.flatten_parameters()

In [None]:
# Cluster the loaded model's vectors for `words` into two groups and show
# how well that split matches `labels`.
clusterPerformance()

In [None]:
# Vocabulary-index tensors (placed on `device`) for the bias words and for the
# male / female reference nouns, built with one shared lookup expression.
word_indexes, male_indexes, female_indexes = (
    torch.tensor([corpus.dictionary.word2idx[token] for token in group]).to(device)
    for group in (words, MALE_NOUNS, FEMALE_NOUNS)
)

In [None]:
# Run each index tensor through the model's encoder with autograd disabled.
with torch.no_grad():
    word_vectors, male_vectors, female_vectors = (
        model.encoder(index_batch)
        for index_batch in (word_indexes, male_indexes, female_indexes)
    )

In [None]:
def distance(v1,v2):
    """Return the Euclidean (L2) distance between two tensors as a Python number.

    The element-wise difference is squared, summed over all elements and
    raised to the power 0.5; ``.item()`` converts the scalar tensor result.
    """
    squared_gap = (v1 - v2).pow(2)
    return squared_gap.sum().pow(0.5).item()

In [None]:
# For every bias-word vector, the mean Euclidean distance to all male
# reference vectors (m_a_d) and to all female reference vectors (f_a_d).
# Both lists are parallel to `words` / `labels`.
m_a_d = [
    sum(distance(vec, ref) for ref in male_vectors) / len(male_vectors)
    for vec in word_vectors
]
f_a_d = [
    sum(distance(vec, ref) for ref in female_vectors) / len(female_vectors)
    for vec in word_vectors
]

In [None]:
# Average distance from the female-biased words (label 0) to the male nouns.
femaleMaleAvgDist = [avg_dist for avg_dist, tag in zip(m_a_d, labels) if tag == 0]
femaleMaleAvgDist = sum(femaleMaleAvgDist) / len(femaleMaleAvgDist)
femaleMaleAvgDist

In [None]:
# Average distance from the male-biased words (label 1) to the male nouns.
maleMaleAvgDist = [avg_dist for avg_dist, tag in zip(m_a_d, labels) if tag == 1]
maleMaleAvgDist = sum(maleMaleAvgDist) / len(maleMaleAvgDist)
maleMaleAvgDist

In [None]:
# Average distance from the female-biased words (label 0) to the FEMALE nouns.
# This cell used to repeat the male-noun computation verbatim, which left
# f_a_d computed but never reported; it now reads f_a_d and stores the result
# under its own name, so femaleMaleAvgDist keeps its value.
femaleFemaleAvgDist = [f for f, l in zip(f_a_d, labels) if l == 0]
femaleFemaleAvgDist = sum(femaleFemaleAvgDist) / len(femaleFemaleAvgDist)
femaleFemaleAvgDist

In [None]:
# Average distance from the male-biased words (label 1) to the FEMALE nouns.
# This cell used to repeat the male-noun computation verbatim, which left
# f_a_d computed but never reported; it now reads f_a_d and stores the result
# under its own name, so maleMaleAvgDist keeps its value.
maleFemaleAvgDist = [f for f, l in zip(f_a_d, labels) if l == 1]
maleFemaleAvgDist = sum(maleFemaleAvgDist) / len(maleFemaleAvgDist)
maleFemaleAvgDist