In [2]:
import sys
sys.path.append("../src")
sys.path.append("../data/embeddings")
sys.path.append("../data/embeddings/biasbios")
import classifier
import svm_classifier
import debias
import gensim
import codecs
import json
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.models import KeyedVectors
import numpy as np
import random
import sklearn
from sklearn import model_selection
from sklearn import cluster
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import DictVectorizer
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

import scipy
from scipy import linalg
from scipy import sparse
from scipy.stats.stats import pearsonr
import tqdm
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, SGDRegressor, Perceptron, LogisticRegression

%matplotlib inline
matplotlib.rcParams['agg.path.chunksize'] = 10000

import warnings
warnings.filterwarnings("ignore")

import pickle
from collections import defaultdict, Counter
from typing import List, Dict

import torch
from torch import utils

import pytorch_lightning as pl
from pytorch_lightning import Trainer
import copy

import seaborn as sn
import pandas as pd

In [3]:
def load_data(fname = "../data/BIOS.pkl"):
    """Read the pickle file at `fname` and return the object stored in it.

    Args:
        fname: path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(fname, "rb") as handle:
        dataset = pickle.load(handle)
    return dataset
    
def count_profs_and_gender(data: List[dict]):
    """Tally how many entries of each gender exist for every profession.

    Args:
        data: entries that each provide a "gender" and a "raw_title" string.

    Returns:
        A defaultdict mapping the lower-cased profession title to a Counter
        keyed by the lower-cased gender string.
    """
    tally = defaultdict(Counter)

    for record in data:
        sex = record["gender"]
        title = record["raw_title"]
        # Both keys are lower-cased so differently-cased spellings share one bucket.
        tally[title.lower()][sex.lower()] += 1

    return tally

def filter_dataset(data, topk = 10):
    """Keep only the entries whose profession is among the `topk` most frequent.

    A profession's frequency is the sum of its "f" and "m" counts as reported
    by count_profs_and_gender; other gender keys do not contribute. Ties keep
    the order in which the professions were first seen.

    Args:
        data: entries that each provide "raw_title" and "gender".
        topk: number of most frequent professions to keep.

    Returns:
        A new list with the retained entries, in their original order.
        The selected professions are also printed.
    """
    gender_counts = count_profs_and_gender(data)
    frequencies = [
        (name, per_gender["f"] + per_gender["m"])
        for name, per_gender in gender_counts.items()
    ]
    # Stable descending sort: equal counts stay in first-seen order.
    ranked = sorted(frequencies, key = lambda pair: pair[1], reverse = True)
    topk_profs = [name for name, _ in ranked[:topk]]

    print("Top-k professions: {}".format(topk_profs))

    return [d for d in data if d["raw_title"].lower() in topk_profs]
    
def split_train_dev_test(data):
    """Index the dataset and split it into train / dev / test portions.

    Professions (lower-cased "raw_title") and words (the "raw" text split on
    single spaces, plus the "<UNK>" token) are numbered in sorted order.
    Every entry becomes a dict with keys "g" (gender id), "p" (profession id),
    "text" (the raw text) and "start" (the entry's "start_pos").

    The split holds out 20% of the entries as test data and then 30% of the
    remainder as dev data, both with random_state = 0.

    Returns:
        ((train, dev, test), (g2i, i2g, p2i, i2p, w2i, i2w))
    """
    g2i = {"m": 0, "f": 1}
    i2g = {1: "f", 0: "m"}

    ordered_profs = sorted({d["raw_title"].lower() for d in data})

    vocabulary = set()
    for d in data:
        vocabulary.update(d["raw"].split(" "))
    vocabulary.add("<UNK>")
    ordered_words = sorted(vocabulary)

    p2i = {prof: idx for idx, prof in enumerate(ordered_profs)}
    i2p = dict(enumerate(ordered_profs))
    w2i = {word: idx for idx, word in enumerate(ordered_words)}
    i2w = dict(enumerate(ordered_words))

    all_data = []
    for entry in tqdm.tqdm(data, total = len(data)):
        gender = entry["gender"].lower()
        prof = entry["raw_title"].lower()
        raw = entry["raw"]
        start_index = entry["start_pos"]
        indexed = {"g": g2i[gender], "p": p2i[prof], "text": raw, "start": start_index}
        all_data.append(indexed)

    # Two-stage split: first carve out the test set, then divide the rest.
    train_dev, test = sklearn.model_selection.train_test_split(all_data, test_size = 0.2, random_state = 0)
    train, dev = sklearn.model_selection.train_test_split(train_dev, test_size = 0.3, random_state = 0)

    print("Train size: {}; Dev size: {}; Test size: {}".format(len(train), len(dev), len(test)))
    return (train, dev, test), (g2i, i2g, p2i, i2p, w2i, i2w)


In [4]:
# Load the pickled biographies from the default path.
data = load_data()
# Gender counts per profession, computed on the full (unfiltered) data.
counter = count_profs_and_gender(data)
# Keep only the most frequent professions. NOTE: this rebinds `data` to the
# filtered list. In the recorded run only 41 professions are printed, so
# topk = 60 appears to retain every profession present.
data = filter_dataset(data, topk = 60)
# Index the entries and create the train / dev / test split plus the lookup tables.
(train, dev, test), (g2i, i2g, p2i, i2p, w2i, i2w) = split_train_dev_test(data)

Top-k professions: ['associate professor', 'assistant professor', 'physician', 'attorney', 'photographer', 'journalist', 'nurse', 'teacher', 'psychologist', 'architect', 'dentist', 'painter', 'poet', 'model', 'filmmaker', 'composer', 'software engineer', 'surgeon', 'comedian', 'dietitian', 'accountant', 'psychotherapist', 'pastor', 'orthopedic surgeon', 'trial lawyer', 'chiropractor', 'plastic surgeon', 'trial attorney', 'paralegal', 'senior software engineer', 'interior designer', 'dj', 'rapper', 'certified public accountant', 'cpa', 'neurosurgeon', 'yoga teacher', 'nutritionist', 'personal trainer', 'certified personal trainer', 'yoga instructor']


100%|██████████| 236263/236263 [00:00<00:00, 840322.61it/s]


Train size: 132307; Dev size: 56703; Test size: 47253


In [5]:
# Display the per-profession gender counts (computed before filter_dataset was applied).
counter

defaultdict(collections.Counter,
            {'assistant professor': Counter({'f': 18061, 'm': 20401}),
             'certified public accountant': Counter({'m': 514, 'f': 292}),
             'journalist': Counter({'f': 6514, 'm': 6733}),
             'architect': Counter({'m': 4807, 'f': 1392}),
             'photographer': Counter({'m': 10230, 'f': 5301}),
             'psychologist': Counter({'m': 3605, 'f': 5350}),
             'teacher': Counter({'f': 5658, 'm': 3982}),
             'nurse': Counter({'f': 10303, 'm': 1022}),
             'associate professor': Counter({'f': 16243, 'm': 22294}),
             'attorney': Counter({'f': 6857, 'm': 10430}),
             'software engineer': Counter({'m': 2896, 'f': 520}),
             'trial attorney': Counter({'m': 753, 'f': 241}),
             'painter': Counter({'f': 2149, 'm': 2548}),
             'physician': Counter({'m': 8291, 'f': 11752}),
             'chiropractor': Counter({'m': 897, 'f': 365}),
             'personal traine

### Get input representations

In [6]:
def load_word_vectors(fname = "../data/embeddings/vecs.filtered.with_gendered.glove.txt"):
    """Load text-format word2vec-style embeddings from `fname`.

    Args:
        fname: path of an embedding file in (non-binary) word2vec format.

    Returns:
        A tuple (model, vecs, words): the loaded KeyedVectors object, its
        `vectors` attribute, and the list of keys of its `vocab` mapping.
    """
    keyed_vectors = KeyedVectors.load_word2vec_format(fname, binary=False)
    matrix = keyed_vectors.vectors
    vocabulary = [word for word in keyed_vectors.vocab.keys()]
    return keyed_vectors, matrix, vocabulary

def get_embeddings_based_dataset(data: List[dict], word2vec_model):
    """Represent every entry as the sum of its word vectors.

    For each entry the text after position `start` (i.e. `text[start + 1:]`)
    is split on single spaces; every word is looked up in `word2vec_model`,
    falling back to the vector stored under 'unk' for words the model lacks.
    The per-word vectors are summed into one feature vector.

    Args:
        data: entries with keys "text", "start" and "p" (the label).
        word2vec_model: object supporting `w in model` and `model[w]`.

    Returns:
        (X, Y): a list of summed vectors and the list of labels, aligned by
        position. Both are empty when `data` is empty.

    Raises:
        KeyError: if a word is missing and the model has no 'unk' entry.

    Side effects:
        Prints the fraction of words that were not found in the model
        (0.0 when there were no words at all).
    """
    X, Y = [], []
    unk, total = 0., 0.

    for entry in tqdm.tqdm(data, total = len(data)):
        text, start, y = entry["text"], entry["start"], entry["p"]
        words = text[start + 1:].split(" ")

        # One membership test per word serves both the lookup and the statistics.
        vectors = []
        for w in words:
            if w in word2vec_model:
                vectors.append(word2vec_model[w])
            else:
                vectors.append(word2vec_model['unk'])
                unk += 1

        X.append(np.sum(vectors, axis = 0))
        Y.append(y)
        total += len(words)

    # Guard the ratio so an empty dataset no longer raises ZeroDivisionError.
    print("% unknown: {}".format(unk / total if total else 0.0))
    return X,Y

def get_BOW_based_dataset(data: List[dict], w2i):
    """Build a sparse bag-of-words count matrix plus the label list.

    Row r of X describes entry r of `data`; column j holds how many times the
    word whose index in `w2i` is j occurs in the lower-cased text after
    position `start` (i.e. `text.lower()[start + 1:]` split on single spaces).
    Words missing from `w2i` are counted in the `w2i["<UNK>"]` column.

    The column space is defined by `w2i` alone, so matrices produced for
    different splits (e.g. train and dev) share the same feature layout, and
    the stored values are counts rather than vocabulary indices.

    NOTE(review): the text is lower-cased before lookup. If `w2i` was built
    from text that was not lower-cased, capitalised vocabulary entries can
    never match and are counted as unknown. Confirm which casing is intended.

    Args:
        data: entries with keys "text", "start" and "p" (the label).
        w2i: mapping from word to a non-negative integer column index; must
            contain "<UNK>".

    Returns:
        (X, Y): a scipy CSR matrix of shape (len(data), max index + 1) with
        float64 counts, and the list of labels.

    Raises:
        KeyError: if `w2i` has no "<UNK>" entry.

    Side effects:
        Prints the fraction of words that were not found in `w2i`
        (0.0 when there were no words at all).
    """
    unk_index = w2i["<UNK>"]
    n_features = max(w2i.values(), default = -1) + 1

    row_ids, col_ids, Y = [], [], []
    unk, total = 0., 0.

    for row, entry in enumerate(tqdm.tqdm(data, total = len(data))):
        text, start, y = entry["text"], entry["start"], entry["p"]
        text = text.lower()
        words = text[start + 1:].split(" ")

        for w in words:
            if w in w2i:
                col_ids.append(w2i[w])
            else:
                col_ids.append(unk_index)
                unk += 1
            row_ids.append(row)

        Y.append(y)
        total += len(words)

    # Guard the ratio so an empty dataset does not raise ZeroDivisionError.
    print("% unknown: {}".format(unk / total if total else 0.0))

    # One unit entry per token; the CSR constructor sums duplicate (row, col)
    # pairs, which turns the unit entries into per-word counts.
    counts = np.ones(len(row_ids), dtype = np.float64)
    coords = (np.asarray(row_ids, dtype = np.int64), np.asarray(col_ids, dtype = np.int64))
    X = sparse.csr_matrix((counts, coords), shape = (len(Y), n_features))
    return X,Y
    
def get_bert_based_dataset(data: List[dict]):
    """Encode every entry with pretrained BERT and return features and labels.

    For each entry the text after position `start` (`text[start + 1:]`, the
    same slice the other dataset builders in this notebook use) is tokenized,
    wrapped in the tokenizer's [CLS]/[SEP] tokens and run through
    'bert-base-uncased'. The last-layer hidden state at the [CLS] position is
    used as the entry's feature vector.

    Requires a CUDA device: the model and every input tensor are moved to the GPU.

    Args:
        data: entries with keys "text", "start" and "p" (the label).

    Returns:
        (X, Y): a list of 1-D numpy arrays (one per entry) and the list of
        labels, aligned by position.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased').cuda()
    model.eval()

    # Leave room for [CLS] and [SEP] within the model's positional limit.
    max_tokens = model.config.max_position_embeddings - 2

    X, Y = [], []

    for entry in tqdm.tqdm(data, total = len(data)):
        text, start, y = entry["text"], entry["start"], entry["p"]
        sentence_str = text[start + 1:]

        tokenized_text = tokenizer.tokenize(sentence_str)[:max_tokens]
        tokenized_text = [tokenizer.cls_token] + tokenized_text + [tokenizer.sep_token]
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # The input must live on the same device as the model.
        tokens_tensor = torch.tensor([indexed_tokens]).cuda()
        with torch.no_grad():
            outputs = model(tokens_tensor)

        # outputs[0] is the last-layer hidden states, shaped (batch, tokens, hidden);
        # index [0, 0] selects the [CLS] position of the single sequence in the batch.
        last_hidden = outputs[0]
        X.append(last_hidden[0, 0].cpu().numpy())
        Y.append(y)

    return X, Y
            

In [7]:
# Load the wiki-news 300d subword vectors (overrides load_word_vectors' default path).
word2vec, vecs, words = load_word_vectors("../data/embeddings/wiki-news-300d-1M-subword.vec") #load_word2vec()
#word2vec.init_sims(replace = True)
# Alternative representation (disabled): sparse bag-of-words features.
#X_train, Y_train = get_BOW_based_dataset(train, w2i)
#X_dev, Y_dev = get_BOW_based_dataset(dev, w2i) 
# Active representation: summed word vectors per entry. In the recorded run
# roughly 10.6% of the words were not found in the embedding model.
X_train, Y_train = get_embeddings_based_dataset(train, word2vec)
X_dev, Y_dev =  get_embeddings_based_dataset(dev, word2vec)

100%|██████████| 132307/132307 [00:22<00:00, 5880.23it/s]
  2%|▏         | 1186/56703 [00:00<00:09, 5925.31it/s]

% unknown: 0.10565114291350908


100%|██████████| 56703/56703 [00:09<00:00, 5702.98it/s]

% unknown: 0.10572557408218906





In [None]:
# Sanity check: shape of a single dev-set feature vector.
X_dev[1].shape

In [8]:
# Profession classifier: one-vs-rest logistic regression trained with the SAG solver.
# Alternative tried earlier (disabled): a linear SVM.
#clf = LinearSVC(verbose = 10) #LogisticRegression()
# NOTE(review): n_jobs = 32 is hard-coded, and the recorded output prints
# "max_iter reached" for the fits, i.e. the solver stopped at its iteration
# limit. Consider raising max_iter and/or scaling the features; confirm convergence.
clf = LogisticRegression(solver = "sag", multi_class = 'ovr', verbose = 10, n_jobs = 32)
clf.fit(X_train, Y_train)
# Accuracy on the dev set.
print(clf.score(X_dev, Y_dev))

[Parallel(n_jobs=32)]: Using backend ThreadingBackend with 32 concurrent workers.


max_iter reached after 1222 seconds
max_iter reached after 1231 seconds
max_iter reached after 1231 seconds


[Parallel(n_jobs=32)]: Done   3 out of  41 | elapsed: 20.5min remaining: 260.0min


max_iter reached after 1232 seconds
max_iter reached after 1233 seconds
max_iter reached after 1233 seconds
max_iter reached after 1233 seconds
max_iter reached after 1234 seconds


[Parallel(n_jobs=32)]: Done   8 out of  41 | elapsed: 20.6min remaining: 84.8min


max_iter reached after 1234 seconds
max_iter reached after 1235 seconds
max_iter reached after 1235 seconds
max_iter reached after 1235 seconds
max_iter reached after 1236 seconds


[Parallel(n_jobs=32)]: Done  13 out of  41 | elapsed: 20.6min remaining: 44.4min


max_iter reached after 1238 seconds
max_iter reached after 1238 seconds
max_iter reached after 1238 seconds
max_iter reached after 1238 seconds
max_iter reached after 1239 seconds


[Parallel(n_jobs=32)]: Done  18 out of  41 | elapsed: 20.6min remaining: 26.4min


max_iter reached after 1239 seconds
max_iter reached after 1239 seconds
max_iter reached after 1239 seconds
max_iter reached after 1239 seconds
max_iter reached after 1240 seconds
max_iter reached after 1240 seconds


[Parallel(n_jobs=32)]: Done  23 out of  41 | elapsed: 20.7min remaining: 16.2min


max_iter reached after 1240 seconds
max_iter reached after 1241 seconds
max_iter reached after 1241 seconds
max_iter reached after 1241 seconds


[Parallel(n_jobs=32)]: Done  28 out of  41 | elapsed: 20.7min remaining:  9.6min


max_iter reached after 1242 seconds
max_iter reached after 1243 seconds
max_iter reached after 1246 seconds
max_iter reached after 1248 seconds
max_iter reached after 322 seconds
max_iter reached after 334 seconds


[Parallel(n_jobs=32)]: Done  33 out of  41 | elapsed: 25.9min remaining:  6.3min


max_iter reached after 326 seconds
max_iter reached after 325 seconds
max_iter reached after 324 seconds
max_iter reached after 327 seconds
max_iter reached after 325 seconds
max_iter reached after 325 seconds


[Parallel(n_jobs=32)]: Done  38 out of  41 | elapsed: 26.0min remaining:  2.0min


max_iter reached after 326 seconds


[Parallel(n_jobs=32)]: Done  41 out of  41 | elapsed: 26.0min finished


0.5980283230164188


In [9]:
# Accuracy on the training set, for comparison with the dev-set accuracy.
print(clf.score(X_train, Y_train))

0.6056595644977212


In [10]:
# Predict professions for the dev set.
y_hat = clf.predict(X_dev)
# Pass the full list of class ids so the matrix always has one row/column per
# profession in i2p, even if some class is absent from both Y_dev and y_hat.
# Without this the matrix could be smaller than `labels`.
cm = sklearn.metrics.confusion_matrix(Y_dev, y_hat, labels = list(range(len(i2p))))
# Profession names ordered by class id, used to label the matrix axes.
labels = [i2p[i] for i in range(len(i2p))]

In [None]:


# Wrap the confusion matrix in a DataFrame so both heatmap axes show profession
# names (rows: true class, columns: predicted class, per scikit-learn's convention).
df_cm = pd.DataFrame(cm, index = labels, columns = labels)
# Small font so the many class labels and cell annotations fit.
sn.set(font_scale=0.3)#for label size
plt.figure(figsize = (10,7))
# annot=True writes the count into every cell; fmt='g' avoids scientific notation.
sn.heatmap(df_cm, annot=True, fmt='g')
# Save at high resolution before showing the figure.
plt.savefig("confusion.png", dpi = 600)
plt.show()