In [1]:
import sys
sys.path.append("../src")
sys.path.append("../data/embeddings")
sys.path.append("../data/embeddings/biasbios")
import classifier
import svm_classifier
import debias
import gensim
import codecs
import json
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.models import KeyedVectors
import numpy as np
import random
import sklearn
from sklearn import model_selection
from sklearn import cluster
from sklearn import metrics
from sklearn.manifold import TSNE
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction import DictVectorizer
from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM

import scipy
from scipy import linalg
from scipy import sparse
from scipy.stats.stats import pearsonr
import tqdm
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, SGDRegressor, Perceptron, LogisticRegression

%matplotlib inline
matplotlib.rcParams['agg.path.chunksize'] = 10000

import warnings
warnings.filterwarnings("ignore")

import pickle
from collections import defaultdict, Counter
from typing import List, Dict

import torch
from torch import utils

import pytorch_lightning as pl
from pytorch_lightning import Trainer
import copy

In [2]:
def load_data(fname = "../data/data.biosbias.pickle"):
    """Unpickle and return the object stored in the file at `fname`.

    The file is opened in binary mode and closed again before returning.

    NOTE: `pickle.load` can execute arbitrary code embedded in the file, so
    only point this at files from a trusted source.
    """
    with open(fname, "rb") as handle:
        payload = pickle.load(handle)
    return payload
    
def count_profs_and_gender(data: List[dict]):
    """Tally gender labels per profession.

    Each entry must provide the keys "gender" and "raw_title"; both values
    are lower-cased before counting.

    Returns a `defaultdict(Counter)` mapping
    profession -> Counter(gender -> number of entries).
    """
    tallies = defaultdict(Counter)
    for record in data:
        gender_label = record["gender"]
        profession = record["raw_title"]
        by_gender = tallies[profession.lower()]
        by_gender[gender_label.lower()] += 1
    return tallies

def filter_dataset(data, topk = 10):
    """Keep only the entries whose profession is among the `topk` most frequent.

    A profession's frequency is the sum of its "f" and "m" tallies as
    produced by `count_profs_and_gender`; other gender labels do not
    contribute. Ties keep the order in which professions were first seen.
    The selected professions are printed before filtering.

    Returns a new list with the surviving entries in their original order.
    """
    counter = count_profs_and_gender(data)
    totals = [(prof, by_gender["f"] + by_gender["m"]) for prof, by_gender in counter.items()]
    # Stable descending sort by frequency.
    ranked = sorted(totals, key = lambda pair: pair[1], reverse = True)
    topk_profs = [prof for prof, _ in ranked[:topk]]

    print("Top-k professions: {}".format(topk_profs))
    return [d for d in data if d["raw_title"].lower() in topk_profs]
    
def split_train_dev_test(data):
    """Encode the entries and split them into train / dev / test lists.

    Every entry is turned into a dict with the keys
      "g"     - gender index (looked up in `g2i`),
      "p"     - profession index (looked up in `p2i`),
      "text"  - the entry's "raw" value,
      "start" - the entry's "start_pos" value.

    20% of the encoded entries are held out as test; 30% of the remainder
    becomes dev and the rest train. Both splits use `random_state = 0`.

    Returns `((train, dev, test), (g2i, i2g, p2i, i2p, w2i, i2w))` where the
    second tuple holds the gender, profession and word lookup tables in both
    directions. Profession and word indices follow sorted order, and the
    word tables contain the extra token "<UNK>".
    """
    g2i = {"m": 0, "f": 1}
    i2g = {1: "f", 0: "m"}

    professions = sorted({d["raw_title"].lower() for d in data})

    # NOTE(review): the vocabulary is built from the un-lowercased "raw" text,
    # while get_BOW_based_dataset lowercases before looking words up in w2i -
    # confirm this casing mismatch is intended.
    vocabulary = set()
    for d in data:
        vocabulary.update(d["raw"].split(" "))
    vocabulary.add("<UNK>")
    vocabulary = sorted(vocabulary)

    p2i = {prof: idx for idx, prof in enumerate(professions)}
    i2p = dict(enumerate(professions))
    w2i = {word: idx for idx, word in enumerate(vocabulary)}
    i2w = dict(enumerate(vocabulary))

    all_data = []
    for entry in tqdm.tqdm(data, total = len(data)):
        gender = entry["gender"].lower()
        prof = entry["raw_title"].lower()
        raw = entry["raw"]
        start_index = entry["start_pos"]
        encoded = {"g": g2i[gender], "p": p2i[prof], "text": raw, "start": start_index}
        all_data.append(encoded)

    split = sklearn.model_selection.train_test_split
    train_dev, test = split(all_data, test_size = 0.2, random_state = 0)
    train, dev = split(train_dev, test_size = 0.3, random_state = 0)
    print("Train size: {}; Dev size: {}; Test size: {}".format(len(train), len(dev), len(test)))

    splits = (train, dev, test)
    lookups = (g2i, i2g, p2i, i2p, w2i, i2w)
    return splits, lookups


In [7]:
# Load the full dataset and tally gender labels per profession. The tally is
# taken before filtering, so `counter` covers every profession in the file.
data = load_data()
counter = count_profs_and_gender(data)
# Keep only the 28 most frequent professions; `data` is rebound to the filtered list.
data = filter_dataset(data, topk = 28)
# Encode the entries, split them, and unpack the lookup tables used by later cells.
(train, dev, test), (g2i, i2g, p2i, i2p, w2i, i2w) = split_train_dev_test(data)

Top-k professions: ['associate professor', 'assistant professor', 'attorney', 'journalist', 'photographer', 'teacher', 'psychologist', 'physician', 'architect', 'poet', 'nurse', 'painter', 'filmmaker', 'composer', 'model', 'software engineer', 'dentist', 'surgeon', 'comedian', 'psychotherapist', 'dietitian', 'pastor', 'plastic surgeon', 'accountant', 'trial lawyer', 'orthopedic surgeon', 'chiropractor', 'trial attorney']


100%|██████████| 148816/148816 [00:00<00:00, 671866.62it/s]


Train size: 83336; Dev size: 35716; Test size: 29764


### Get input representations

In [8]:
def load_word_vectors(fname = "../data/embeddings/vecs.filtered.with_gendered.glove.txt"):
    """Load word vectors stored in the textual word2vec format.

    Args:
        fname: path to a non-binary word2vec-format file.

    Returns:
        `(model, vecs, words)`: the loaded `KeyedVectors` model, its
        `vectors` attribute, and the list of words in its vocabulary mapping.
    """
    model = KeyedVectors.load_word2vec_format(fname, binary=False)
    vecs = model.vectors
    # gensim >= 4 renamed the `vocab` mapping to `key_to_index` and raises on
    # access to `vocab`; older releases only have `vocab`. Support both.
    if hasattr(model, "key_to_index"):
        vocab = model.key_to_index
    else:
        vocab = model.vocab
    words = list(vocab.keys())
    return model, vecs, words

def get_embeddings_based_dataset(data: List[dict], word2vec_model):
    """Represent every entry as the sum of the embeddings of its words.

    For each entry the "text" value is lower-cased, everything up to and
    including position "start" is dropped, and the remainder is split on
    single spaces. Words missing from `word2vec_model` are ignored.

    Args:
        data: entries with the keys "text", "start" and "p".
        word2vec_model: object supporting `w in model` and `model[w]`.

    Returns:
        `(X, Y)`: a list with one summed vector per entry and the list of
        the entries' "p" labels.

    An entry with no known word gets a zero vector shaped like the other
    rows, so `X` can always be stacked into a 2-D array. Summing an empty
    list would instead yield the scalar 0.0 and make `X` ragged.
    """
    X, Y = [], []
    zero = None          # zero vector shaped like a real row, built lazily
    empty_rows = []      # positions in X that still need a zero vector

    for entry in data:
        text, start, y = entry["text"], entry["start"], entry["p"]
        text = text.lower()
        words = text[start + 1:].split(" ")
        vectors = [word2vec_model[w] for w in words if w in word2vec_model]
        if vectors:
            bagofwords = np.sum(vectors, axis = 0)
            if zero is None:
                zero = np.zeros_like(bagofwords)
        else:
            bagofwords = None
            empty_rows.append(len(X))
        X.append(bagofwords)
        Y.append(y)

    if empty_rows:
        if zero is None:
            # No entry had a known word: take the size from the model if it
            # exposes one, otherwise fall back to the scalar an empty sum gives.
            size = getattr(word2vec_model, "vector_size", None)
            if size is not None:
                zero = np.zeros(size, dtype = np.float32)
            else:
                zero = np.sum([], axis = 0)
        for row in empty_rows:
            X[row] = zero.copy()

    return X,Y

def get_BOW_based_dataset(data: List[dict], w2i):
    """Build a sparse bag-of-words indicator matrix with columns fixed by `w2i`.

    For each entry the "text" value is lower-cased, everything up to and
    including position "start" is dropped, and the remainder is split on
    single spaces. Row r has a 1.0 in column `w2i[w]` for every distinct
    word w of entry r; words missing from `w2i` share the `w2i["<UNK>"]`
    column.

    The column layout depends only on `w2i`, so matrices built for different
    splits (train / dev / test) share one feature space and can be fed to
    the same classifier.

    Args:
        data: entries with the keys "text", "start" and "p".
        w2i: mapping word -> non-negative integer column index; must contain
            the key "<UNK>".

    Returns:
        `(X, Y)`: a `scipy.sparse` CSR matrix of shape
        `(len(data), max(w2i.values()) + 1)` and the list of "p" labels.
    """
    unk_index = w2i["<UNK>"]
    n_cols = max(w2i.values()) + 1

    row_ids, col_ids, Y = [], [], []

    for row, entry in enumerate(tqdm.tqdm(data, total = len(data))):
        text, start, y = entry["text"], entry["start"], entry["p"]
        text = text.lower()
        words = text[start + 1:].split(" ")
        # A set keeps one indicator per distinct column for this row.
        columns = {w2i.get(w, unk_index) for w in words}
        row_ids.extend([row] * len(columns))
        col_ids.extend(sorted(columns))
        Y.append(y)

    values = np.ones(len(row_ids), dtype = np.float64)
    X = sparse.csr_matrix((values, (row_ids, col_ids)), shape = (len(data), n_cols), dtype = np.float64)
    return X,Y
    
def get_bert_based_dataset(data: List[dict], max_len: int = 512):
    """Encode every entry with `bert-base-uncased` and return one vector per entry.

    For each entry the "text" value is lower-cased and everything up to and
    including position "start" is dropped, as the other dataset builders in
    this notebook do. The remainder is word-piece tokenized, wrapped in
    "[CLS]" / "[SEP]", and truncated so that the sequence has at most
    `max_len` tokens. The final-layer hidden state of the "[CLS]" token is
    used as the entry's representation.

    Args:
        data: entries with the keys "text", "start" and "p".
        max_len: maximum number of tokens fed to the model, including the
            two special tokens.

    Returns:
        `(X, Y)`: a list of 1-D numpy arrays (one per entry) and the list of
        the entries' "p" labels.

    The model runs on CUDA when available and on CPU otherwise.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    X, Y = [], []

    for entry in tqdm.tqdm(data, total = len(data)):
        text, start, y = entry["text"], entry["start"], entry["p"]
        text = text.lower()
        tokenized_text = tokenizer.tokenize(text[start + 1:])
        # Leave room for the two special tokens within the length limit.
        tokenized_text = ["[CLS]"] + tokenized_text[:max_len - 2] + ["[SEP]"]
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        # The input must live on the same device as the model.
        tokens_tensor = torch.tensor([indexed_tokens], device = device)
        with torch.no_grad():
            outputs = model(tokens_tensor)
        # outputs[0] holds the final-layer hidden states; [0, 0] selects the
        # single batch item and its first ("[CLS]") position.
        X.append(outputs[0][0, 0].cpu().numpy())
        Y.append(y)

    return X,Y
            

In [None]:
# Load pre-trained word vectors. In this cell they are only referenced by the
# commented-out embedding-based alternative below.
word2vec, vecs, words = load_word_vectors("../data/embeddings/wiki-news-300d-1M-subword.vec") #load_word2vec()
# Bag-of-words features and profession labels for the train and dev splits.
X_train, Y_train = get_BOW_based_dataset(train, w2i)
X_dev, Y_dev = get_BOW_based_dataset(dev, w2i) 
# Alternative input representation: summed word embeddings.
#X_train, Y_train = get_embeddings_based_dataset(train, word2vec)
#X_dev, Y_dev =  get_embeddings_based_dataset(dev, word2vec)

100%|██████████| 83336/83336 [00:01<00:00, 42369.55it/s]


In [None]:
# Fit a logistic-regression profession classifier on the training features
# and print its score on the dev split (for sklearn classifiers, `score` is
# mean accuracy).
#clf = LinearSVC(verbose = 10) #LogisticRegression()
clf = LogisticRegression(solver = "sag", verbose = 10, n_jobs = 4)
clf.fit(X_train, Y_train)
print(clf.score(X_dev, Y_dev))