In [1]:
import csv
import json
import pickle
import h5py
import requests
import os
from datetime import datetime
import random
import numpy as np

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import *

In [3]:
dpath = '/scratch1/rpujari/gcr_workspace/data/'

## Loading Datasets

In [21]:
dset_names = ['jigsaw-dataset', 'hate-speech-dataset', 'hate-speech-and-offensive-language', 'ami-ibereval-dataset']

In [22]:
def load_jigsaw_data(data_path):
    train_file = list(csv.reader(open(data_path + '/train.csv')))

    train_data = []
    labels = {}

    for rnum, row in enumerate(train_file):
        if rnum == 0:
            keys = row
            for key in keys[2:]:
                labels[key] = set()
        else:
            eg = {}
            for i, val in enumerate(row):
                eg[keys[i]] = val
                if keys[i] in labels:
                    labels[keys[i]].add(val)
            train_data.append(eg)


    test_file = list(csv.reader(open(data_path + '/test.csv')))
    test_labels = list(csv.reader(open(data_path + '/test_labels.csv')))

    rnum = 0
    test_data = []
    tlabels = {}
    for row1, row2 in zip(test_file, test_labels):
        if rnum == 0:
            keys = row1 + row2
            for key in row2[1:]:
                tlabels[key] = set()
        else:
            eg = {}
            for i, val in enumerate(row1):
                eg[keys[i]] = val
            for i, val in enumerate(row2):
                eg[keys[len(row1) + i]] = val
                if keys[len(row1) + i] in tlabels:
                    tlabels[keys[len(row1) + i]].add(val)
            test_data.append(eg)
        rnum += 1
        
    return train_data, test_data

In [23]:
def load_hate_speech_data(data_path):
    fdata = list(csv.reader(open(data_path + '/annotations_metadata.csv')))
    data = []
    for rnum, row in enumerate(fdata):
        if rnum > 0:
            eg = {}
            eg['id'] = row[0]
            eg['comment_text'] = open(data_path + '/all_files/' + row[0] + '.txt').read().strip()
            if row[4] == 'noHate':
                eg['hate'] = 0
            else:
                eg['hate'] = 1
            eg['num_contexts'] = row[3]
            data.append(eg)
    return data

In [24]:
def load_hate_speech_offensive_data(data_path):
    fdata = list(csv.reader(open(data_path + 'labeled_data.csv', 'r')))
    data = []
    for row in fdata[1:]:
        eg = {}
        eg['comment_text'] = row[-1]
        for j, ritem in enumerate(row[1:-1]):
            eg[fdata[0][j + 1]] = ritem
        data.append(eg)
    return data

In [25]:
def load_misogyny_data(data_path):
    fdata = list(csv.reader(open(data_path + '/en_AMI_TrainingSet_NEW.csv', 'r', errors='ignore')))
    data = []
    for row in fdata[1:]:
        eg = {}
        eg['comment_text'] = row[1]
        for j, ritem in enumerate(row):
            if j != 1:
                eg[fdata[0][j]] = ritem
        data.append(eg)
    return data

In [26]:
js_data = load_jigsaw_data(dpath + dset_names[0] + '/')

In [27]:
hs_data = load_hate_speech_data(dpath + dset_names[1] + '/')

In [28]:
hso_data = load_hate_speech_offensive_data(dpath + dset_names[2] + '/data/')

In [29]:
misogyny_data = load_misogyny_data(dpath + dset_names[3])

In [30]:
print(len(js_data[0]), len(js_data[1]), len(hs_data), len(hso_data), len(misogyny_data))

159571 153164 10944 24783 3251


In [51]:
print(js_data[0][0])

{'id': '0000997932d777bf', 'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", 'toxic': '0', 'severe_toxic': '0', 'obscene': '0', 'threat': '0', 'insult': '0', 'identity_hate': '0'}


In [53]:
x = 0
for eg in js_data[0]:
    if eg['misogyny'] == '1' and eg['identity_hate'] == '1':
        print(eg['comment_text'])
        x += 1
        if x == 30:
            break

KeyError: 'misogyny'

## BERT Embedding Computation

In [31]:
all_sents = []
for eg in js_data[0]:
    all_sents.append(eg['comment_text'])
for eg in js_data[1]:
    all_sents.append(eg['comment_text'])
for eg in hs_data:
    all_sents.append(eg['comment_text'])
for eg in hso_data:
    all_sents.append(eg['comment_text'])
for eg in misogyny_data:
    all_sents.append(eg['comment_text'])
print(len(all_sents))

351713


In [None]:
with open(dpath + 'sample_bert-base-uncased_embs.pkl', 'rb') as infile:
    bert_base_embs = pickle.load(infile)

print(bert_base_embs.shape)

In [39]:
cos_sim = nn.CosineSimilarity(dim=0)
def find_sent(emb):
    for i in range(bert_base_embs.size(0)):
        csim = cos_sim(emb, bert_base_embs[i, :])
        if 1 - csim <= 1e-08:
            print(i, all_sents[i])
            break

In [40]:
find_sent(bert_base_embs[7785, :])

7785 user:ihaveapickle  user:208.113.241.125  
I do infer that these are the same person, & that this person is a  vandal, & that this person is not demonstrating any capability other than  vandalism.

[[ hopiakuta  Please do  sign  your  signature  on your  message.  %7e%7e   Thank You. -]]


In [6]:
model_name = 'xlnet-large-cased'
tokenizer_class = XLNetTokenizer
tokenizer = tokenizer_class.from_pretrained(model_name)
with torch.cuda.device(0):
    with torch.no_grad():
        model = XLNetModel.from_pretrained(model_name,\
                                          output_hidden_states=False,\
                                          output_attentions=False)
        model.eval()
        model.to('cuda')

In [7]:
def create_bert_emb(all_sents):
    if len(all_sents) > 0:
        with torch.cuda.device(0):
            all_toks = tokenizer.batch_encode_plus(all_sents, padding=True,\
                                                   add_special_tokens=True)
            torch.cuda.empty_cache()
            tok_tensor = torch.tensor([l[:512] for l in all_toks['input_ids']]).to('cuda')
            with torch.no_grad():
                all_doc_tensor = model(tok_tensor)['last_hidden_state']
                all_doc_tensor.to('cpu')
            all_attn_mask = torch.tensor(all_toks['attention_mask'])
            ret_tensor = torch.FloatTensor(all_doc_tensor.size(0), all_doc_tensor.size(-1))
            for i in range(all_doc_tensor.size(0)):
                slen = torch.sum(all_attn_mask[i, :])
                ret_tensor[i, :] = torch.mean(all_doc_tensor[i, :slen, :], dim=0)
            del tok_tensor
            del all_doc_tensor
            del all_attn_mask
            torch.cuda.empty_cache()
            return ret_tensor

In [8]:
def create_batch_bert_embs(all_sents, save_path=None, bsz=50):
    b = 0
    e = bsz
    ret_vecs = []
    t1 = datetime.now()
    while b < len(all_sents):
        batch_sents = all_sents[b:e]
        out_tensor = create_bert_emb(batch_sents)
        ret_vecs.append(out_tensor)
        b += bsz
        e += bsz
        if b % 500 == 0:
            t2 = datetime.now()
            print(b, 'done', t2 - t1)
            if save_path:
                with open(save_path, 'wb') as outfile:
                    pickle.dump(ret_vecs, outfile)
    ret_vec = torch.cat(ret_vecs, dim=0)
    if save_path:
        with open(save_path, 'wb') as outfile:
            pickle.dump(ret_vec, outfile)
    return ret_vec

In [None]:
# Takes approx. 2 hr 40 mins to finish
sample_bert_embs = create_batch_bert_embs(all_sents, save_path=dpath + 'sample_' + model_name + '_embs.pkl')

In [None]:
rc = 0
with open(dpath + dset_names[0] + '/' + model_name + '_embs_train.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(js_data[0]), :], outfile)
rc += len(js_data[0])
with open(dpath + dset_names[0] + '/' + model_name + '_embs_test.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(js_data[1]), :], outfile)
rc += len(js_data[1])
with open(dpath + dset_names[1] + '/' + model_name + '_embs.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(hs_data), :], outfile)
rc += len(hs_data)
with open(dpath + dset_names[2] + '/' + model_name + '_embs.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(hso_data), :], outfile)
rc += len(hso_data)
with open(dpath + dset_names[0] + '/' + model_name + '_embs.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(misogyny_data)], outfile)

## Stereoset Embeddings

In [None]:
batch_number = 1
data_tuples = json.load(open(dpath + 'stereoset/simulated_data/blank_split_' + str(batch_number) + '.json', 'r'))

In [None]:
data_sents = []
for i, data_eg in enumerate(data_tuples):
    inp = data_eg['input']
    comp = data_eg['suggestion']
    data_sents.append(inp.strip() + ' ' + comp.strip())
print(len(data_sents))
data_bert_embs = create_batch_bert_embs(data_sents)    

In [None]:
with open(dpath + 'stereoset/simulated_data/split_' + str(batch_number) + '_' + model_name + '_data_embs.pkl', 'wb') as outfile:
    pickle.dump(data_bert_embs, outfile)

## Creating Batched Datasets

In [None]:
with open(dpath + 'sample_' + model_name + '_embs.pkl', 'rb') as infile:
    sample_bert_embs = pickle.load(infile)
print(sample_bert_embs.shape)

In [None]:
with open(dpath + 'stereoset/simulated_data/split_' + str(batch_number) + '_' + model_name + '_data_embs.pkl', 'rb') as infile:
    data_bert_embs = pickle.load(infile)
print(data_bert_embs.shape)

In [None]:
random.seed(4056)
train_x = []
train_y = []
dev_x = []
dev_y = []
test_x = []
test_y = []
for i, data_eg in enumerate(data_tuples):
    inp = data_eg['input']
    comp = data_eg['suggestion']
    label = data_eg['label']
    if label == 'stereotype':
        dy = torch.from_numpy(np.ones((1, 1), dtype=int))
    else:
        dy = torch.from_numpy(np.zeros((1, 1), dtype=int))

    dx = data_bert_embs[i, :].view(1, -1)
    toss = random.random()
    
    if toss <= 0.7:
        train_x.append(dx)
        train_y.append(dy)
    elif toss <= 0.85:
        dev_x.append(dx)
        dev_y.append(dy)
    else:
        test_x.append(dx)
        test_y.append(dy)

In [9]:
def batchify_data(data_x, data_y, task_type='classification', batch_size=64):
    b = 0
    e = batch_size
    data_batches = []
    y_dim = data_y[0].size(-1)
    while b < len(data_x):
        data_x[b:e]
        d_X = torch.cat(data_x[b:e], dim=0).float()
        if task_type == 'classification':
            d_Y = torch.cat(data_y[b:e], dim=0).view(-1).long()
        elif task_type == 'multilabel':
            d_Y = torch.cat(data_y[b:e], dim=0).view(-1, y_dim).float()
        data_batches.append((d_X, d_Y))
        b += batch_size
        e += batch_size
    return data_batches

In [None]:
ss_tr_batches = batchify_data(train_x, train_y)
ss_de_batches = batchify_data(dev_x, dev_y)
ss_te_batches = batchify_data(test_x, test_y)

### Generating multi-label data

In [None]:
i = 0
random.seed(4056)
dsets = {}

#jigsaw data
dset_name = dset_names[0]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for eg in js_data[0]:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1
    toss = random.random()
    if toss <= 0.8:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    else:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    i += 1

for eg in js_data[1]:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1        
    dsets[dset_name]['test'][0].append(dx)
    dsets[dset_name]['test'][1].append(dy)
    i += 1
    
    
#hate-speech data
dset_name = dset_names[1]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['hate']
for eg in hs_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1
    toss = random.random()
    if toss <= 0.7:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    else:
        dsets[dset_name]['test'][0].append(dx)
        dsets[dset_name]['test'][1].append(dy)
    i += 1
    
#hate-speech-and-offensive data
dset_name = dset_names[2]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['class']
for eg in hso_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = int(eg[label])
    toss = random.random()
    if toss <= 0.7:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    else:
        dsets[dset_name]['test'][0].append(dx)
        dsets[dset_name]['test'][1].append(dy) 
    i += 1

#misogyny data
dset_name = dset_names[3]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['misogynous']
for eg in misogyny_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1
    toss = random.random()
    if toss <= 0.7:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    else:
        dsets[dset_name]['test'][0].append(dx)
        dsets[dset_name]['test'][1].append(dy)
    i += 1

In [None]:
print(dsets.keys())

In [None]:
batched_dsets = {}
for label in dsets:
    if label == 'jigsaw-dataset':
        tr_batches = batchify_data(dsets[label]['train'][0], dsets[label]['train'][1], task_type='multilabel')
        de_batches = batchify_data(dsets[label]['dev'][0], dsets[label]['dev'][1], task_type='multilabel')
        te_batches = batchify_data(dsets[label]['test'][0], dsets[label]['test'][1], task_type='multilabel')
    else:
        tr_batches = batchify_data(dsets[label]['train'][0], dsets[label]['train'][1], task_type='classification')
        de_batches = batchify_data(dsets[label]['dev'][0], dsets[label]['dev'][1], task_type='classification')
        te_batches = batchify_data(dsets[label]['test'][0], dsets[label]['test'][1], task_type='classification')
    batched_dsets[label] = (tr_batches, de_batches, te_batches)
batched_dsets['stereotype'] = (ss_tr_batches, ss_de_batches, ss_te_batches)

In [None]:
with open(dpath + 'batched_dsets_multilabel_' + model_name + 'bsz64.pkl', 'wb') as outfile:
    pickle.dump(batched_dsets, outfile)

In [None]:
print(batched_dsets.keys())

### Generating binary-label data

In [None]:
i = 0
random.seed(4056)
dsets = {}

for eg in js_data[0]:
    dx = sample_bert_embs[i, :].view(1, -1)
    for label in eg:
        if label not in ['id', 'comment_text']:
            if label not in dsets:
                dsets[label] = {}
                dsets[label]['train'] = ([], [])
                dsets[label]['dev'] = ([], [])
                dsets[label]['test'] = ([], [])
            if eg[label] == 1:
                dy = torch.from_numpy(np.ones((1, 1), dtype=int))
            else:
                dy = torch.from_numpy(np.zeros((1, 1), dtype=int))
            toss = random.random()
            if toss <= 0.7:
                dsets[label]['train'][0].append(dx)
                dsets[label]['train'][1].append(dy)
            elif toss <= 0.85:
                dsets[label]['dev'][0].append(dx)
                dsets[label]['dev'][1].append(dy)
            else:
                dsets[label]['test'][0].append(dx)
                dsets[label]['test'][1].append(dy)
    i += 1

for eg in js_data[1]:
    i += 1
    
for eg in hs_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    for label in eg:
        if label not in ['id', 'comment_text', 'num_contexts']:
            if label not in dsets:
                dsets[label] = {}
                dsets[label]['train'] = ([], [])
                dsets[label]['dev'] = ([], [])
                dsets[label]['test'] = ([], [])
            if eg[label] == 1:
                dy = torch.from_numpy(np.ones((1, 1), dtype=int))
            else:
                dy = torch.from_numpy(np.zeros((1, 1), dtype=int))
            toss = random.random()
            if toss <= 0.7:
                dsets[label]['train'][0].append(dx)
                dsets[label]['train'][1].append(dy)
            elif toss <= 0.85:
                dsets[label]['dev'][0].append(dx)
                dsets[label]['dev'][1].append(dy)
            else:
                dsets[label]['test'][0].append(dx)
                dsets[label]['test'][1].append(dy)
    i += 1
    
label = 'hate_speech_offensive'
dsets[label] = {}
dsets[label]['train'] = ([], [])
dsets[label]['dev'] = ([], [])
dsets[label]['test'] = ([], [])
for eg in hso_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.ones((1, 1), dtype=int))
    dy[0, 0] = int(eg['class'])
    toss = random.random()
    if toss <= 0.7:
        dsets[label]['train'][0].append(dx)
        dsets[label]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[label]['dev'][0].append(dx)
        dsets[label]['dev'][1].append(dy)
    else:
        dsets[label]['test'][0].append(dx)
        dsets[label]['test'][1].append(dy)
        
    i += 1
    
label = 'misogyny'
dsets[label] = {}
dsets[label]['train'] = ([], [])
dsets[label]['dev'] = ([], [])
dsets[label]['test'] = ([], [])
for eg in misogyny_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.ones((1, 1), dtype=int))
    dy[0, 0] = int(eg['misogynous'])
    toss = random.random()
    if toss <= 0.7:
        dsets[label]['train'][0].append(dx)
        dsets[label]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[label]['dev'][0].append(dx)
        dsets[label]['dev'][1].append(dy)
    else:
        dsets[label]['test'][0].append(dx)
        dsets[label]['test'][1].append(dy)
        
    i += 1

In [None]:
batched_dsets = {}
for label in dsets:
    tr_batches = batchify_data(dsets[label]['train'][0], dsets[label]['train'][1])
    de_batches = batchify_data(dsets[label]['dev'][0], dsets[label]['dev'][1])
    te_batches = batchify_data(dsets[label]['test'][0], dsets[label]['test'][1])
    batched_dsets[label] = (tr_batches, de_batches, te_batches)
batched_dsets['stereotype'] = (ss_tr_batches, ss_de_batches, ss_te_batches)

In [None]:
with open(dpath + 'batched_dsets.pkl', 'wb') as outfile:
    pickle.dump(batched_dsets, outfile)

In [None]:
print(batched_dsets.keys())

### MTurk Annotated Data Processing and Batching

In [54]:
random.seed(4056)

In [55]:
f1 = list(csv.reader(open(dpath + 'mturk_annotation/mturk_batch1_output.csv', 'r')))
f2 = list(csv.reader(open(dpath + 'mturk_annotation/mturk_batch2_output.csv', 'r')))
f3 = list(csv.reader(open(dpath + 'mturk_annotation/mturk_batch3_output.csv', 'r')))
f4 = list(csv.reader(open(dpath + 'mturk_annotation/filtered_batch1_output.csv', 'r')))

In [56]:
print(f1[0])
print(f2[0])
print(f3[0])
print(f4[0])

['Text', 'Stereotype', 'Stereotype-Anns']
['S.No.', 'Input Text', 'Stereotype-Majority', 'Popularity-Majority', 'Fakeness-Majority', 'Stereotypical Association-Majority', 'Stereotype', 'Popularity', 'Fakeness', 'Stereotypical Association']
['Text', 'Intentional-Stereotype', 'Unintentional-Association', 'Stereotype-Anns', 'Association-Anns']
['Text', 'Intentional-Stereotype', 'Unintentional-Association', 'Stereotype-Anns', 'Association-Anns']


In [57]:
data = {}
for row in f1[1:]:
    data[row[0]] = (row[1], 'no')
for row in f2[1:]:
    data[row[1]] = (row[2], row[5])
for row in f3[1:]:
    data[row[0]] = (row[1], row[2])
for row in f4[1:]:
    data[row[0]] = (row[1], row[2])

In [14]:
filt_data = {}
for sent in data:
    if data[sent][0] not in ['yes', 'no'] or data[sent][1] not in ['yes', 'no']:
        pass
#         print(sent)
##  Uncomment to self-annotate examples that don't have agreement
#         a1 = data[sent][0]
#         a2 = data[sent][1]
#         if data[sent][0] not in ['yes', 'no']:
#             a1 = input('Explicit?: ')
#         if data[sent][1] not in ['yes', 'no']:
#             a2 = input('Implicit?: ')
#         filt_data[sent] = (a1, a2)
    else:
        filt_data[sent] = data[sent]

In [60]:
# print(len(filt_data))
s = 0
u = 0
n = 0
i = 1
f1 = open('annotated_data.csv', 'w')
fw = csv.writer(f1)
fw.writerow(['S. No.', 'Text', 'Explicit Stereotype?', 'Implicit Stereotypical Association?'])
for sent in filt_data:
    if filt_data[sent][0] == 'yes':
        s += 1
    if filt_data[sent][1] == 'yes':
        u += 1
    if filt_data[sent][0] == 'no' and filt_data[sent][1] == 'no':
        n += 1
    fw.writerow([i, sent, filt_data[sent][0], filt_data[sent][1]])
    i += 1
f1.close()
print(s, u, n)

750 282 1197


In [16]:
#run the model loading and function definition files from 'BERT Embedding Computation' section before running this cell
all_sents = []
for sent in filt_data:
    all_sents.append(sent)
#     print(sent)
all_embs = create_batch_bert_embs(all_sents)

500 done 0:00:02.178143
1000 done 0:00:04.172780
1500 done 0:00:06.507899
2000 done 0:00:08.773296


In [17]:
#stereotype-gold data binary classification task data tensor generation
mturk_dsets = {}
dset_name = 'stereotype-gold-binary'
mturk_dsets[dset_name] = {}
mturk_dsets[dset_name]['train'] = ([], [])
mturk_dsets[dset_name]['dev'] = ([], [])
mturk_dsets[dset_name]['test'] = ([], [])

i = 0
for eg in all_sents:
    dx = all_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, 1), dtype=int))
                          
    if filt_data[eg][0] == 'yes' or filt_data[eg][1] == 'yes':
        dy[0, 0] = 1
                        
    toss = random.random()
    if toss <= 0.7:
        mturk_dsets[dset_name]['train'][0].append(dx)
        mturk_dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        mturk_dsets[dset_name]['dev'][0].append(dx)
        mturk_dsets[dset_name]['dev'][1].append(dy)
    else:
        mturk_dsets[dset_name]['test'][0].append(dx)
        mturk_dsets[dset_name]['test'][1].append(dy)
    i += 1

In [18]:
#stereotype-gold data multilabel task data tensor generation
dset_name = 'stereotype-gold-multilabel'
mturk_dsets[dset_name] = {}
mturk_dsets[dset_name]['train'] = ([], [])
mturk_dsets[dset_name]['dev'] = ([], [])
mturk_dsets[dset_name]['test'] = ([], [])

i = 0
for eg in all_sents:
    dx = all_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, 2), dtype=int))
                          
    if filt_data[eg][0] == 'yes':
        dy[0, 0] = 1
    if filt_data[eg][1] == 'yes':
        dy[0, 1] = 1
                        
    toss = random.random()
    if toss <= 0.7:
        mturk_dsets[dset_name]['train'][0].append(dx)
        mturk_dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        mturk_dsets[dset_name]['dev'][0].append(dx)
        mturk_dsets[dset_name]['dev'][1].append(dy)
    else:
        mturk_dsets[dset_name]['test'][0].append(dx)
        mturk_dsets[dset_name]['test'][1].append(dy)
    i += 1

In [19]:
#Run batchify_data() from 'Create Batched Datasets' section before running this cell
mturk_batched_dsets = {}
for label in mturk_dsets:
    if label.endswith('multilabel'):
        tr_batches = batchify_data(mturk_dsets[label]['train'][0], mturk_dsets[label]['train'][1], task_type='multilabel')
        de_batches = batchify_data(mturk_dsets[label]['dev'][0], mturk_dsets[label]['dev'][1], task_type='multilabel')
        te_batches = batchify_data(mturk_dsets[label]['test'][0], mturk_dsets[label]['test'][1], task_type='multilabel')
    else:
        tr_batches = batchify_data(mturk_dsets[label]['train'][0], mturk_dsets[label]['train'][1], task_type='classification')
        de_batches = batchify_data(mturk_dsets[label]['dev'][0], mturk_dsets[label]['dev'][1], task_type='classification')
        te_batches = batchify_data(mturk_dsets[label]['test'][0], mturk_dsets[label]['test'][1], task_type='classification')
    mturk_batched_dsets[label] = (tr_batches, de_batches, te_batches)

In [20]:
with open(dpath + 'mturk_batched_dsets_multilabel_' + model_name + '_bsz64.pkl', 'wb') as outfile:
    pickle.dump(mturk_batched_dsets, outfile)