In [1]:
import csv
import json
import pickle
import h5py
import requests
import os
from datetime import datetime
import random
import numpy as np

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import *

In [3]:
dpath = '/scratch1/rpujari/gcr_workspace/data/'

## Loading Datasets

In [4]:
dset_names = ['jigsaw-dataset', 'hate-speech-dataset', 'hate-speech-and-offensive-language', 'ami-ibereval-dataset']

In [5]:
def load_jigsaw_data(data_path):
    train_file = list(csv.reader(open(data_path + '/train.csv')))

    train_data = []
    labels = {}

    for rnum, row in enumerate(train_file):
        if rnum == 0:
            keys = row
            for key in keys[2:]:
                labels[key] = set()
        else:
            eg = {}
            for i, val in enumerate(row):
                eg[keys[i]] = val
                if keys[i] in labels:
                    labels[keys[i]].add(val)
            train_data.append(eg)


    test_file = list(csv.reader(open(data_path + '/test.csv')))
    test_labels = list(csv.reader(open(data_path + '/test_labels.csv')))

    rnum = 0
    test_data = []
    tlabels = {}
    for row1, row2 in zip(test_file, test_labels):
        if rnum == 0:
            keys = row1 + row2
            for key in row2[1:]:
                tlabels[key] = set()
        else:
            eg = {}
            for i, val in enumerate(row1):
                eg[keys[i]] = val
            for i, val in enumerate(row2):
                eg[keys[len(row1) + i]] = val
                if keys[len(row1) + i] in tlabels:
                    tlabels[keys[len(row1) + i]].add(val)
            test_data.append(eg)
        rnum += 1
        
    return train_data, test_data

In [6]:
def load_hate_speech_data(data_path):
    fdata = list(csv.reader(open(data_path + '/annotations_metadata.csv')))
    data = []
    for rnum, row in enumerate(fdata):
        if rnum > 0:
            eg = {}
            eg['id'] = row[0]
            eg['comment_text'] = open(data_path + '/all_files/' + row[0] + '.txt').read().strip()
            if row[4] == 'noHate':
                eg['hate'] = 0
            else:
                eg['hate'] = 1
            eg['num_contexts'] = row[3]
            data.append(eg)
    return data

In [7]:
def load_hate_speech_offensive_data(data_path):
    fdata = list(csv.reader(open(data_path + 'labeled_data.csv', 'r')))
    data = []
    for row in fdata[1:]:
        eg = {}
        eg['comment_text'] = row[-1]
        for j, ritem in enumerate(row[1:-1]):
            eg[fdata[0][j + 1]] = ritem
        data.append(eg)
    return data

In [8]:
def load_misogyny_data(data_path):
    fdata = list(csv.reader(open(data_path + '/en_AMI_TrainingSet_NEW.csv', 'r', errors='ignore')))
    data = []
    for row in fdata[1:]:
        eg = {}
        eg['comment_text'] = row[1]
        for j, ritem in enumerate(row):
            if j != 1:
                eg[fdata[0][j]] = ritem
        data.append(eg)
    return data

In [9]:
js_data = load_jigsaw_data(dpath + dset_names[0] + '/')

In [10]:
hs_data = load_hate_speech_data(dpath + dset_names[1] + '/')

In [11]:
hso_data = load_hate_speech_offensive_data(dpath + dset_names[2] + '/data/')

In [12]:
misogyny_data = load_misogyny_data(dpath + dset_names[3])

In [13]:
print(len(js_data[0]), len(js_data[1]), len(hs_data), len(hso_data), len(misogyny_data))

159571 153164 10944 24783 3251


## BERT Embedding Computation

In [14]:
all_sents = []
for eg in js_data[0]:
    all_sents.append(eg['comment_text'])
for eg in js_data[1]:
    all_sents.append(eg['comment_text'])
for eg in hs_data:
    all_sents.append(eg['comment_text'])
for eg in hso_data:
    all_sents.append(eg['comment_text'])
for eg in misogyny_data:
    all_sents.append(eg['comment_text'])
print(len(all_sents))
cuda_device = 1

351713


In [15]:
model_name = 'albert-large-v2'

In [16]:
tokenizer_class = AlbertTokenizer
tokenizer = tokenizer_class.from_pretrained(model_name)
with torch.cuda.device(cuda_device):
    with torch.no_grad():
        model = AlbertModel.from_pretrained(model_name,\
                                          output_hidden_states=False,\
                                          output_attentions=False)
        model.eval()
        model.to('cuda')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=685.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=71509304.0, style=ProgressStyle(descrip…




In [17]:
def create_bert_emb(all_sents):
    if len(all_sents) > 0:
        with torch.cuda.device(cuda_device):
            all_toks = tokenizer.batch_encode_plus(all_sents, padding=True,\
                                                   add_special_tokens=True)
            torch.cuda.empty_cache()
            tok_tensor = torch.tensor([l[:512] for l in all_toks['input_ids']]).to('cuda')
            with torch.no_grad():
                all_doc_tensor = model(tok_tensor)['last_hidden_state']
                all_doc_tensor.to('cpu')
            all_attn_mask = torch.tensor(all_toks['attention_mask'])
            ret_tensor = torch.FloatTensor(all_doc_tensor.size(0), all_doc_tensor.size(-1))
            for i in range(all_doc_tensor.size(0)):
                slen = torch.sum(all_attn_mask[i, :])
                ret_tensor[i, :] = torch.mean(all_doc_tensor[i, :slen, :], dim=0)
            del tok_tensor
            del all_doc_tensor
            del all_attn_mask
            torch.cuda.empty_cache()
            return ret_tensor

In [18]:
def create_batch_bert_embs(all_sents, save_path=None, bsz=50):
    b = 0
    e = bsz
    ret_vecs = []
    t1 = datetime.now()
    while b < len(all_sents):
        batch_sents = all_sents[b:e]
        out_tensor = create_bert_emb(batch_sents)
        ret_vecs.append(out_tensor)
        b += bsz
        e += bsz
        if b % 500 == 0:
            t2 = datetime.now()
            print(b, 'done', t2 - t1)
            if save_path:
                with open(save_path, 'wb') as outfile:
                    pickle.dump(ret_vecs, outfile)
    ret_vec = torch.cat(ret_vecs, dim=0)
    if save_path:
        with open(save_path, 'wb') as outfile:
            pickle.dump(ret_vec, outfile)
    return ret_vec

In [19]:
# Takes approx. 2 hr 40 mins to finish
sample_bert_embs = create_batch_bert_embs(all_sents, save_path=dpath + 'sample_' + model_name + '_embs.pkl')

Token indices sequence length is longer than the specified maximum sequence length for this model (649 > 512). Running this sequence through the model will result in indexing errors


500 done 0:00:29.646505
1000 done 0:00:58.043672
1500 done 0:01:27.255643
2000 done 0:01:53.151553
2500 done 0:02:20.834842
3000 done 0:02:49.906549
3500 done 0:03:20.720928
4000 done 0:03:50.571965
4500 done 0:04:19.020044
5000 done 0:04:46.499958
5500 done 0:05:16.634123
6000 done 0:05:46.029051
6500 done 0:06:15.236323
7000 done 0:06:42.679575
7500 done 0:07:09.117063
8000 done 0:07:35.724320
8500 done 0:08:04.641101
9000 done 0:08:36.027787
9500 done 0:09:04.087689
10000 done 0:09:30.402630
10500 done 0:10:00.878675
11000 done 0:10:30.067722
11500 done 0:11:00.420458
12000 done 0:11:30.207525
12500 done 0:11:58.373238
13000 done 0:12:24.724016
13500 done 0:12:55.205466
14000 done 0:13:23.914582
14500 done 0:13:53.917916
15000 done 0:14:21.387391
15500 done 0:14:48.566054
16000 done 0:15:15.722616
16500 done 0:15:42.691489
17000 done 0:16:13.923926
17500 done 0:16:44.022109
18000 done 0:17:12.897033
18500 done 0:17:41.882949
19000 done 0:18:13.203501
19500 done 0:18:40.677639
20000 

158000 done 2:51:12.006482
158500 done 2:51:45.824028
159000 done 2:52:16.945337
159500 done 2:52:50.671718
160000 done 2:53:24.791908
160500 done 2:53:57.950720
161000 done 2:54:31.358849
161500 done 2:55:05.564692
162000 done 2:55:39.588681
162500 done 2:56:11.491600
163000 done 2:56:44.014347
163500 done 2:57:17.730376
164000 done 2:57:50.439243
164500 done 2:58:25.262388
165000 done 2:58:57.775578
165500 done 2:59:31.180490
166000 done 3:00:03.603777
166500 done 3:00:38.449608
167000 done 3:01:13.228034
167500 done 3:01:44.142932
168000 done 3:02:17.621151
168500 done 3:02:51.823812
169000 done 3:03:21.283509
169500 done 3:03:55.437545
170000 done 3:04:25.560633
170500 done 3:04:57.791950
171000 done 3:05:30.863844
171500 done 3:06:00.540850
172000 done 3:06:34.814107
172500 done 3:07:08.697999
173000 done 3:07:45.469327
173500 done 3:08:23.886130
174000 done 3:08:56.615415
174500 done 3:09:28.599525
175000 done 3:09:58.772095
175500 done 3:10:38.419765
176000 done 3:11:12.202746
1

310000 done 6:10:04.794856
310500 done 6:10:49.346114
311000 done 6:11:32.873427
311500 done 6:12:18.838272
312000 done 6:13:04.364712
312500 done 6:13:48.624563
313000 done 6:14:21.415955
313500 done 6:14:41.727619
314000 done 6:15:00.328953
314500 done 6:15:19.006661
315000 done 6:15:36.015184
315500 done 6:15:54.555331
316000 done 6:16:14.960893
316500 done 6:16:34.853175
317000 done 6:16:53.474682
317500 done 6:17:11.083887
318000 done 6:17:29.469607
318500 done 6:17:51.233045
319000 done 6:18:09.342449
319500 done 6:18:29.692998
320000 done 6:18:53.197682
320500 done 6:19:11.908469
321000 done 6:19:31.087894
321500 done 6:19:50.760850
322000 done 6:20:13.730407
322500 done 6:20:35.050396
323000 done 6:20:55.345384
323500 done 6:21:14.056157
324000 done 6:21:33.505056
324500 done 6:21:51.608768
325000 done 6:22:11.125191
325500 done 6:22:32.981068
326000 done 6:22:51.810141
326500 done 6:23:08.967608
327000 done 6:23:25.825994
327500 done 6:23:42.793252
328000 done 6:23:59.742581
3

In [20]:
rc = 0
with open(dpath + dset_names[0] + '/' + model_name + '_embs_train.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(js_data[0]), :], outfile)
rc += len(js_data[0])
with open(dpath + dset_names[0] + '/' + model_name + '_embs_test.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(js_data[1]), :], outfile)
rc += len(js_data[1])
with open(dpath + dset_names[1] + '/' + model_name + '_embs.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(hs_data), :], outfile)
rc += len(hs_data)
with open(dpath + dset_names[2] + '/' + model_name + '_embs.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_embs[rc:rc+len(hso_data), :], outfile)
rc += len(hso_data)
with open(dpath + dset_names[0] + '/' + model_name + '_embs.pkl', 'wb') as outfile:
    pickle.dump(sample_bert_vecs[rc:rc+len(misogyny_data)], outfile)

NameError: name 'sample_bert_vecs' is not defined

## Stereoset Embeddings

In [None]:
batch_number = 1
data_tuples = json.load(open(dpath + 'stereoset/simulated_data/blank_split_' + str(batch_number) + '.json', 'r'))

In [None]:
data_sents = []
for i, data_eg in enumerate(data_tuples):
    inp = data_eg['input']
    comp = data_eg['suggestion']
    data_sents.append(inp.strip() + ' ' + comp.strip())
print(len(data_sents))
data_bert_embs = create_batch_bert_embs(data_sents)    

In [None]:
with open(dpath + 'stereoset/simulated_data/split_' + str(batch_number) + '_' + model_name + '_data_embs.pkl', 'wb') as outfile:
    pickle.dump(data_bert_embs, outfile)

## Creating Batched Datasets

In [None]:
with open(dpath + 'sample_' + model_name + '_embs.pkl', 'rb') as infile:
    sample_bert_embs = pickle.load(infile)
print(sample_bert_embs.shape)

In [None]:
with open(dpath + 'stereoset/simulated_data/split_' + str(batch_number) + '_' + model_name + '_data_embs.pkl', 'rb') as infile:
    data_bert_embs = pickle.load(infile)
print(data_bert_embs.shape)

In [None]:
random.seed(4056)
train_x = []
train_y = []
dev_x = []
dev_y = []
test_x = []
test_y = []
for i, data_eg in enumerate(data_tuples):
    inp = data_eg['input']
    comp = data_eg['suggestion']
    label = data_eg['label']
    if label == 'stereotype':
        dy = torch.from_numpy(np.ones((1, 1), dtype=int))
    else:
        dy = torch.from_numpy(np.zeros((1, 1), dtype=int))

    dx = data_bert_embs[i, :].view(1, -1)
    toss = random.random()
    
    if toss <= 0.7:
        train_x.append(dx)
        train_y.append(dy)
    elif toss <= 0.85:
        dev_x.append(dx)
        dev_y.append(dy)
    else:
        test_x.append(dx)
        test_y.append(dy)

In [None]:
def batchify_data(data_x, data_y, task_type='classification', batch_size=64):
    b = 0
    e = batch_size
    data_batches = []
    y_dim = data_y[0].size(-1)
    while b < len(data_x):
        data_x[b:e]
        d_X = torch.cat(data_x[b:e], dim=0).float()
        if task_type == 'classification':
            d_Y = torch.cat(data_y[b:e], dim=0).view(-1).long()
        elif task_type == 'multilabel':
            d_Y = torch.cat(data_y[b:e], dim=0).view(-1, y_dim).float()
        data_batches.append((d_X, d_Y))
        b += batch_size
        e += batch_size
    return data_batches

In [None]:
ss_tr_batches = batchify_data(train_x, train_y)
ss_de_batches = batchify_data(dev_x, dev_y)
ss_te_batches = batchify_data(test_x, test_y)

### Generating multi-label data

In [None]:
i = 0
random.seed(4056)
dsets = {}

#jigsaw data
dset_name = dset_names[0]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

for eg in js_data[0]:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1
    toss = random.random()
    if toss <= 0.8:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    else:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    i += 1

for eg in js_data[1]:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1        
    dsets[dset_name]['test'][0].append(dx)
    dsets[dset_name]['test'][1].append(dy)
    i += 1
    
    
#hate-speech data
dset_name = dset_names[1]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['hate']
for eg in hs_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1
    toss = random.random()
    if toss <= 0.7:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    else:
        dsets[dset_name]['test'][0].append(dx)
        dsets[dset_name]['test'][1].append(dy)
    i += 1
    
#hate-speech-and-offensive data
dset_name = dset_names[2]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['class']
for eg in hso_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = int(eg[label])
    toss = random.random()
    if toss <= 0.7:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    else:
        dsets[dset_name]['test'][0].append(dx)
        dsets[dset_name]['test'][1].append(dy) 
    i += 1

#misogyny data
dset_name = dset_names[3]
dsets[dset_name] = {}
dsets[dset_name]['train'] = ([], [])
dsets[dset_name]['dev'] = ([], [])
dsets[dset_name]['test'] = ([], [])
dset_labels = ['misogynous']
for eg in misogyny_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, len(dset_labels)), dtype=int))
    for lnum, label in enumerate(dset_labels):
        if int(eg[label]) > 0:
            dy[0, lnum] = 1
    toss = random.random()
    if toss <= 0.7:
        dsets[dset_name]['train'][0].append(dx)
        dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[dset_name]['dev'][0].append(dx)
        dsets[dset_name]['dev'][1].append(dy)
    else:
        dsets[dset_name]['test'][0].append(dx)
        dsets[dset_name]['test'][1].append(dy)
    i += 1

In [None]:
print(dsets.keys())

In [None]:
batched_dsets = {}
for label in dsets:
    if label == 'jigsaw-dataset':
        tr_batches = batchify_data(dsets[label]['train'][0], dsets[label]['train'][1], task_type='multilabel')
        de_batches = batchify_data(dsets[label]['dev'][0], dsets[label]['dev'][1], task_type='multilabel')
        te_batches = batchify_data(dsets[label]['test'][0], dsets[label]['test'][1], task_type='multilabel')
    else:
        tr_batches = batchify_data(dsets[label]['train'][0], dsets[label]['train'][1], task_type='classification')
        de_batches = batchify_data(dsets[label]['dev'][0], dsets[label]['dev'][1], task_type='classification')
        te_batches = batchify_data(dsets[label]['test'][0], dsets[label]['test'][1], task_type='classification')
    batched_dsets[label] = (tr_batches, de_batches, te_batches)
batched_dsets['stereotype'] = (ss_tr_batches, ss_de_batches, ss_te_batches)

In [None]:
with open(dpath + 'batched_dsets_multilabel_' + model_name + 'bsz64.pkl', 'wb') as outfile:
    pickle.dump(batched_dsets, outfile)

In [None]:
print(batched_dsets.keys())

### Generating binary-label data

In [None]:
i = 0
random.seed(4056)
dsets = {}

for eg in js_data[0]:
    dx = sample_bert_embs[i, :].view(1, -1)
    for label in eg:
        if label not in ['id', 'comment_text']:
            if label not in dsets:
                dsets[label] = {}
                dsets[label]['train'] = ([], [])
                dsets[label]['dev'] = ([], [])
                dsets[label]['test'] = ([], [])
            if eg[label] == 1:
                dy = torch.from_numpy(np.ones((1, 1), dtype=int))
            else:
                dy = torch.from_numpy(np.zeros((1, 1), dtype=int))
            toss = random.random()
            if toss <= 0.7:
                dsets[label]['train'][0].append(dx)
                dsets[label]['train'][1].append(dy)
            elif toss <= 0.85:
                dsets[label]['dev'][0].append(dx)
                dsets[label]['dev'][1].append(dy)
            else:
                dsets[label]['test'][0].append(dx)
                dsets[label]['test'][1].append(dy)
    i += 1

for eg in js_data[1]:
    i += 1
    
for eg in hs_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    for label in eg:
        if label not in ['id', 'comment_text', 'num_contexts']:
            if label not in dsets:
                dsets[label] = {}
                dsets[label]['train'] = ([], [])
                dsets[label]['dev'] = ([], [])
                dsets[label]['test'] = ([], [])
            if eg[label] == 1:
                dy = torch.from_numpy(np.ones((1, 1), dtype=int))
            else:
                dy = torch.from_numpy(np.zeros((1, 1), dtype=int))
            toss = random.random()
            if toss <= 0.7:
                dsets[label]['train'][0].append(dx)
                dsets[label]['train'][1].append(dy)
            elif toss <= 0.85:
                dsets[label]['dev'][0].append(dx)
                dsets[label]['dev'][1].append(dy)
            else:
                dsets[label]['test'][0].append(dx)
                dsets[label]['test'][1].append(dy)
    i += 1
    
label = 'hate_speech_offensive'
dsets[label] = {}
dsets[label]['train'] = ([], [])
dsets[label]['dev'] = ([], [])
dsets[label]['test'] = ([], [])
for eg in hso_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.ones((1, 1), dtype=int))
    dy[0, 0] = int(eg['class'])
    toss = random.random()
    if toss <= 0.7:
        dsets[label]['train'][0].append(dx)
        dsets[label]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[label]['dev'][0].append(dx)
        dsets[label]['dev'][1].append(dy)
    else:
        dsets[label]['test'][0].append(dx)
        dsets[label]['test'][1].append(dy)
        
    i += 1
    
label = 'misogyny'
dsets[label] = {}
dsets[label]['train'] = ([], [])
dsets[label]['dev'] = ([], [])
dsets[label]['test'] = ([], [])
for eg in misogyny_data:
    dx = sample_bert_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.ones((1, 1), dtype=int))
    dy[0, 0] = int(eg['misogynous'])
    toss = random.random()
    if toss <= 0.7:
        dsets[label]['train'][0].append(dx)
        dsets[label]['train'][1].append(dy)
    elif toss <= 0.85:
        dsets[label]['dev'][0].append(dx)
        dsets[label]['dev'][1].append(dy)
    else:
        dsets[label]['test'][0].append(dx)
        dsets[label]['test'][1].append(dy)
        
    i += 1

In [None]:
batched_dsets = {}
for label in dsets:
    tr_batches = batchify_data(dsets[label]['train'][0], dsets[label]['train'][1])
    de_batches = batchify_data(dsets[label]['dev'][0], dsets[label]['dev'][1])
    te_batches = batchify_data(dsets[label]['test'][0], dsets[label]['test'][1])
    batched_dsets[label] = (tr_batches, de_batches, te_batches)
batched_dsets['stereotype'] = (ss_tr_batches, ss_de_batches, ss_te_batches)

In [None]:
with open(dpath + 'batched_dsets.pkl', 'wb') as outfile:
    pickle.dump(batched_dsets, outfile)

In [None]:
print(batched_dsets.keys())

### MTurk Annotated Data Processing and Batching

In [None]:
f1 = list(csv.reader(open(dpath + 'mturk_annotation/mturk_batch1_output.csv', 'r')))
f2 = list(csv.reader(open(dpath + 'mturk_annotation/mturk_batch2_output.csv', 'r')))
f3 = list(csv.reader(open(dpath + 'mturk_annotation/mturk_batch3_output.csv', 'r')))
f4 = list(csv.reader(open(dpath + 'mturk_annotation/filtered_batch1_output.csv', 'r')))

In [None]:
print(f1[0])
print(f2[0])
print(f3[0])
print(f4[0])

In [None]:
data = {}
for row in f1[1:]:
    data[row[0]] = (row[1], 'no')
for row in f2[1:]:
    data[row[1]] = (row[2], row[5])
for row in f3[1:]:
    data[row[0]] = (row[1], row[2])
for row in f4[1:]:
    data[row[0]] = (row[1], row[2])

In [None]:
filt_data = {}
for sent in data:
    if data[sent][0] not in ['yes', 'no'] or data[sent][1] not in ['yes', 'no']:
        pass
#         print(sent)
##  Uncomment to self-annotate examples that don't have agreement
#         a1 = data[sent][0]
#         a2 = data[sent][1]
#         if data[sent][0] not in ['yes', 'no']:
#             a1 = input('Explicit?: ')
#         if data[sent][1] not in ['yes', 'no']:
#             a2 = input('Implicit?: ')
#         filt_data[sent] = (a1, a2)
    else:
        filt_data[sent] = data[sent]

In [None]:
print(len(filt_data))
s = 0
u = 0
n = 0
for sent in filt_data:
    if filt_data[sent][0] == 'yes':
        s += 1
    if filt_data[sent][1] == 'yes':
        u += 1
    if filt_data[sent][0] == 'no' and filt_data[sent][1] == 'no':
        n += 1
print(s, u, n)

In [None]:
#run the model loading and function definition files from 'BERT Embedding Computation' section before running this cell
all_sents = []
for sent in filt_data:
    all_sents.append(sent)
all_embs = create_batch_bert_embs(all_sents)

In [None]:
#stereotype-gold data binary classification task data tensor generation
mturk_dsets = {}
dset_name = 'stereotype-gold-binary'
mturk_dsets[dset_name] = {}
mturk_dsets[dset_name]['train'] = ([], [])
mturk_dsets[dset_name]['dev'] = ([], [])
mturk_dsets[dset_name]['test'] = ([], [])

i = 0
for eg in all_sents:
    dx = all_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, 1), dtype=int))
                          
    if filt_data[eg][0] == 'yes' or filt_data[eg][1] == 'yes':
        dy[0, 0] = 1
                        
    toss = random.random()
    if toss <= 0.7:
        mturk_dsets[dset_name]['train'][0].append(dx)
        mturk_dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        mturk_dsets[dset_name]['dev'][0].append(dx)
        mturk_dsets[dset_name]['dev'][1].append(dy)
    else:
        mturk_dsets[dset_name]['test'][0].append(dx)
        mturk_dsets[dset_name]['test'][1].append(dy)
    i += 1

In [None]:
#stereotype-gold data multilabel task data tensor generation
dset_name = 'stereotype-gold-multilabel'
mturk_dsets[dset_name] = {}
mturk_dsets[dset_name]['train'] = ([], [])
mturk_dsets[dset_name]['dev'] = ([], [])
mturk_dsets[dset_name]['test'] = ([], [])

i = 0
for eg in all_sents:
    dx = all_embs[i, :].view(1, -1)
    dy = torch.from_numpy(np.zeros((1, 2), dtype=int))
                          
    if filt_data[eg][0] == 'yes':
        dy[0, 0] = 1
    if filt_data[eg][1] == 'yes':
        dy[0, 1] = 1
                        
    toss = random.random()
    if toss <= 0.7:
        mturk_dsets[dset_name]['train'][0].append(dx)
        mturk_dsets[dset_name]['train'][1].append(dy)
    elif toss <= 0.85:
        mturk_dsets[dset_name]['dev'][0].append(dx)
        mturk_dsets[dset_name]['dev'][1].append(dy)
    else:
        mturk_dsets[dset_name]['test'][0].append(dx)
        mturk_dsets[dset_name]['test'][1].append(dy)
    i += 1

In [None]:
#Run batchify_data() from 'Create Batched Datasets' section before running this cell
mturk_batched_dsets = {}
for label in mturk_dsets:
    if label.endswith('multilabel'):
        tr_batches = batchify_data(mturk_dsets[label]['train'][0], mturk_dsets[label]['train'][1], task_type='multilabel')
        de_batches = batchify_data(mturk_dsets[label]['dev'][0], mturk_dsets[label]['dev'][1], task_type='multilabel')
        te_batches = batchify_data(mturk_dsets[label]['test'][0], mturk_dsets[label]['test'][1], task_type='multilabel')
    else:
        tr_batches = batchify_data(mturk_dsets[label]['train'][0], mturk_dsets[label]['train'][1], task_type='classification')
        de_batches = batchify_data(mturk_dsets[label]['dev'][0], mturk_dsets[label]['dev'][1], task_type='classification')
        te_batches = batchify_data(mturk_dsets[label]['test'][0], mturk_dsets[label]['test'][1], task_type='classification')
    mturk_batched_dsets[label] = (tr_batches, de_batches, te_batches)

In [None]:
with open(dpath + 'mturk_batched_dsets_multilabel_' + model_name + '_bsz64.pkl', 'wb') as outfile:
    pickle.dump(mturk_batched_dsets, outfile)