Experiment: Word2Vec experiment 
------
Concatenates together the last four attention distributions, using only one attention head.

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's source files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.append('../..')
import os
import torch
from bias_classification import prepare_model 
from src.utils import *
from src.experiment import AttentionExperiment, ClassificationExperiment
from models.shallow_nn import ShallowClassifier


from nltk import word_tokenize
from tqdm import tqdm
import torch

#os.chdir("/Users/sabrieyuboglu/Documents/sabri/school/cs_224u/attention_analysis")

In [5]:
# Directory of this experiment; experiment_params.json is read from here.
experiment_dir = "../../../experiments/bias_classification/word2vec"

In [8]:
# Load the experiment parameters from experiment_dir, then build the dataset
# from the intermediary-task section of those parameters.
params = prepare_model.intialize_params(os.path.join(experiment_dir, "experiment_params.json"))
dataset = prepare_model.initialize_dataset(params.intermediary_task)

06/05/2019 16:46:54 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
440it [00:00, 7635.28it/s]


In [6]:
# Run to display the key names currently stored in the dataset; keys added
# later (e.g. "glove_embed") only show up if this cell is re-run afterwards.
dataset.get_key_names()

dict_keys(['pre_ids', 'masks', 'pre_lens', 'post_in_ids', 'post_out_ids', 'pre_tok_label_ids', 'post_tok_label_ids', 'rel_ids', 'pos_ids', 'categories', 'index', 'bias_label'])

In [56]:
# Location of the target data, read from the intermediary task's parameters.
path_label_data = params.intermediary_task['task_specific_params']['target_data']
# Presumably the BERT tokens of every sample; add_bias_embeddings indexes this
# by each entry's "index" value -- TODO confirm the indexing matches.
all_bert_toks = prepare_model.get_sample_toks(path_label_data)

440it [00:00, 79743.07it/s]


In [11]:
# Load the glove.6B.100d vectors once into a token -> vector mapping; the
# add_*_embeddings functions below read this global rather than re-loading
# the file from their `path` argument.
glove = glove2dict("../../../tasks/bias_classification/data/word_vectors/glove.6B.100d.txt")

In [60]:
def bert_to_word_tokens(tokens):
    """
    Merges BERT WordPiece tokens back into whole-word tokens.

    A token starting with "##" is a continuation piece: it is appended, without
    the leading "##" marker, to the word currently being built. Any other token
    starts a new word.
    @param tokens (list[str]) WordPiece tokens
    @returns word_tokens (list[str]) the merged whole-word tokens
    @returns bert_to_word (dict[int, int]) maps each index in `tokens` to the
        index in `word_tokens` of the word that contains that token
    """
    bert_to_word = {}
    word_tokens = []
    for bert_token_idx, bert_token in enumerate(tokens):
        if bert_token.startswith("##") and word_tokens:
            # Strip only the leading marker; replace("##", "") would also
            # delete any "##" occurring later inside the piece.
            word_tokens[-1] += bert_token[2:]
        else:
            # Start a new word. A continuation piece with no preceding word is
            # kept verbatim as its own word instead of raising IndexError.
            word_tokens.append(bert_token)
        # Record the mapping after the merge/append so a continuation piece
        # points at the word it was merged into, not at the next word's slot.
        bert_to_word[bert_token_idx] = len(word_tokens) - 1

    return word_tokens, bert_to_word

In [78]:
def add_bias_embeddings(dataset, path, max_len=80):
    """
    Adds one GloVe embedding per dataset entry under the key "glove_embed".

    For each entry, the first token labelled 1 in 'pre_tok_label_ids' is mapped
    to the whole word containing it, and that word's vector is looked up.
    Relies on the notebook globals `glove` (token -> vector mapping) and
    `all_bert_toks` (indexed by each entry's "index" value).
    @param dataset iterable of entries; extended through dataset.add_data
    @param path (str) unused: the vectors are taken from the global `glove`
        instead of being re-read from this file
    @param max_len (int) unused
    @raises ValueError if an entry has no token labelled 1
    """
    token2glove = glove  # glove2dict(path)
    d = len(next(iter(token2glove.values())))
    embeddings = []
    for entry in tqdm(dataset):
        idx = int(entry["index"])
        word_tokens, bert_to_word = bert_to_word_tokens(all_bert_toks[idx])

        label_ids = entry['pre_tok_label_ids'].to(dtype=torch.int).flatten().tolist()
        if 1 not in label_ids:
            # Fail with the offending entry named rather than list.index's
            # generic "1 is not in list".
            raise ValueError(
                "entry {} has no token labelled 1 in 'pre_tok_label_ids'".format(idx))
        bias_idx = label_ids.index(1)
        bias_word = word_tokens[bert_to_word[bias_idx]]

        # Words missing from the GloVe vocabulary get an all-zero vector.
        embeddings.append(token2glove.get(bias_word, np.zeros(d)))

    tensor = torch.tensor(np.stack(embeddings), dtype=torch.float32)  # num_entries, dim
    dataset.add_data(tensor, "glove_embed")

In [79]:
# Attach the per-entry embedding to the dataset as "glove_embed". The path
# argument is not read by the function, which uses the global `glove`.
add_bias_embeddings(dataset, "../../../tasks/bias_classification/data/word_vectors/glove.6B.100d.txt")

100%|██████████| 332/332 [00:00<00:00, 14323.28it/s]


In [51]:
def add_sent_embeddings(dataset, path, max_len=80):
    """
    Adds a fixed-length matrix of per-word GloVe embeddings for each entry's
    sentence to the dataset under the key "glove_embed".

    Each sentence is passed through detokenize, re-tokenized with nltk's
    word_tokenize, and then padded (with -1 vectors) or truncated to exactly
    max_len rows.
    Relies on the notebook globals `glove` and `sentence_toks`.
    NOTE(review): `sentence_toks` is not defined in the visible part of this
    notebook -- confirm where it comes from before running.
    NOTE(review): this writes the same "glove_embed" key as
    add_bias_embeddings -- confirm how dataset.add_data treats an existing key.
    @param dataset iterable of entries; extended through dataset.add_data
    @param path (str) unused: the vectors are taken from the global `glove`
    @param max_len (int) number of rows kept per sentence
    """
    token2glove = glove  # glove2dict(path)
    d = len(next(iter(token2glove.values())))
    entries = []
    for entry in tqdm(dataset):
        idx = int(entry["index"])
        sent = detokenize(sentence_toks[idx])

        # give unknown tokens 0 embedding
        embeddings = [token2glove.get(token, np.zeros(d)) for token in word_tokenize(sent)]

        # pad with -1 vectors up to max_len, then cut to exactly max_len rows
        if len(embeddings) < max_len:
            embeddings.extend([-1 * np.ones(d)] * (max_len - len(embeddings)))
        embeddings = embeddings[:max_len]
        # The original indexed this array with an undefined name `b`, which
        # raised NameError; the stacked array is used as-is.
        tensor = np.stack(embeddings)  # max_len, dim
        entries.append(tensor)
    entries_tensor = torch.tensor(np.stack(entries), dtype=torch.float32)
    dataset.add_data(entries_tensor, "glove_embed")

In [52]:
# NOTE(review): `add_embeddings` is not defined in the visible notebook, so this
# cell raises NameError on a fresh kernel. It is presumably an earlier name of
# add_sent_embeddings; that function writes the same "glove_embed" key as
# add_bias_embeddings, so confirm whether this cell should still be run at all.
add_embeddings(dataset, "../../../tasks/bias_classification/data/word_vectors/glove.6B.100d.txt")

100%|██████████| 332/332 [00:00<00:00, 2705.77it/s]


Run the cell below to reset params 
-----

In [87]:
# Reload the experiment parameters and rebuild the classification experiment
# from the final-task section. The path is derived from experiment_dir (as in
# the initial load) so the experiment location is defined in one place only.
params = prepare_model.intialize_params(os.path.join(experiment_dir, "experiment_params.json"))
classification_experiment = prepare_model.initialize_classification_experiment(params.final_task)

In [88]:
# Show the final-task configuration that the classifier was built from.
print(params.final_task)

{'model': 'shallow_nn', 'window_size': 5, 'input_dim': 100, 'hidden_dim': 10, 'output_dim': 1, 'data_split': {'train_split': 0.7, 'eval_split': 0.3, 'test_split': 0}, 'training_params': {'optimizer': 'adam', 'loss': 'bce_with_logits', 'num_epochs': 200, 'batch_size': 32, 'lr': 0.01}}


Bootstrapping procedure

In [None]:
# Run the bootstrapping helper with the "glove_embed" inputs and "bias_label"
# targets. The helper is not defined in this notebook (presumably it comes from
# the `src.utils` star import), so its name is spelled as it is defined there.
statistics = run_boostrapping(classification_experiment, dataset, params, input_key='glove_embed', label_key='bias_label', threshold=0.42)

In [93]:
# Display the statistics returned by the bootstrapping run above.
statistics

{'auc': [(0.8176440250721501, 0.9243149494949495), 0.87503878710997],
 'accuracy': [(0.74, 0.8700000000000001), 0.8078250000000001]}

New run 

In [35]:
# Reload the parameters and rebuild the classification experiment for a new run.
# NOTE(review): this path is relative to the kernel's working directory, unlike
# the experiment_dir-based path used for the initial load -- confirm which
# experiment_params.json is intended here.
params = prepare_model.intialize_params("experiment_params.json") 
classification_experiment = prepare_model.initialize_classification_experiment(params.final_task)

In [36]:
# Show the final-task configuration loaded for this run.
print(params.final_task)

{'model': 'shallow_nn', 'window_size': 5, 'input_dim': 11, 'hidden_dim': 3, 'output_dim': 1, 'data_split': {'train_split': 0.7, 'eval_split': 0.3, 'test_split': 0}, 'training_params': {'optimizer': 'adam', 'loss': 'bce_with_logits', 'num_epochs': 200, 'batch_size': 32, 'lr': 0.01}}


Bootstrapping procedure

In [None]:
# NOTE(review): 'attention_dist' is not among the dataset keys shown by
# get_key_names() and no visible cell adds it -- confirm the key exists before
# running this cell.
statistics = run_boostrapping(classification_experiment, dataset, params, input_key='attention_dist', label_key='bias_label', threshold=0.42)

In [22]:
# Display the statistics returned by the bootstrapping run above.
statistics

{'auc': [(0.49685111400227683, 0.8255225075881711), 0.7107909485455589],
 'accuracy': [(0.5869186046511627, 0.7502906976744186), 0.6730232558139536]}

## No bootstrapping -- cannot calculate 95% confidence intervals 

In [80]:
# Split the dataset into train/eval/test dataloaders using the split fractions
# and batch size from the final-task parameters.
# NOTE(review): no random seed is set in the visible cells -- confirm whether
# the split is reproducible across runs.
data_split = params.final_task['data_split']
batch_size = params.final_task['training_params']['batch_size']
train_dataloader, eval_dataloader, test_dataloader = dataset.split_train_eval_test(**data_split, batch_size=batch_size)

In [81]:
# Single training run (no bootstrapping) on the "glove_embed" inputs; the
# returned losses and evaluations are averaged per epoch in the next cell.
losses, evaluations = classification_experiment.train_model(train_dataloader, eval_dataloader, input_key='glove_embed', label_key='bias_label', threshold=0.42)



In [82]:
# Collapse each epoch's collected losses / evaluations with average_data,
# yielding one averaged item per epoch.
avg_losses = list(map(average_data, losses))
avg_predictions = list(map(average_data, evaluations))

In [92]:
# Unpack what are presumably the min / max / average of the per-epoch "loss"
# values (per the unpacked names), then print the maximum.
min_loss, max_loss, avg_loss = get_statistics(avg_losses, "loss")
print(max_loss)

0.652036083155665


In [84]:
# Same statistics for the "auc" entry of the per-epoch averaged evaluations.
min_auc, max_auc, avg_auc = get_statistics(avg_predictions, "auc")

In [85]:
# max_auc as returned by get_statistics for this single run.
print(max_auc)

0.9033766233766234


In [86]:
# min_auc as returned by get_statistics for this single run.
print(min_auc)

0.8266955266955267
