Experiment: Uses Word2Vec-style word-vector features (pretrained GloVe embeddings) to generate weak labels
------


In [1]:
import sys; sys.path.append('../..')
import torch
from bias_classification import prepare_model 
from src.utils import *
from tasks.bias_classification.lib.tagging.features import Featurizer

%load_ext autoreload
%autoreload 2

In [2]:
# Read the experiment settings from the JSON file (path relative to the working directory).
params = prepare_model.intialize_params("experiment_params.json")
# Build the labelled dataset from the intermediary-task section of those settings.
dataset = prepare_model.initialize_dataset(params.intermediary_task)

06/06/2019 11:20:51 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
440it [00:00, 7286.67it/s]


In [3]:
# Location of the labelled target data, taken from the intermediary-task settings.
path_label_data = params.intermediary_task['task_specific_params']['target_data']
# Tokens of every sample in that file; add_bias_embeddings looks them up by each entry's "index" value.
all_bert_toks = prepare_model.get_sample_toks(path_label_data)

440it [00:00, 163651.13it/s]


In [4]:
def glove2dict(src_filename):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is expected to hold a token followed by its
    whitespace-separated vector components. Blank lines and lines that are
    not valid UTF-8 are skipped.

    Parameters
    ----------
    src_filename : str
        Full path to the GloVe file to be processed.

    Returns
    -------
    dict
        Mapping words to their GloVe vectors (1-D ``float64`` arrays).
    """
    data = {}
    # Read raw bytes and decode line by line: a single undecodable line is
    # then skipped on its own, independent of the platform default encoding
    # and of how a text-mode reader recovers after a decode error.
    with open(src_filename, "rb") as f:
        for raw_line in f:
            try:
                fields = raw_line.decode("utf-8").split()
            except UnicodeDecodeError:
                # Best effort: ignore lines that are not valid UTF-8.
                continue
            if not fields:
                # Blank line: there is no token to use as a key.
                continue
            # ``np.float`` was removed in NumPy 1.24; ``np.float64`` is the
            # dtype it used to alias.
            data[fields[0]] = np.array(fields[1:], dtype=np.float64)
    return data

In [5]:
# Load the GloVe vectors once (100-dimensional 6B set, going by the file name).
# add_bias_embeddings reads this module-level `glove` dict instead of re-loading the file.
glove = glove2dict("../../../tasks/bias_classification/data/word_vectors/glove.6B.100d.txt")

In [6]:
def bert_to_word_tokens(tokens):
    """
    Merges BERT WordPiece tokens back into whole-word tokens.
    @param tokens (list[str]): tokens as produced by BertTokenizer; continuation
        pieces carry a leading "##"
    @returns word_tokens (list[str]): the merged whole-word tokens
    @returns bert_to_word (dict[int, int]): maps every index in `tokens` to the
        index in `word_tokens` of the word that piece belongs to
    """
    bert_to_word = {}
    word_tokens = []
    for bert_token_idx, bert_token in enumerate(tokens):
        if bert_token.startswith("##"):
            # Drop only the two-character continuation marker; a "##" that
            # appears later inside the piece is real content.
            piece = bert_token[2:]
            if word_tokens:
                word_tokens[-1] += piece
            else:
                # A sequence that starts with a continuation piece has no
                # previous word to extend, so the piece opens a new word.
                word_tokens.append(piece)
        else:
            word_tokens.append(bert_token)
        bert_to_word[bert_token_idx] = len(word_tokens) - 1

    return word_tokens, bert_to_word

In [7]:
def add_bias_embeddings(dataset, all_bert_toks, path, max_len=80):
    """
    Adds to `dataset`, under the key "glove_embed", one GloVe vector per entry:
    the vector of the entry's bias word.

    The bias word is the whole word (see bert_to_word_tokens) that contains the
    first BERT token whose `pre_tok_label_ids` value is 1. Words missing from
    the GloVe vocabulary get an all-zero vector.

    @param dataset: iterable of entries exposing "index" and "pre_tok_label_ids",
        and providing add_data(tensor, key)
    @param all_bert_toks: per-sample BERT token lists, looked up by int(entry["index"])
    @param path (str): path of the GloVe file. NOTE: currently ignored; the
        vectors are read from the module-level `glove` dict so the file is not
        re-parsed on every call
    @param max_len (int): unused; kept so existing callers keep working
    @raises ValueError: if an entry has no token labelled 1
    """
    token2glove = glove #glove2dict(path)
    d = len(next(iter(token2glove.values())))
    # Shared fallback for out-of-vocabulary words; np.stack copies its inputs,
    # so reusing one array for several rows is safe.
    oov_vector = np.zeros(d)
    embeddings = []
    for entry in tqdm(dataset):
        idx = int(entry["index"])
        bert_toks = all_bert_toks[idx]
        word_tokens, bert_to_word = bert_to_word_tokens(bert_toks)

        # Position of the first token labelled 1, mapped back to its whole word.
        token_labels = entry['pre_tok_label_ids'].to(dtype=torch.int).flatten().tolist()
        bias_idx = token_labels.index(1)
        bias_word = word_tokens[bert_to_word[bias_idx]]

        embeddings.append(token2glove.get(bias_word, oov_vector))

    # np.stack rejects an empty list, so an empty dataset gets a (0, d) matrix.
    stacked = np.stack(embeddings) if embeddings else np.zeros((0, d))
    tensor = torch.tensor(stacked, dtype=torch.float32)  # num_entries, dim
    dataset.add_data(tensor, "glove_embed")

In [8]:
# Attach the "glove_embed" tensor to the labelled dataset.
# NOTE(review): add_bias_embeddings ignores the path argument and reads the global `glove` dict.
add_bias_embeddings(dataset,all_bert_toks, "../../../tasks/bias_classification/data/word_vectors/glove.6B.100d.txt")

HBox(children=(IntProgress(value=0, max=332), HTML(value='')))




## Training model

In [9]:
# Split settings and batch size come from the final-task section of the experiment settings.
data_split = params.final_task['data_split']
batch_size = params.final_task['training_params']['batch_size']
# One dataloader each for training, evaluation and test.
train_dataloader, eval_dataloader, test_dataloader = dataset.split_train_eval_test(**data_split, batch_size=batch_size)
# Experiment object used below for training and, later, for inference on the full dataset.
classification_experiment = prepare_model.initialize_classification_experiment(params.final_task)

In [10]:
# Train on the "glove_embed" features against the "bias_label" targets.
# NOTE(review): threshold=0.42 is a magic number, presumably the decision threshold used
# when evaluating predictions; confirm and move it into the experiment settings.
losses, evaluations = classification_experiment.train_model(train_dataloader, eval_dataloader, input_key='glove_embed', label_key='bias_label', threshold=0.42)

HBox(children=(IntProgress(value=0, description='epoch training', max=200, style=ProgressStyle(description_wid…



In [11]:
# Collapse every element of `losses` / `evaluations` (one per epoch, going by
# the training output they come from) into a single averaged record.
avg_losses = list(map(average_data, losses))
avg_predictions = list(map(average_data, evaluations))

In [12]:
# Summarise the averaged losses; the result is unpacked here as (min, max, avg).
min_loss, max_loss, avg_loss = get_statistics(avg_losses, "loss")
print(min_loss)

0.026241750705280693


In [13]:
# Same summary for the "auc" entry of the averaged evaluations.
min_auc, max_auc, avg_auc = get_statistics(avg_predictions, "auc")

In [14]:
# Highest averaged AUC across the training run.
print(max_auc)

0.8787878787878789


In [15]:
# Lowest averaged AUC across the training run.
print(min_auc)

0.8097258297258296


Generating weak labels for the full dataset
-----

In [16]:
# Location of the full data, taken from the intermediary-task settings.
data_path = params.intermediary_task['task_specific_params']['full_data']
# Build the full dataset from that file; no labels file is supplied (labels_path=None).
full_dataset = prepare_model.initialize_dataset(params.intermediary_task, data_path=data_path, labels_path=None)

06/06/2019 11:22:43 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
53803it [00:08, 6445.73it/s]


In [17]:
# For every entry, record the position of the first token labelled 1 as
# "bias_idx"; entries without such a token are dropped from the dataset.
# NOTE: this cell mutates full_dataset in place, so it is not safe to run twice.
indices = full_dataset.get_val('index')
intermediary_labels = full_dataset.get_val('pre_tok_label_ids')
skip_indices = []
bias_indices = []
for i, label in enumerate(intermediary_labels):
    token_labels = label.to(torch.int).flatten().tolist()
    try:
        bias_indices.append(token_labels.index(1))
    except ValueError:
        # list.index raises ValueError when no token is labelled 1. Any other
        # error (e.g. a malformed label tensor) is a real bug and must surface.
        skip_indices.append(i)
full_dataset.remove_indices(skip_indices)
full_dataset.add_data(np.asarray(bias_indices), "bias_idx")

In [18]:
# Tokens of every sample in the full data file; add_bias_embeddings looks them up by each entry's "index" value.
all_bert_toks_full_data = prepare_model.get_sample_toks(data_path)

53803it [00:00, 87100.42it/s]


In [19]:
# Attach the "glove_embed" tensor to the full dataset.
# NOTE(review): the saved output of this cell is "IndexError: list index out of range",
# so in that run "glove_embed" was never added to full_dataset. Fix before relying on later cells.
add_bias_embeddings(full_dataset, all_bert_toks_full_data, "../../../tasks/bias_classification/data/word_vectors/glove.6B.100d.txt")

HBox(children=(IntProgress(value=0, max=52275), HTML(value='')))




IndexError: list index out of range

In [29]:
# Dataloader over the whole full dataset, in batches of 64, for inference.
dataloader = full_dataset.return_dataloader(batch_size=64)

In [31]:
# The classifier above was trained with input_key='glove_embed', and that is
# the only feature key this notebook adds to full_dataset, so inference must
# read the same key. "bias_features" is never created in this notebook.
# The label key is left empty because the full dataset was loaded without labels.
# NOTE(review): this rebinds `evaluations`, which previously held the training evaluations.
keys = {"input_key": "glove_embed", "label_key": ""}
predictions, evaluations = classification_experiment.run_inference(dataloader, **keys)

In [32]:
# Convert the inference output to a NumPy array so it can be stored on the dataset.
predictions_array = np.asarray(predictions)

In [33]:
# Store the predictions on the full dataset as the weak labels.
full_dataset.add_data(predictions_array, "word2vec_weak_labels")

In [34]:
# List the keys now stored on the full dataset.
# NOTE(review): the saved output lists 'bias_features' and 'marta_weak_labels' but neither
# 'glove_embed' nor 'word2vec_weak_labels', and the execution counts are out of order;
# re-run the notebook from a fresh kernel to refresh it.
full_dataset.get_key_names()

dict_keys(['pre_ids', 'masks', 'pre_lens', 'post_in_ids', 'post_out_ids', 'pre_tok_label_ids', 'post_tok_label_ids', 'rel_ids', 'pos_ids', 'categories', 'index', 'bias_idx', 'bias_features', 'marta_weak_labels'])

In [38]:
import pickle

# Persist the dataset, including the weak labels, next to the notebook.
# The context manager closes the file handle, so the pickle is fully flushed
# to disk even if dump() raises part-way through.
with open("word2vec_weak_labels.pkl", 'wb') as pkl_file:
    pickle.dump(full_dataset, pkl_file)

In [37]:
# Spot-check the weak label stored for entry 10.
full_dataset[10]['word2vec_weak_labels']

0