Experiment: Using Recasens et al.'s features to generate weak labels
------


In [1]:
import sys; sys.path.append('../..')
import torch
from bias_classification import prepare_model 
from src.utils import *
from tasks.bias_classification.lib.tagging.features import Featurizer

%load_ext autoreload
%autoreload 2

In [2]:
# Read the experiment configuration from experiment_params.json, then build the
# dataset described by its `intermediary_task` section.
params = prepare_model.intialize_params("experiment_params.json")
dataset = prepare_model.initialize_dataset(params.intermediary_task)

06/06/2019 10:15:43 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
440it [00:00, 6074.72it/s]


In [3]:
# The labelled-data path lives under task_specific_params['target_data'] in the config.
path_label_data = params.intermediary_task['task_specific_params']['target_data']
# Per-sample tokens read from that file (presumably BERT WordPiece tokens, judging
# by the variable name — confirm in prepare_model.get_sample_toks).
all_bert_toks = prepare_model.get_sample_toks(path_label_data)

440it [00:00, 140331.06it/s]


In [4]:
# tok2id is handed to the Featurizer in the next cell (presumably a
# token -> vocabulary-id map — confirm in prepare_model.get_tok2id).
tok2id = prepare_model.get_tok2id(params.intermediary_task)

06/06/2019 10:15:45 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [5]:
# Featurizer whose .features(...) output is used below as the classifier input.
featurizer = Featurizer(tok2id, params=params.intermediary_task)

In [6]:
def bert_to_word_tokens(tokens):
    """
    Merges WordPiece tokens back into whole words and records, for every
    WordPiece position, the index of the word that position belongs to.

    @param tokens (list[str]) tokens produced by BertTokenizer; a piece that
        continues the previous word carries a "##" prefix
    @returns word_tokens (list[str]) the merged words
    @returns bert_to_word (dict[int, int]) WordPiece index -> index into word_tokens
    """
    bert_to_word = {}
    word_tokens = []
    for bert_token_idx, bert_token in enumerate(tokens):
        # A continuation piece is glued onto the word currently being built.
        # If there is no word yet (the sequence starts with a "##" piece),
        # the piece is kept verbatim as its own word instead of crashing.
        if bert_token.startswith("##") and word_tokens:
            word_tokens[-1] += bert_token.replace("##", "")
        else:
            word_tokens.append(bert_token)
        # Record the mapping only after the word list is updated, so that a
        # continuation piece points at the word it was merged into rather
        # than at the (possibly non-existent) next word.
        bert_to_word[bert_token_idx] = len(word_tokens) - 1

    return word_tokens, bert_to_word

In [7]:
def add_bias_features(dataset, featurizer):
    """
    Stores on `dataset`, under the key "bias_features", the featurizer row of
    the first token labelled 1 in every entry.

    @param dataset iterable of entries exposing "index", "pre_ids", "rel_ids",
        "pos_ids" and "pre_tok_label_ids"; must also provide add_data(data, key)
    @param featurizer object whose features(pre_ids, rel_ids, pos_ids) returns
        an array indexable as [token_position, :]
    @raises ValueError if an entry has no token labelled 1
    """
    embeddings = []
    for entry in tqdm(dataset):
        token_labels = entry['pre_tok_label_ids'].to(dtype=torch.int).flatten().tolist()
        try:
            # Position of the first token whose label is 1.
            bias_idx = token_labels.index(1)
        except ValueError as err:
            # Re-raise with the entry's index so the offending sample can be found.
            raise ValueError(
                "entry %d has no token labelled 1 in 'pre_tok_label_ids'" % int(entry["index"])
            ) from err

        features = featurizer.features(entry["pre_ids"].tolist(),
                                       entry["rel_ids"].tolist(),
                                       entry["pos_ids"].tolist())
        embeddings.append(features[bias_idx, :])

    tensor = torch.tensor(np.stack(embeddings), dtype=torch.float32)  # num_entries, dim
    dataset.add_data(tensor, "bias_features")

In [8]:
# Attach the "bias_features" column to the labelled dataset; it is the
# input_key passed to train_model further down.
add_bias_features(dataset, featurizer)

HBox(children=(IntProgress(value=0, max=332), HTML(value='')))




## Training model

In [9]:
# Everything in this cell is driven by the `final_task` section of the config.
final_task_params = params.final_task
data_split = final_task_params['data_split']
batch_size = final_task_params['training_params']['batch_size']

# One dataloader per partition, in train / eval / test order.
loaders = dataset.split_train_eval_test(**data_split, batch_size=batch_size)
train_dataloader, eval_dataloader, test_dataloader = loaders

classification_experiment = prepare_model.initialize_classification_experiment(final_task_params)

In [10]:
# Train on the "bias_features" column against "bias_label".
# NOTE(review): threshold=0.42 is a bare constant with no recorded rationale —
# consider moving it into experiment_params.json next to the other training params.
losses, evaluations = classification_experiment.train_model(train_dataloader, eval_dataloader, input_key='bias_features', label_key='bias_label', threshold=0.42)

HBox(children=(IntProgress(value=0, description='epoch training', max=200, style=ProgressStyle(description_wid…



In [11]:
# Reduce every element of `losses` / `evaluations` (one per epoch, judging by
# the original loop-variable names) with average_data.
avg_losses = list(map(average_data, losses))
avg_predictions = list(map(average_data, evaluations))

In [12]:
# Summarise the "loss" entry of the averaged losses; only the first returned
# value (bound to min_loss) is displayed.
min_loss, max_loss, avg_loss = get_statistics(avg_losses, "loss")
print(min_loss)

0.050349742944898274


In [13]:
# Same summary for the "auc" entry of the averaged evaluations.
min_auc, max_auc, avg_auc = get_statistics(avg_predictions, "auc")

In [14]:
# Second value returned by get_statistics(avg_predictions, "auc").
print(max_auc)

0.9708513708513709


In [15]:
# First value returned by get_statistics(avg_predictions, "auc").
print(min_auc)

0.9414141414141414


Generating Full dataset
-----

In [16]:
# Build a second dataset with the same task config, but reading from the
# 'full_data' path and without a separate labels file (labels_path=None).
data_path = params.intermediary_task['task_specific_params']['full_data']
full_dataset = prepare_model.initialize_dataset(params.intermediary_task, data_path=data_path, labels_path=None)

06/06/2019 10:16:10 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
53803it [00:08, 6355.95it/s]


In [17]:
# For every example, find the position of the first token labelled 1 in
# 'pre_tok_label_ids'. Examples without such a token are dropped from the
# dataset; the positions of the remaining ones are stored under "bias_idx".
# NOTE(review): remove_indices is called on full_dataset itself, so this cell
# changes the dataset it reads from — presumably in place; confirm before
# re-running it on an already-filtered dataset.
indices = full_dataset.get_val('index')
intermediary_labels = full_dataset.get_val('pre_tok_label_ids')
skip_indices = []
bias_indices = []
for i, label in enumerate(intermediary_labels):
    token_labels = label.to(torch.int).flatten().tolist()
    try:
        bias_indices.append(token_labels.index(1))
    except ValueError:
        # list.index raises ValueError when no token is labelled 1; any other
        # error (bad label type, interrupt, ...) is allowed to surface.
        skip_indices.append(i)
full_dataset.remove_indices(skip_indices)
full_dataset.add_data(np.asarray(bias_indices), "bias_idx")

In [18]:
# Per-sample tokens for the full data file (same helper as used for the labelled data).
all_bert_toks_full_data = prepare_model.get_sample_toks(data_path)

53803it [00:00, 161817.26it/s]


In [19]:
# NOTE(review): identical to the earlier tok2id cell — rebinds the same name
# from the same arguments; likely redundant.
tok2id = prepare_model.get_tok2id(params.intermediary_task)

06/06/2019 10:17:09 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [20]:
# NOTE(review): identical to the earlier Featurizer cell — rebuilds the
# featurizer from the same arguments; likely redundant.
featurizer = Featurizer(tok2id, params=params.intermediary_task)

In [49]:
def bert_to_word_tokens(tokens):
    """
    Merges WordPiece tokens back into whole words and records, for every
    WordPiece position, the index of the word that position belongs to.

    @param tokens (list[str]) tokens produced by BertTokenizer; a piece that
        continues the previous word carries a "##" prefix
    @returns word_tokens (list[str]) the merged words
    @returns bert_to_word (dict[int, int]) WordPiece index -> index into word_tokens
    """
    bert_to_word = {}
    word_tokens = []
    for bert_token_idx, bert_token in enumerate(tokens):
        is_continuation = bert_token.startswith("##")
        if is_continuation and word_tokens:
            # Glue the piece onto the word currently being built.
            word_tokens[-1] += bert_token.replace("##", "")
        else:
            # Either a fresh word, or a continuation piece with nothing before
            # it (e.g. a sequence cut mid-word); the latter is kept verbatim
            # as its own word rather than raising IndexError.
            word_tokens.append(bert_token)
        bert_to_word[bert_token_idx] = len(word_tokens) - 1

    return word_tokens, bert_to_word

IndentationError: expected an indented block (<ipython-input-49-1a09f5f05f1e>, line 12)

In [46]:
def add_bias_features(dataset, featurizer):
    """
    Stores on `dataset`, under the key "bias_features", the featurizer row at
    each entry's precomputed "bias_idx" position.

    @param dataset iterable of entries exposing "pre_ids", "rel_ids", "pos_ids"
        and "bias_idx"; must also provide add_data(data, key)
    @param featurizer object whose features(pre_ids, rel_ids, pos_ids) returns
        an array indexable as [token_position, :]
    """
    embeddings = []
    for entry in tqdm(dataset):
        # Normalise to a plain int so the index behaves the same whether the
        # dataset hands back a Python int, a NumPy scalar or a one-element tensor.
        bias_idx = int(entry['bias_idx'])
        features = featurizer.features(entry["pre_ids"].tolist(),
                                       entry["rel_ids"].tolist(),
                                       entry["pos_ids"].tolist())
        embeddings.append(features[bias_idx, :])

    tensor = torch.tensor(np.stack(embeddings), dtype=torch.float32)  # num_entries, dim
    dataset.add_data(tensor, "bias_features")

In [47]:
# Attach the "bias_features" column to the full dataset; it is the input_key
# used for inference below.
add_bias_features(full_dataset, featurizer)

HBox(children=(IntProgress(value=0, max=52275), HTML(value='')))

['the', 'latvian', 'government', 'has', 'been', 'criticized', 'for', 'its', 'treatment', 'of', 'illegal', '##s']
11
11
['the', 'latvian', 'government', 'has', 'been', 'criticized', 'for', 'its', 'treatment', 'of', 'illegals']
['post', '-', 'trips', 'expansion', '##ism']
4
4
['post', '-', 'trips', 'expansionism']


KeyboardInterrupt: 

In [None]:
# Dataloader over the full dataset, in batches of 64, for inference.
dataloader = full_dataset.return_dataloader(batch_size=64)

In [None]:
# Feed the trained classifier the "bias_features" column; label_key is passed
# as an empty string.
# NOTE(review): `evaluations` rebinds the name that already holds the training
# evaluations returned by train_model.
keys = dict(input_key="bias_features", label_key="")
predictions, evaluations = classification_experiment.run_inference(
    dataloader,
    **keys,
)

In [31]:
# Convert the predictions to a NumPy array before storing them on the dataset.
predictions_array = np.asarray(predictions)

In [32]:
# Store the classifier predictions on the full dataset as "marta_weak_labels".
full_dataset.add_data(predictions_array, "marta_weak_labels")

In [33]:
# List the keys currently stored on the full dataset.
full_dataset.get_key_names()

dict_keys(['pre_ids', 'masks', 'pre_lens', 'post_in_ids', 'post_out_ids', 'pre_tok_label_ids', 'post_tok_label_ids', 'rel_ids', 'pos_ids', 'categories', 'index', 'pos', 'pos_weak_labels'])

In [34]:
import pickle 

# Persist the full dataset (including the "marta_weak_labels" added above).
# The context manager closes the file handle, so the pickle is fully flushed
# to disk even if dumping raises part-way through.
# NOTE: pickle files can execute arbitrary code on load — only unpickle this
# file from a trusted location.
with open("marta_weak_labels.pkl", 'wb') as out_file:
    pickle.dump(full_dataset, out_file)

In [35]:
# Spot-check: weak label stored for the first entry of the full dataset.
full_dataset[0]['marta_weak_labels']

1