Experiment: Using Recasens' Features
------


In [1]:
import sys; sys.path.append('../..')
import torch
from bias_classification import prepare_model 
from src.utils import *
from tasks.bias_classification.lib.tagging.features import Featurizer

%load_ext autoreload
%autoreload 2

In [2]:
# Load the experiment configuration, then build the dataset for the intermediary task.
# NOTE: `intialize_params` (sic) is the name as it is called on prepare_model here.
params = prepare_model.intialize_params("experiment_params.json")
dataset = prepare_model.initialize_dataset(params.intermediary_task)

01/21/2020 17:19:44 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
440it [00:00, 2326.37it/s]


In [3]:
# Inspect which fields a single dataset entry exposes.
dataset[0].keys()

dict_keys(['pre_ids', 'masks', 'pre_lens', 'post_in_ids', 'post_out_ids', 'pre_tok_label_ids', 'post_tok_label_ids', 'rel_ids', 'pos_ids', 'categories', 'index', 'bias_label'])

In [9]:
# Path to the target data, read from the intermediary task's task-specific params.
path_label_data = params.intermediary_task['task_specific_params']['target_data']
# NOTE(review): presumably the BERT tokens of every sample (going by the name) — confirm in prepare_model.
all_bert_toks = prepare_model.get_sample_toks(path_label_data)

440it [00:00, 94786.53it/s]


In [12]:
# NOTE(review): presumably a token -> id vocabulary mapping (going by the name) — confirm in prepare_model.
# It is handed to the Featurizer constructor in a later cell.
tok2id = prepare_model.get_tok2id(params.intermediary_task)

06/05/2019 19:23:32 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [16]:
# Build the featurizer from tok2id and the intermediary-task params.
featurizer = Featurizer(tok2id, params=params.intermediary_task)

In [92]:
def bert_to_word_tokens(tokens):
    """
    Merges BERT WordPiece tokens back into whole-word tokens.

    A token that starts with "##" is a continuation piece: it is appended, without
    that prefix, to the word currently being built. Any other token starts a new
    word. A continuation piece with no preceding word (i.e. at position 0) is kept
    verbatim as its own word instead of raising an IndexError.

    @param tokens (list[str]) WordPiece tokens, e.g. as produced by BertTokenizer
    @returns (word_tokens, bert_to_word)
        word_tokens (list[str]) the reassembled words
        bert_to_word (dict[int, int]) maps every index of `tokens` to the index of
            the word in `word_tokens` that contains it
    """
    continuation_prefix = "##"
    bert_to_word = {}
    word_tokens = []
    for bert_token_idx, bert_token in enumerate(tokens):
        if bert_token.startswith(continuation_prefix) and word_tokens:
            # Strip only the leading marker; any later "##" is part of the piece itself.
            word_tokens[-1] += bert_token[len(continuation_prefix):]
        else:
            word_tokens.append(bert_token)
        bert_to_word[bert_token_idx] = len(word_tokens) - 1

    return word_tokens, bert_to_word

In [93]:
# Sanity check: the trailing "##s" piece should be merged into "illegal".
tokens = ["the", "latvian", "government", "illegal", "##s"]
bert_to_word_tokens(tokens)

(['the', 'latvian', 'government', 'illegals'], {0: 0, 1: 1, 2: 2, 3: 3, 4: 3})

In [59]:
def add_bias_features(dataset, featurizer):
    """
    Adds to `dataset`, under the key "bias_features", the feature row of the first
    token labeled 1 in each entry.

    For every entry, the position of the first 1 in entry['pre_tok_label_ids'] is
    located, featurizer.features(...) is evaluated on the entry's 'pre_ids',
    'rel_ids' and 'pos_ids', and the row at that position is kept. The rows are
    stacked into a float32 tensor and handed to dataset.add_data.

    @param dataset iterable of dict-like entries holding tensors under
        'pre_tok_label_ids', 'pre_ids', 'rel_ids' and 'pos_ids' (plus 'index'),
        and exposing add_data(tensor, key)
    @param featurizer object exposing features(pre_ids, rel_ids, pos_ids) whose
        result supports [row, :] indexing
    @returns None; `dataset` is modified through add_data
    @raises ValueError if an entry contains no token labeled 1
    """
    embeddings = []
    for entry in tqdm(dataset):
        tok_labels = entry['pre_tok_label_ids'].to(dtype=torch.int).flatten().tolist()
        if 1 not in tok_labels:
            # Same exception type list.index would raise, but naming the offending entry.
            raise ValueError(
                "entry {} has no token labeled 1 in 'pre_tok_label_ids'".format(int(entry["index"]))
            )
        bias_idx = tok_labels.index(1)

        features = featurizer.features(entry["pre_ids"].tolist(),
                                       entry["rel_ids"].tolist(),
                                       entry["pos_ids"].tolist())
        embeddings.append(features[bias_idx, :])

    tensor = torch.tensor(np.stack(embeddings), dtype=torch.float32)  # num_entries, dim
    dataset.add_data(tensor, "bias_features")

In [60]:
# Stores the computed features on `dataset` under "bias_features"; the call returns nothing.
add_bias_features(dataset, featurizer)

HBox(children=(IntProgress(value=0, max=332), HTML(value='')))




Run the cell below to reset params 
-----

In [68]:
#reloading classification experiment with new experiments
params = prepare_model.intialize_params("experiment_params.json") 
classification_experiment = prepare_model.initialize_classification_experiment(params.final_task)

In [69]:
# Show the final-task configuration that the following run uses.
print(params.final_task)

{'model': 'shallow_nn', 'window_size': 5, 'input_dim': 90, 'hidden_dim': 10, 'output_dim': 1, 'data_split': {'train_split': 0.7, 'eval_split': 0.3, 'test_split': 0}, 'training_params': {'optimizer': 'adam', 'loss': 'bce_with_logits', 'num_epochs': 200, 'batch_size': 32, 'lr': 0.01}}


Bootstrapping procedure

In [None]:
# NOTE(review): run_boostrapping (sic) is not imported by name in this notebook; presumably it
# arrives through `from src.utils import *` — confirm.
# threshold=0.42 is hard-coded here and in the other training cells; its meaning is not shown here.
statistics = run_boostrapping(classification_experiment, dataset, params, input_key='bias_features', label_key='bias_label', threshold=0.42)

In [72]:
# Display the value returned by run_boostrapping.
statistics

{'auc': [(0.9240057679162943, 0.9919756571930486), 0.9621985867990098],
 'accuracy': [(0.8747500000000001, 0.9592083333333334), 0.9201166666666668]}

New run 

In [35]:
#reloading classification experiment with new experiments
params = prepare_model.intialize_params("experiment_params.json") 
classification_experiment = prepare_model.initialize_classification_experiment(params.final_task)

In [36]:
# Show the final-task configuration that the following run uses.
print(params.final_task)

{'model': 'shallow_nn', 'window_size': 5, 'input_dim': 11, 'hidden_dim': 3, 'output_dim': 1, 'data_split': {'train_split': 0.7, 'eval_split': 0.3, 'test_split': 0}, 'training_params': {'optimizer': 'adam', 'loss': 'bce_with_logits', 'num_epochs': 200, 'batch_size': 32, 'lr': 0.01}}


Bootstrapping procedure

In [None]:
# NOTE(review): run_boostrapping (sic) is not imported by name in this notebook; presumably it
# arrives through `from src.utils import *` — confirm.
# threshold=0.42 is hard-coded here and in the other training cells; its meaning is not shown here.
statistics = run_boostrapping(classification_experiment, dataset, params, input_key='bias_features', label_key='bias_label', threshold=0.42)

In [22]:
# Display the value returned by run_boostrapping.
statistics

{'auc': [(0.49685111400227683, 0.8255225075881711), 0.7107909485455589],
 'accuracy': [(0.5869186046511627, 0.7502906976744186), 0.6730232558139536]}

## No bootstrapping -- cannot calculate 95% confidence intervals 

In [83]:
# Scratch check: intersect two single-element sets whose members are int16 tensors converted with int().
set([int(torch.tensor(172, dtype=torch.int16))]) & set([int(torch.tensor(172, dtype=torch.int16))])

{172}

In [86]:
# Read the split configuration and batch size from the final-task config, then
# split the dataset into train / eval / test dataloaders.
data_split = params.final_task['data_split']
batch_size = params.final_task['training_params']['batch_size']
train_dataloader, eval_dataloader, test_dataloader = dataset.split_train_eval_test(**data_split, batch_size=batch_size)

{1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 63, 65, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 107, 108, 110, 113, 116, 117, 118, 119, 120, 121, 122, 124, 126, 127, 128, 129, 130, 131, 133, 135, 137, 138, 140, 141, 142, 143, 146, 148, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 183, 185, 187, 189, 190, 191, 192, 193, 194, 196, 197, 198, 199, 202, 204, 207, 208, 209, 210, 211, 212, 214, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 229, 230, 232, 233, 235, 236, 238, 239, 241, 242, 243, 246, 247, 248, 249, 250, 251, 252, 253, 254, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 276, 278,

In [62]:
# Single training run (no bootstrapping) on 'bias_features' against 'bias_label'.
# `losses` and `evaluations` are averaged item by item in the next cell.
# threshold=0.42 is hard-coded; its meaning is not shown in this notebook.
losses, evaluations = classification_experiment.train_model(train_dataloader, eval_dataloader, input_key='bias_features', label_key='bias_label', threshold=0.42)






In [63]:
# Apply average_data to every item of the training losses and of the evaluations.
avg_losses = list(map(average_data, losses))
avg_predictions = list(map(average_data, evaluations))

In [64]:
# NOTE(review): the (min, max, avg) ordering is inferred from the unpacking names — confirm in get_statistics.
min_loss, max_loss, avg_loss = get_statistics(avg_losses, "loss")
print(min_loss)

0.04847464928853101


In [65]:
# NOTE(review): the (min, max, avg) ordering is inferred from the unpacking names — confirm in get_statistics.
min_auc, max_auc, avg_auc = get_statistics(avg_predictions, "auc")

In [66]:
# Second value returned by get_statistics(avg_predictions, "auc").
print(max_auc)

0.9656565656565657


In [67]:
# First value returned by get_statistics(avg_predictions, "auc").
print(min_auc)

0.9272150072150072
