Experiment: POS predictions on large dataset
------

In [1]:
import sys; sys.path.append('../..')
import torch
from bias_classification import prepare_model 
from src.utils import *

%load_ext autoreload
%autoreload 2

In [2]:
# Load the experiment configuration from a JSON file (relative path, resolved
# against the kernel's working directory). `intialize_params` is spelled this
# way at the call site; keep it in sync with `prepare_model`.
params = prepare_model.intialize_params("experiment_params.json")
# Build the dataset object from the intermediary-task section of the config.
dataset = prepare_model.initialize_dataset(params.intermediary_task)

06/05/2019 19:52:57 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
440it [00:00, 7007.36it/s]


In [3]:
# Run to print out the current key names in the dataset
# (as the cell's last expression, the returned keys are shown as the cell output).
dataset.get_key_names()

dict_keys(['pre_ids', 'masks', 'pre_lens', 'post_in_ids', 'post_out_ids', 'pre_tok_label_ids', 'post_tok_label_ids', 'rel_ids', 'pos_ids', 'categories', 'index', 'bias_label'])

In [4]:
# Path stored under 'target_data' in the intermediary-task config.
path_label_data = params.intermediary_task['task_specific_params']['target_data']
# Tokens loaded from that file; they are passed to `sentence_to_POS_matrix`
# further down to build the POS features.
sentence_toks = prepare_model.get_sample_toks(path_label_data)

440it [00:00, 133692.68it/s]


In [5]:
# Pull the per-example 'index' values and the token-level label entries
# out of the dataset.
indices = dataset.get_val('index')
intermediary_labels = dataset.get_val('pre_tok_label_ids')

# For every example: flatten its labels to a plain Python list, then take the
# position of the first element equal to 1. `list.index` raises ValueError if
# an example contains no 1, so this cell requires every example to have one.
bias_indices = [
    flat_labels.index(1)
    for flat_labels in (
        token_labels.flatten().tolist() for token_labels in intermediary_labels
    )
]

In [6]:
# Make sure NLTK's 'averaged_perceptron_tagger' resource is available locally.
# Presumably `sentence_to_POS_matrix` relies on it for POS tagging — TODO confirm.
import nltk
# The call's return value is shown as the cell output.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /sailhome/rdm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# Build the POS feature matrix for the small dataset. With
# `return_pos_list=True` the helper returns a second value, kept here as
# `valid_pos_labels` and reused later when featurising the full dataset.
pos_matrix, valid_pos_labels = sentence_to_POS_matrix(
    sentence_toks,
    bias_indices,
    indices,
    return_pos_list=True,
)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [8]:
# Attach the POS feature matrix to the dataset under the key "pos"
# (the same key is passed as `input_key` when training the classifier below).
dataset.add_data(pos_matrix, "pos")

## Basic Model Training

In [9]:
# Build the classification experiment from the final-task section of the config.
classification_experiment = prepare_model.initialize_classification_experiment(params.final_task)
# Split keyword arguments and batch size both come from the final-task config.
data_split = params.final_task['data_split']
batch_size = params.final_task['training_params']['batch_size']
# NOTE(review): no random seed is set in the visible cells, so the shuffle (and
# therefore the split and the metrics below) is presumably not reproducible
# across runs — confirm whether `shuffle_data` seeds internally.
dataset.shuffle_data()
# `data_split` is unpacked as keyword arguments; three dataloaders come back.
# `test_dataloader` is not used in the visible part of this notebook.
train_dataloader, eval_dataloader, test_dataloader = dataset.split_train_eval_test(**data_split, batch_size=batch_size)

In [10]:
# Train the classifier, reading inputs from the 'pos' key and targets from the
# 'bias_label' key of each batch.
# NOTE(review): 0.42 is a hard-coded threshold with no recorded justification
# (presumably the decision cut-off) — confirm and consider moving it to the config.
train_accs, evaluations = classification_experiment.train_model(
    train_dataloader,
    eval_dataloader,
    input_key='pos',
    label_key='bias_label',
    threshold=0.42,
)

HBox(children=(IntProgress(value=0, description='epoch training', max=200, style=ProgressStyle(description_wid…



In [11]:
# Reduce every entry of the two training histories to a single value by
# applying `average_data` to each entry in order.
avg_train_acc = list(map(average_data, train_accs))
avg_predictions = list(map(average_data, evaluations))

In [12]:
# Summarise the averaged training accuracies (the "accuracy" argument
# presumably names the metric to read — confirm against `get_statistics`).
# NOTE(review): this rebinds `avg_train_acc` (the list built in the previous
# cell) to the third value returned by `get_statistics`, so re-running this
# cell on its own feeds that returned value back in instead of the list.
min_train_acc, max_train_acc, avg_train_acc = get_statistics(avg_train_acc, "accuracy")

In [13]:
# Maximum value returned by `get_statistics` for the training accuracies.
print(max_train_acc)

0.831896551724138


In [14]:
# Same summary for the averaged evaluation results (the "auc" argument
# presumably selects the AUC metric — confirm against `get_statistics`).
min_auc, max_auc, avg_auc = get_statistics(avg_predictions, "auc")

In [15]:
# Maximum value returned by `get_statistics` for the evaluation results.
print(max_auc)

0.8949513709974236


In [16]:
# Minimum value returned by `get_statistics` for the evaluation results.
print(min_auc)

0.823951002944424


Running inference to create labels for large dataset
-----

In [30]:
# Path stored under 'full_data' in the intermediary-task config.
data_path = params.intermediary_task['task_specific_params']['full_data']
# Same dataset builder as for the small dataset, but pointed at the full data
# and with `labels_path=None` (presumably because this data has no gold bias
# labels — confirm).
full_dataset = prepare_model.initialize_dataset(params.intermediary_task, data_path=data_path, labels_path=None)

06/05/2019 19:55:42 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ../../../tasks/bias_classification/results/cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
53803it [00:08, 6171.67it/s]


In [31]:
# `indices` is read before any example is removed below; it is re-read in the
# next cell once the dataset has been filtered.
indices = full_dataset.get_val('index')
intermediary_labels = full_dataset.get_val('pre_tok_label_ids')

# For each example, find the position of the first element equal to 1 in its
# flattened (integer-cast) label tensor. Examples that contain no 1 are
# recorded by enumeration position in `skip_indices` and removed from the
# dataset afterwards.
skip_indices = []
bias_indices = []
for i, label in enumerate(intermediary_labels):
    try:
        bias_indices.append(label.to(torch.int).flatten().tolist().index(1))
    except ValueError:
        # `list.index` raises ValueError when no element equals 1. Only that
        # case is skipped; any other failure (wrong type, device error, ...)
        # now propagates instead of being silently swallowed by a bare `except`.
        skip_indices.append(i)
full_dataset.remove_indices(skip_indices)

In [32]:
# Same config entry ('full_data') that `data_path` was read from when the full
# dataset was built; note this rebinds `path_label_data` from the small-dataset path.
path_label_data = params.intermediary_task['task_specific_params']['full_data']
full_sentence_toks = prepare_model.get_sample_toks(path_label_data)
# Re-read the 'index' values now that examples have been removed from `full_dataset`.
indices = full_dataset.get_val('index')

53803it [00:00, 107604.20it/s]


In [33]:
# Spot-check one tokenised sentence. 31774 is a hard-coded position,
# presumably chosen ad hoc while debugging.
print(full_sentence_toks[31774])

['some', 'other', 'interesting', 'studies', 'have', 'been', 'done', 'that', 'show', 'the', 'possibility', 'of', 'handed', '##ness', 'occurring', 'as', 'early', 'as', 'in', 'the', 'womb', 'which', 'would', 'indicate', 'a', 'biological', 'process', '.']


In [34]:
# Build the POS feature matrix for the full dataset, passing the POS labels
# collected on the small dataset as `valid_pos_label` (presumably so both
# matrices share the same columns — confirm). Without `return_pos_list`, the
# second returned value is a list of entries to drop.
pos_matrix, skip_indices = sentence_to_POS_matrix(
    full_sentence_toks,
    bias_indices,
    indices,
    valid_pos_label=valid_pos_labels,
)
# Remove the entries the helper flagged from the dataset.
full_dataset.remove_indices(skip_indices)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [22]:
# Sanity check: dimensions of the full-dataset POS matrix.
print(pos_matrix.shape)

(51444, 18)


In [23]:
# Attach the POS matrix to the full dataset under "pos", the key used as
# `input_key` for inference below.
full_dataset.add_data(pos_matrix, "pos")

In [24]:
# Dataloader over the full dataset with a hard-coded batch size of 64
# (independent of the `batch_size` read from the config for training).
dataloader = full_dataset.return_dataloader(batch_size=64)

In [25]:
# Keyword arguments for inference: inputs come from the "pos" key; the label
# key is the empty string (presumably because this dataset has no gold labels
# — confirm how `run_inference` treats an empty key).
keys = {"input_key":"pos", "label_key":""}
# NOTE(review): `evaluations` rebinds the name that held the training-time
# evaluations returned by `train_model`.
predictions, evaluations = classification_experiment.run_inference(dataloader, **keys)

In [26]:
# `np` was never imported explicitly in this notebook; it was only in scope if
# the wildcard `from src.utils import *` happened to leak it. Import it here so
# the cell does not depend on that side effect.
import numpy as np

# Convert the inference outputs to a NumPy array.
predictions_array = np.asarray(predictions)

In [35]:
# NOTE(review): this stores `pos_matrix` (the POS feature matrix) under the key
# "pos_weak_labels". The outputs of `run_inference` (`predictions` /
# `predictions_array`) are not saved anywhere in the visible cells — confirm
# whether the model predictions were meant to be stored here instead.
full_dataset.add_data(pos_matrix, "pos_weak_labels")

In [36]:
# Display the keys now present in the full dataset (shown as the cell output).
full_dataset.get_key_names()

dict_keys(['pre_ids', 'masks', 'pre_lens', 'post_in_ids', 'post_out_ids', 'pre_tok_label_ids', 'post_tok_label_ids', 'rel_ids', 'pos_ids', 'categories', 'index', 'pos_weak_labels'])

In [37]:
import pickle

# Serialise the full dataset to the file "pos_weak_labels" in the working
# directory. The `with` block guarantees the file is flushed and closed even if
# pickling fails; the original passed an unclosed `open(...)` handle straight
# to `pickle.dump`.
with open("pos_weak_labels", 'wb') as weak_label_file:
    pickle.dump(full_dataset, weak_label_file)