Use POS as a weak labeling function
------
For a given sentence, we first extract what the first-predicted biased word is. We know what this word will be, since our dataset contains the ground-truth labels for which words were edited for bias. We can thus extract the index of the first biased word, and then do some featurization based on that particular word. 

In this experiment the featurization is a simple POS tag for the predicted biased word.

In [1]:
import sys; sys.path.append("../../../../..")
import torch 
from src.experiment import ClassificationExperiment
from src.dataset import ExperimentDataset
from src.params import Params

%load_ext autoreload
%autoreload 2

In [2]:
params = Params.read_params("experiment_params.json")

In [3]:
# Loading in the dataset that we are using in this experiments 
# typically this dataset is the small set of ground-truth labels
dataset = ExperimentDataset.init_dataset(params.dataset)

03/02/2020 14:36:22 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ./cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
386it [00:00, 5003.67it/s]


In [4]:
# importing the Featurizer created by Pryzant et al.
from src.utils.weak_labeling_utils import get_pos_features

In [16]:
pos_features = get_pos_features(params.dataset, dataset)

03/02/2020 14:38:23 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at ./cache/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [22]:
pos_features.shape

torch.Size([324, 16])

In [25]:
dataset.add_data(pos_features, "pos_features")

##### Important: the number of pos features extracted by the nltk.pos_tag function is dependent on the number of pos unique tags that we observe. Therefore, we have to dynamically resize the final classification model.

In [26]:
num_pos_tags = pos_features.shape[1]
params.final_task['input_dim'] = num_pos_tags
params.final_task['hidden_dim'] = num_pos_tags//2

In [27]:
params.final_task

{'model': 'shallow_nn',
 'input_dim': 16,
 'hidden_dim': 8,
 'output_dim': 1,
 'data_split': {'train_split': 0.7, 'eval_split': 0.3, 'test_split': 0},
 'training_params': {'optimizer': 'adam',
  'loss': 'bce_with_logits',
  'num_epochs': 200,
  'batch_size': 32,
  'lr': 0.001}}

In [28]:
pos_features

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

### This is where the classification experiment starts

In [29]:
classification_experiment = ClassificationExperiment.init_cls_experiment(params.final_task)

In [30]:
from src.utils.classification_utils import run_bootstrapping

In [31]:
statistics = run_bootstrapping(classification_experiment, dataset, params.final_task, num_bootstrap_iters=3, input_key='pos_features', label_key='bias_label', threshold=0.42)

HBox(children=(IntProgress(value=0, description='Cross Validation Iteration', max=3, style=ProgressStyle(descr…

HBox(children=(IntProgress(value=0, description='epochs', max=200, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='epochs', max=200, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='epochs', max=200, style=ProgressStyle(description_width='init…




In [32]:
statistics

{'auc': [(0.8011171705898268, 0.9360292151825087), 0.8684690755595952],
 'accuracy': [(0.7718749999999999, 0.8344812925170069), 0.7997448979591836]}