# Imports and path setup




In [None]:
!git clone https://github.com/SIDN-IAP/global-model-repr.git tutorial_code
!pip install transformers==2.1
!pip install spacy ftfy==4.4.3
!python -m spacy download en

import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import sys
# sys.path.append('global-model-repr/')
sys.path.append('/content/tutorial_code')
# sys.path.append('..')
from probing.utils import get_sentence_repr, get_model_and_tokenizer, get_pos_data

# Pick the compute device: CUDA when it is available, otherwise the CPU
# (with a reminder to switch the runtime to one that has a GPU).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    print("Change runtime type to include a GPU.")
print("device:", device)

fatal: destination path 'tutorial_code' already exists and is not an empty directory.
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
device: cuda


# Get data for part-of-speech tagging
A probing experiment requires supervised data with linguistic annotation for the property we wish to study. We will use part-of-speech (POS) tagging, a classical problem in NLP. We will use (a portion of) the English Web dependency treebank from the Universal Dependencies project (https://universaldependencies.org/). The dataset comes with POS information, morphological features (tense, gender, number, etc.), and dependency labels (subject, object, etc.), so it can be used to study various aspects of language.

In this exercise, we will use the POS information, available via `get_pos_data`.

In [None]:
# Load the POS-tagging data from the cloned tutorial repository.
# NOTE(review): frac=0.1 presumably keeps a 10% portion of the treebank -- confirm in get_pos_data.
# The two values bound to `_` are not used in this notebook.
train_sentences, train_labels, test_sentences, test_labels, _, _, label2index = get_pos_data("/content/tutorial_code/probing", frac=0.1)
# Alternative call for a local checkout (relative path instead of /content):
# train_sentences, train_labels, test_sentences, test_labels, _, _, label2index = get_pos_data("../probing", frac=0.1)
# Number of entries in the label -> index mapping; used as the classifier's output size.
num_labels = len(label2index)
print("Training sentences:", len(train_sentences), "Test sentences:", len(test_sentences))
print("Unique labels:", num_labels)

Training sentences: 1254 Test sentences: 208
Unique labels: 17


# Set up model
A probing experiment also requires a probing model, also known as an auxiliary classifier. Here we define a simple linear classifier, which takes a word representation as input and applies a linear transformation to map it to the label space.

We also need a pre-trained deep neural network to study. We will use the popular BERT model (https://www.aclweb.org/anthology/N19-1423.pdf), available via the HuggingFace Transformers library (https://huggingface.co/transformers/). The library provides a number of other models that you can easily experiment with thanks to the unified API.

In [None]:
class Classifier(torch.nn.Module):
    """Linear probe: a single affine map from a representation to label scores."""

    def __init__(self, input_dim, output_dim):
        """Create the probe.

        Args:
            input_dim: size of each input representation vector.
            output_dim: number of output scores produced per input.
        """
        super().__init__()
        self.linear = torch.nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, input):
        """Apply the linear layer to `input` and return the resulting scores."""
        return self.linear(input)


class NonlinearClassifier(torch.nn.Module):
    """Placeholder for the non-linear probing classifier.

    Both methods are intentionally left unimplemented and raise
    `NotImplementedError` until they are filled in.
    """

    def __init__(self, input_dim, output_dim):
        """Build the classifier mapping `input_dim` features to `output_dim` scores.

        Raises:
            NotImplementedError: always, until this method is implemented.
        """
        # The exception must be *raised*: a bare `NotImplementedError` expression
        # is a no-op, which would hand back a module whose torch.nn.Module state
        # was never initialized and that only fails later with an unrelated error.
        raise NotImplementedError(
            "NonlinearClassifier.__init__ has not been implemented yet.")

    def forward(self, input):
        """Map `input` representations to label scores.

        Raises:
            NotImplementedError: always, until this method is implemented.
        """
        raise NotImplementedError(
            "NonlinearClassifier.forward has not been implemented yet.")


def build_classifier(emb_dim, num_labels, device='cpu'):
    """Assemble a linear probe together with its loss and optimizer.

    Args:
        emb_dim: dimensionality of the input representations.
        num_labels: number of output classes.
        device: device the classifier and loss module are moved to.

    Returns:
        Tuple ``(classifier, criterion, optimizer)``: a `Classifier`, a
        cross-entropy loss, and an Adam optimizer (default settings) over the
        classifier's parameters.
    """
    probe = Classifier(emb_dim, num_labels)
    probe = probe.to(device)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)
    adam = torch.optim.Adam(probe.parameters())
    return probe, loss_fn, adam


def build_nonlinear_classifier(emb_dim, num_labels, device='cpu'):
    """Assemble a non-linear probe together with its loss and optimizer.

    Args:
        emb_dim: dimensionality of the input representations.
        num_labels: number of output classes.
        device: device the classifier and loss module are moved to.

    Returns:
        Tuple ``(classifier, criterion, optimizer)``: a `NonlinearClassifier`,
        a cross-entropy loss, and an Adam optimizer (default settings) over
        the classifier's parameters.
    """
    probe = NonlinearClassifier(emb_dim, num_labels)
    probe = probe.to(device)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)
    adam = torch.optim.Adam(probe.parameters())
    return probe, loss_fn, adam


# Identifier of the pre-trained model to probe.
model_name = 'bert-base-cased'
# get model and tokenizer from Transformers
# (the helper also returns `sep` and `emb_dim`; `emb_dim` sizes the classifier input below)
model, tokenizer, sep, emb_dim = get_model_and_tokenizer(model_name, device)
# build classifier
# (linear probe with emb_dim inputs and num_labels outputs, plus its loss and optimizer)
classifier, criterion, optimizer = build_classifier(emb_dim, num_labels, device)

100%|██████████| 313/313 [00:00<00:00, 214127.74B/s]
100%|██████████| 435779157/435779157 [00:05<00:00, 79252657.99B/s]
100%|██████████| 213450/213450 [00:00<00:00, 6013758.14B/s]


In [None]:
# Show the module structure of the pre-trained model.
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Show the layers of the probing classifier.
print(classifier)

Classifier(
  (linear): Linear(in_features=768, out_features=17, bias=True)
)


# Train
Given a pre-trained model, a probing classifier, and supervised linguistic annotations, we can run a probing experiment. First, we'll define a training function that trains the classifier on the linguistic annotations. This is a simple implementation, but one could implement various checks like early stopping on a development set, etc.

In [None]:
def train(num_epochs, train_representations, train_labels,
          model, tokenizer, sep, model_name, device,
          classifier, criterion, optimizer, batch_size=32):
    """Train `classifier` on pre-computed representations.

    Batches are taken in order (no shuffling), `batch_size` rows at a time.

    Args:
        num_epochs: number of full passes over the training data.
        train_representations: tensor whose first dimension indexes examples.
        train_labels: tensor of target labels aligned with
            `train_representations` along the first dimension.
        model, tokenizer, sep, model_name, device: unused here; kept so the
            signature stays the same for existing callers.
        classifier: module mapping a batch of representations to scores.
        criterion: loss callable; assumed to return the *mean* loss over the
            batch (the default for torch.nn.CrossEntropyLoss).
        optimizer: optimizer over `classifier`'s parameters.
        batch_size: number of examples per batch.

    Returns:
        Tuple ``(loss, accuracy)`` for the last epoch, both Python floats:
        the per-example average loss and the fraction of correct predictions.
        Both are 0.0 when `num_epochs` is 0.
    """
    num_total = train_representations.shape[0]
    # Defined before the loop so that num_epochs == 0 does not hit unbound
    # locals at the return statement.
    total_loss = 0.
    num_correct = 0.
    for _ in range(num_epochs):
        total_loss = 0.
        num_correct = 0.
        for start in range(0, num_total, batch_size):
            batch_repr = train_representations[start: start + batch_size]
            batch_labels = train_labels[start: start + batch_size]

            optimizer.zero_grad()

            out = classifier(batch_repr)
            pred = out.max(1)[1]
            num_correct += pred.long().eq(batch_labels.long()).cpu().sum().item()
            loss = criterion(out, batch_labels)
            # `loss` is a per-batch mean: weight it by the batch size so that
            # dividing by num_total below yields a true per-example average
            # (the last batch may be smaller than batch_size).
            total_loss += loss.item() * batch_labels.shape[0]

            loss.backward()
            optimizer.step()
    return total_loss / num_total, num_correct / num_total

# Evaluate
Given the trained classifier, we'll evaluate its performance on the test set.

In [None]:
def evaluate(test_representations, test_labels,
             model, tokenizer, sep, model_name, device,
             classifier, criterion, batch_size=32):
    """Evaluate `classifier` on pre-computed representations.

    Runs under `torch.no_grad()`, `batch_size` rows at a time.

    Args:
        test_representations: tensor whose first dimension indexes examples.
        test_labels: tensor of target labels aligned with
            `test_representations` along the first dimension.
        model, tokenizer, sep, model_name, device: unused here; kept so the
            signature stays the same for existing callers.
        classifier: module mapping a batch of representations to scores.
        criterion: loss callable; assumed to return the *mean* loss over the
            batch (the default for torch.nn.CrossEntropyLoss).
        batch_size: number of examples per batch.

    Returns:
        Tuple ``(loss, accuracy)``, both Python floats: the per-example
        average loss and the fraction of correct predictions.
    """
    num_correct = 0.
    num_total = test_representations.shape[0]
    total_loss = 0.
    with torch.no_grad():
        for start in range(0, num_total, batch_size):
            batch_repr = test_representations[start: start + batch_size]
            batch_labels = test_labels[start: start + batch_size]

            out = classifier(batch_repr)
            pred = out.max(1)[1]
            num_correct += pred.long().eq(batch_labels.long()).cpu().sum().item()
            # .item() keeps the accumulator a Python float (as in `train`)
            # instead of a tensor; the batch-size weight turns the per-batch
            # mean into a sum so the division below is a per-example average.
            total_loss += criterion(out, batch_labels).item() * batch_labels.shape[0]

    return total_loss / num_total, num_correct / num_total

# Generate representations with pretrained model
Here we collect representations from the pre-trained model. We also apply a few data transformations for convenience.
The end result is `train_representations_all` (and likewise `test_representations_all`), a list of tensors, where each tensor holds the representations from one layer of the deep model. Each tensor has dimensions (number of words in the corpus) x (representation dimensionality).

Note: The function `get_sentence_repr` hides details about model selection and tokenization that are not important for our purposes, but if you want to experiment with other models you may need to change its behavior.

In [None]:
# top-level list: sentences, second-level lists: layers, third-level tensors of num_words x representation_dim
# Run every sentence through the pre-trained model.
# Nesting at this point: sentences -> layers -> (num_words x representation_dim).
train_sentence_representations = [
    get_sentence_repr(train_sentence, model, tokenizer, sep, model_name, device)
    for train_sentence in train_sentences
]
test_sentence_representations = [
    get_sentence_repr(test_sentence, model, tokenizer, sep, model_name, device)
    for test_sentence in test_sentences
]

# Swap the two outer levels so the nesting becomes layers -> sentences.
train_sentence_representations = list(map(list, zip(*train_sentence_representations)))
test_sentence_representations = list(map(list, zip(*test_sentence_representations)))

# For each layer, join the word representations of all sentences into one
# tensor and move it to `device`.
train_representations_all = [
    torch.tensor(np.concatenate(layer_chunks, axis=0)).to(device)
    for layer_chunks in train_sentence_representations
]
test_representations_all = [
    torch.tensor(np.concatenate(layer_chunks, axis=0)).to(device)
    for layer_chunks in test_sentence_representations
]
# Join the per-sentence labels in the same order.
train_labels_all = torch.tensor(np.concatenate(train_labels, axis=0)).to(device)
test_labels_all = torch.tensor(np.concatenate(test_labels, axis=0)).to(device)

# Experiment 1: Evaluate representation for POS quality
In this experiment, we train and evaluate a classifier on the top-level representations of BERT on the task of POS tagging. The test accuracy can be thought of as a measure of the quality of the representations for the POS property.

In [None]:
# Take final layer representations
# Probe only the last entry of the per-layer representation lists.
train_representations, test_representations = (
    train_representations_all[-1],
    test_representations_all[-1],
)

# Fit the probe for 10 epochs on the training words ...
train_loss, train_accuracy = train(
    10, train_representations, train_labels_all,
    model, tokenizer, sep, model_name, device,
    classifier, criterion, optimizer,
)
# ... then score it on the test words.
test_loss, test_accuracy = evaluate(
    test_representations, test_labels_all,
    model, tokenizer, sep, model_name, device,
    classifier, criterion,
)
print("Train accuracy: {}, Test accuracy: {}".format(train_accuracy, test_accuracy))

Train accuracy: 0.9267587893157485, Test accuracy: 0.9175143741707209


# Experiment 2: Compare representation quality across layers
One of the major questions in neural network interpretability is how information is organized in different parts of the deep model, such as its layers.

**TODO**: Train and evaluate a separate classifier for each layer. Print the test accuracies from all layers. You may also want to store the accuracies in a list `test_accs`, as this will come in handy later.


Notice the test accuracy results for this task, and how deeper is not always better in this case.


**Hints**: You can get the number of layers via `len(train_representations_all)`. Then, loop over layers and get the representations for layer `l` using `train_representations_all[l]` and `test_representations_all[l]`. Remember to build a new classifier for every such experiment using `build_classifier`.


# Experiment 3: Non-linear classifier
Does the probing accuracy depend on the probing model? We have previously trained a linear probing classifier. Next we will train a non-linear classifier with one hidden layer.

**TODO**: Implement the `__init__` and `forward` methods in the `NonlinearClassifier` class above. Then, repeat the layer-wise probing experiment, this time building a non-linear classifier using `build_nonlinear_classifier`.
As before, store the accuracies in a list `test_accs`.

Does the layer-wise pattern change with a different probing model? What does this tell us about the information encoded in the model's internal representations?

**Hints**: It is enough to use a one-hidden layer fully-connected classifier, where the number of hidden units is equal to the input dimensionality. However, you may experiment with other classifiers if you wish.

# Experiment 4: Control labels

In this experiment we test how much of the good performance from Experiments 2 and 3 actually comes from things the pretrained model learned, and how much of it just comes from the probe model. To test this, we use a method from Hewitt and Liang (https://arxiv.org/pdf/1909.03368.pdf). We make a <i>control task</i> which is unrelated to the POS task and do the same probing procedure on the control task. We then measure the <i>selectivity</i> of layers: the difference between their probed accuracy on the POS task and on the control task. If a layer has learned substantial things about the POS task in particular, it should be much better at the POS task than the control task; i.e. it should have high selectivity.

Following Hewitt and Liang, we use the following control task for POS tagging. Each word identity will be assigned a random POS tag, with the distribution of POS tags weighted according to their actual appearance. Each word identity will always have the same tag every time it appears. We then train and test the layers on predicting this tag from the embedding. Note that this tag is a deterministic function of the word identity, so high selectivity means the embedding actually has forgotten something about the word identity.

**TODO**: The code below provides control train and test labels, `control_train_labels` and `control_test_labels`. Use these to run another probing experiment with the non-linear classifier and record the resulting test accuracies in a new list, `control_test_accs`. Then, calculate the selectivity score at every layer as the difference between the accuracy on the POS task and the accuracy on the control task.

How are the selectivity results different from the previous accuracy results? What does this tell us about the model's internal representations?


In [None]:
import random

# Fixed seed and a private generator so the control task (and therefore the
# selectivity scores computed from it) is the same on every run.
CONTROL_SEED = 0
control_rng = random.Random(CONTROL_SEED)

vocabulary = set(
    word
    for sentence in (train_sentences + test_sentences)
    for word in sentence
)
all_labels = train_labels_all.tolist()

# for each word identity, choose a random label, weighted according
# to how often the labels actually appear
# (iterate in sorted order: a set's iteration order is not stable across
# interpreter runs, which would defeat the fixed seed)
control_map = {word: control_rng.choice(all_labels) for word in sorted(vocabulary)}

# assign each word the label according to its identity, flattened over all
# sentences; the explicit integer dtype keeps the labels integral even if a
# sentence contributes no words
control_train_labels = torch.tensor(
    [control_map[word] for sentence in train_sentences for word in sentence],
    dtype=torch.long,
).to(device)
control_test_labels = torch.tensor(
    [control_map[word] for sentence in test_sentences for word in sentence],
    dtype=torch.long,
).to(device)

In [None]:
# Train and evaluate a classifier on the control task at every layer

In [None]:
# Calculate the selectivity score per layer