# Imports and path setup




In [None]:
!git clone https://github.com/SIDN-IAP/global-model-repr.git tutorial_code
!pip install transformers==2.1
!pip install spacy ftfy==4.4.3
!python -m spacy download en

import torch
from transformers import BertTokenizer, BertModel
import numpy as np
import sys
# sys.path.append('global-model-repr/')
sys.path.append('/content/tutorial_code')
# sys.path.append('..')
from probing.utils import get_sentence_repr, get_model_and_tokenizer, get_pos_data

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU
# and remind the user how to get a GPU runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("Change runtime type to include a GPU.")
print("device:", device)

Cloning into 'tutorial_code'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (277/277), done.[K
remote: Compressing objects: 100% (205/205), done.[K
remote: Total 277 (delta 131), reused 201 (delta 70), pack-reused 0[K
Receiving objects: 100% (277/277), 1.19 MiB | 2.45 MiB/s, done.
Resolving deltas: 100% (131/131), done.
Collecting transformers==2.1
[?25l  Downloading https://files.pythonhosted.org/packages/5f/e5/4fb8a6215608c4036b6dd16613268a4b8958c20e4249d141e621e7f2e146/transformers-2.1.0-py3-none-any.whl (313kB)
[K     |████████████████████████████████| 317kB 8.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 21.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5

device: cuda


# Get data for part-of-speech tagging
A probing experiment requires supervised data with linguistic annotation for the property we wish to study. We will use part-of-speech (POS) tagging, a classical problem in NLP. We will use (a portion of) the English Web dependency treebank from the Universal Dependencies project (https://universaldependencies.org/). The dataset comes with POS information, morphological features (tense, gender, number, etc.), and dependency labels (subject, object, etc.), so it can be used to study various aspects of language.

In [None]:
# Load the POS-tagging data from the cloned tutorial repository (Colab path).
# Two of the returned values are discarded here; label2index is used below
# only for its size.
# NOTE(review): frac=0.1 presumably keeps 10% of the treebank -- confirm in
# probing.utils.get_pos_data.
train_sentences, train_labels, test_sentences, test_labels, _, _, label2index = get_pos_data("/content/tutorial_code/probing", frac=0.1)
# Alternative when running from inside a local checkout instead of Colab:
# train_sentences, train_labels, test_sentences, test_labels, _, _, label2index = get_pos_data("../probing", frac=0.1)
# Number of distinct labels = output dimensionality of the probing classifiers.
num_labels = len(label2index)
print("Training sentences:", len(train_sentences), "Test sentences:", len(test_sentences))
print("Unique labels:", num_labels)

Training sentences: 1254 Test sentences: 208
Unique labels: 17


# Set up model
A probing experiment also requires a probing model, also known as an auxiliary classifier. Here we define a simple linear classifier, which takes a word representation as input and applies a linear transformation to map it to the label space.

We also need a pre-trained deep neural network to study. We will use the popular BERT model (https://www.aclweb.org/anthology/N19-1423.pdf), available via the HuggingFace Transformers library (https://huggingface.co/transformers/). The library provides a number of other models that you can easily experiment with thanks to the unified API.

In [None]:
class Classifier(torch.nn.Module):
    """Linear probe: one affine map from a representation to label scores."""

    def __init__(self, input_dim, output_dim):
        """Create the probe.

        Args:
            input_dim: size of each input representation vector.
            output_dim: number of labels to score.
        """
        super().__init__()
        # The probe's only trainable component.
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, input):
        """Return unnormalised label scores for ``input``."""
        return self.linear(input)


class NonlinearClassifier(torch.nn.Module):
    """Non-linear probe: one hidden layer (same width as the input) with ReLU."""

    def __init__(self, input_dim, output_dim):
        """Create the probe.

        Args:
            input_dim: size of each input representation vector; also used
                as the hidden-layer width.
            output_dim: number of labels to score.
        """
        super().__init__()
        hidden_dim = input_dim
        # Layers are created in this order so that parameter initialisation
        # under a fixed seed stays the same.
        self.input2hidden = torch.nn.Linear(input_dim, hidden_dim)
        self.hidden2output = torch.nn.Linear(hidden_dim, output_dim)
        self.relu = torch.nn.ReLU()

    def forward(self, input):
        """Return unnormalised label scores for ``input``."""
        return self.hidden2output(self.relu(self.input2hidden(input)))


def build_classifier(emb_dim, num_labels, device='cpu'):
    """Create a fresh linear probe with its loss and optimizer.

    Args:
        emb_dim: dimensionality of the input representations.
        num_labels: number of output labels.
        device: device the probe and loss are moved to.

    Returns:
        Tuple ``(classifier, criterion, optimizer)``: a ``Classifier``, a
        cross-entropy loss, and an Adam optimizer (default settings) over
        the classifier's parameters.
    """
    probe = Classifier(emb_dim, num_labels)
    probe = probe.to(device)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)
    return probe, loss_fn, torch.optim.Adam(probe.parameters())


def build_nonlinear_classifier(emb_dim, num_labels, device='cpu'):
    """Create a fresh one-hidden-layer probe with its loss and optimizer.

    Args:
        emb_dim: dimensionality of the input representations.
        num_labels: number of output labels.
        device: device the probe and loss are moved to.

    Returns:
        Tuple ``(classifier, criterion, optimizer)``: a
        ``NonlinearClassifier``, a cross-entropy loss, and an Adam optimizer
        (default settings) over the classifier's parameters.
    """
    probe = NonlinearClassifier(emb_dim, num_labels)
    probe = probe.to(device)
    loss_fn = torch.nn.CrossEntropyLoss().to(device)
    return probe, loss_fn, torch.optim.Adam(probe.parameters())


# Name of the pre-trained model to probe.
model_name = 'bert-base-cased'
# Load the model and its tokenizer through the tutorial helper; the helper
# also returns `sep` and the representation size `emb_dim`.
# NOTE(review): `sep` is only passed through to get_sentence_repr in this
# notebook -- see probing.utils for its meaning.
model, tokenizer, sep, emb_dim = get_model_and_tokenizer(model_name, device)
# Build the linear probe (plus its loss and optimizer) used in Experiment 1.
classifier, criterion, optimizer = build_classifier(emb_dim, num_labels, device)

100%|██████████| 313/313 [00:00<00:00, 191512.35B/s]
100%|██████████| 435779157/435779157 [00:15<00:00, 28588841.32B/s]
100%|██████████| 213450/213450 [00:00<00:00, 835455.57B/s]


In [None]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
print(classifier)

Classifier(
  (linear): Linear(in_features=768, out_features=17, bias=True)
)


# Train
Given a pre-trained model, a probing classifier, and supervised linguistic annotations, we can run a probing experiment. First, we'll define a training function that trains the classifier on the linguistic annotations. This is a simple implementation, but one could implement various checks like early stopping on a development set, etc.

In [None]:
def train(num_epochs, train_representations, train_labels,
          model, tokenizer, sep, model_name, device,
          classifier, criterion, optimizer, batch_size=32):
    """Train a probing classifier on pre-computed representations.

    Args:
        num_epochs: number of passes over the training data.
        train_representations: tensor of inputs; dimension 0 indexes examples.
        train_labels: tensor of gold label indices aligned with
            ``train_representations`` along dimension 0.
        model, tokenizer, sep, model_name, device: not used by this function;
            kept so existing call sites keep working.
        classifier: the probe being trained (updated in place).
        criterion: loss function called as ``criterion(scores, labels)``.
        optimizer: optimizer over the classifier's parameters.
        batch_size: number of examples per optimisation step.

    Returns:
        Tuple ``(loss, accuracy)`` for the final epoch, both Python floats:
        the average loss per example and the fraction of correct predictions.
        Both are measured while the weights are still being updated.
        Returns ``(0.0, 0.0)`` when there are no epochs or no examples.
    """
    num_total = train_representations.shape[0]
    # A 'mean'-reduced criterion returns the batch average, so it has to be
    # scaled back to a batch sum before dividing by the number of examples.
    # Otherwise the reported loss depends on batch_size.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'

    # Defined up front so that num_epochs == 0 returns cleanly.
    total_loss = 0.
    num_correct = 0.
    for _ in range(num_epochs):
        total_loss = 0.
        num_correct = 0.
        for start in range(0, num_total, batch_size):
            batch_repr = train_representations[start: start + batch_size]
            batch_labels = train_labels[start: start + batch_size]

            optimizer.zero_grad()

            out = classifier(batch_repr)
            pred = out.max(1)[1]
            num_correct += pred.long().eq(batch_labels.long()).cpu().sum().item()

            loss = criterion(out, batch_labels)
            batch_loss = loss.item()
            if mean_reduced:
                batch_loss *= batch_labels.shape[0]
            total_loss += batch_loss

            loss.backward()
            optimizer.step()

    # Avoid ZeroDivisionError on empty input.
    denominator = max(num_total, 1)
    return total_loss / denominator, num_correct / denominator

# Evaluate
Given the trained classifier, we'll evaluate its performance on the test set.

In [None]:
def evaluate(test_representations, test_labels,
             model, tokenizer, sep, model_name, device,
             classifier, criterion, batch_size=32):
    """Score a trained probing classifier on held-out representations.

    Args:
        test_representations: tensor of inputs; dimension 0 indexes examples.
        test_labels: tensor of gold label indices aligned with
            ``test_representations`` along dimension 0.
        model, tokenizer, sep, model_name, device: not used by this function;
            kept so existing call sites keep working.
        classifier: the trained probe.
        criterion: loss function called as ``criterion(scores, labels)``.
        batch_size: number of examples scored per forward pass.

    Returns:
        Tuple ``(loss, accuracy)``, both Python floats: the average loss per
        example and the fraction of correct predictions. Returns
        ``(0.0, 0.0)`` when there are no examples.
    """
    num_total = test_representations.shape[0]
    # A 'mean'-reduced criterion returns the batch average, so it has to be
    # scaled back to a batch sum before dividing by the number of examples.
    # Otherwise the reported loss depends on batch_size.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'

    num_correct = 0.
    total_loss = 0.
    with torch.no_grad():
        for start in range(0, num_total, batch_size):
            batch_repr = test_representations[start: start + batch_size]
            batch_labels = test_labels[start: start + batch_size]

            out = classifier(batch_repr)
            pred = out.max(1)[1]
            num_correct += pred.long().eq(batch_labels.long()).cpu().sum().item()

            # .item() keeps the running total a Python float instead of a
            # tensor living on the compute device.
            batch_loss = criterion(out, batch_labels).item()
            if mean_reduced:
                batch_loss *= batch_labels.shape[0]
            total_loss += batch_loss

    # Avoid ZeroDivisionError on empty input.
    denominator = max(num_total, 1)
    return total_loss / denominator, num_correct / denominator

# Generate representations with pretrained model
Here we collect representations from the pre-trained model. We also apply a few data transformations for convenience.
The end result is `train_representations_all` (and likewise `test_representations_all`), a list of tensors, where each tensor holds the representations from one layer of the deep model. Each tensor has dimensions (number of words in the corpus) x (representation dimensionality).

In [None]:
# Run every sentence through the pre-trained model.
# Resulting nesting: top-level list: sentences, second-level lists: layers,
# third level: arrays of num_words x representation_dim.
# NOTE(review): the per-layer items are fed to np.concatenate below, so they
# are presumably NumPy arrays -- confirm in probing.utils.get_sentence_repr.
train_sentence_representations = [get_sentence_repr(sentence, model, tokenizer, sep, model_name, device)
                                  for sentence in train_sentences]
test_sentence_representations = [get_sentence_repr(sentence, model, tokenizer, sep, model_name, device)
                                  for sentence in test_sentences]

# Transpose the nesting: top-level list: layers, second-level lists: sentences
train_sentence_representations = [list(l) for l in zip(*train_sentence_representations)]
test_sentence_representations = [list(l) for l in zip(*test_sentence_representations)]

# For each layer, concatenate all word representations along axis 0 into one
# tensor and move it to the compute device.
train_representations_all = [torch.tensor(np.concatenate(train_layer_representations, 0)).to(device) for train_layer_representations in train_sentence_representations]
test_representations_all = [torch.tensor(np.concatenate(test_layer_representations, 0)).to(device) for test_layer_representations in test_sentence_representations]
# Concatenate the per-sentence labels the same way so they stay aligned
# row-for-row with the representations.
train_labels_all = torch.tensor(np.concatenate(train_labels, 0)).to(device)
test_labels_all = torch.tensor(np.concatenate(test_labels, 0)).to(device)

# Experiment 1: Evaluate representation for POS quality
In this experiment, we train and evaluate a classifier on the top-level representations of BERT on the task of POS tagging. The test accuracy can be thought of as a measure of the quality of the representations for the POS property.

In [None]:
# Take final layer representations (last entry of the per-layer lists)
train_representations = train_representations_all[-1]
test_representations = test_representations_all[-1]

# Train for 10 epochs the linear probe built in the "Set up model" cell.
# train() returns the statistics of the final epoch only.
train_loss, train_accuracy = train(10, train_representations, train_labels_all,
          model, tokenizer, sep, model_name, device,
          classifier, criterion, optimizer)
# Evaluate the trained probe on the held-out test representations.
test_loss, test_accuracy = evaluate(test_representations, test_labels_all,
         model, tokenizer, sep, model_name, device,
         classifier, criterion)
print("Train accuracy: {}, Test accuracy: {}".format(train_accuracy, test_accuracy))

Train accuracy: 0.9272774422998555, Test accuracy: 0.9177355152587351


# Experiment 2: Compare representation quality across layers
One of the major questions in neural network interpretability is how information is organized in different parts of the deep model, such as its layers. Here we train and evaluate a separate classifier for each layer. Notice the test accuracy results for this task, and how deeper is not always better in our case.

In [None]:
# One entry per layer in the representation lists.
num_layers = len(train_representations_all)
# Per-layer accuracies of the linear probe, indexed by layer.
train_accs, test_accs = [], []
for l in range(num_layers):
    # build new classifier for every layer experiment, so no weights carry
    # over from one layer to the next
    classifier, criterion, optimizer = build_classifier(emb_dim, num_labels, device)
    # get layer representation
    train_representations = train_representations_all[l]
    test_representations = test_representations_all[l]

    # train for 2 epochs (fewer than the 10 used in Experiment 1)
    train_loss, train_accuracy = train(2, train_representations, train_labels_all,
          model, tokenizer, sep, model_name, device,
          classifier, criterion, optimizer)
    train_accs.append(train_accuracy)
    # test
    test_loss, test_accuracy = evaluate(test_representations, test_labels_all,
         model, tokenizer, sep, model_name, device,
         classifier, criterion)
    test_accs.append(test_accuracy)
    print("layer: {}, train accuracy: {}, test accuracy: {}".format(l, train_accuracy, test_accuracy))

layer: 0, train accuracy: 0.8641499648056904, test accuracy: 0.8474126492702344
layer: 1, train accuracy: 0.8840810580520876, test accuracy: 0.8774878372401592
layer: 2, train accuracy: 0.9236839180528285, test accuracy: 0.9294559929234851
layer: 3, train accuracy: 0.9293891008780054, test accuracy: 0.9294559929234851
layer: 4, train accuracy: 0.9329085318415885, test accuracy: 0.9296771340114993
layer: 5, train accuracy: 0.932204645648872, test accuracy: 0.9294559929234851
layer: 6, train accuracy: 0.9317971325899307, test accuracy: 0.9310039805395842
layer: 7, train accuracy: 0.9313896195309895, test accuracy: 0.9279080053073862
layer: 8, train accuracy: 0.9243878042455451, test accuracy: 0.9181777974347634
layer: 9, train accuracy: 0.9150150038898974, test accuracy: 0.9117647058823529
layer: 10, train accuracy: 0.9038639647315971, test accuracy: 0.9035824856258293
layer: 11, train accuracy: 0.8942688845256177, test accuracy: 0.8965059708093763
layer: 12, train accuracy: 0.8640017782

# Experiment 3: Non-linear classifier
Does the probing accuracy depend on the probing model? We have previously trained a linear probing classifier. Here we train a non-linear classifier with one hidden layer. Does the layer-wise pattern change with a different probing model? What does this tell us about the information encoded in the model's internal representations?

In [None]:
num_layers = len(train_representations_all)
# NOTE: this rebinds train_accs / test_accs, discarding the linear-probe
# results from Experiment 2. The selectivity cell at the end of the notebook
# reads test_accs, so it uses the non-linear accuracies collected here.
train_accs, test_accs = [], []
for l in range(num_layers):
    # build a fresh non-linear classifier for every layer
    classifier, criterion, optimizer = build_nonlinear_classifier(emb_dim, num_labels, device)
    # get layer representation
    train_representations = train_representations_all[l]
    test_representations = test_representations_all[l]

    # train for 2 epochs, matching Experiment 2
    train_loss, train_accuracy = train(2, train_representations, train_labels_all,
          model, tokenizer, sep, model_name, device,
          classifier, criterion, optimizer)
    train_accs.append(train_accuracy)
    # test
    test_loss, test_accuracy = evaluate(test_representations, test_labels_all,
         model, tokenizer, sep, model_name, device,
         classifier, criterion)
    test_accs.append(test_accuracy)
    print("layer: {}, train accuracy: {}, test accuracy: {}".format(l, train_accuracy, test_accuracy))

layer: 0, train accuracy: 0.8708183603156374, test accuracy: 0.8485183547103051
layer: 1, train accuracy: 0.8813396065646649, test accuracy: 0.8606811145510835
layer: 2, train accuracy: 0.9305005001296632, test accuracy: 0.9279080053073862
layer: 3, train accuracy: 0.9367613825806691, test accuracy: 0.9281291463954002
layer: 4, train accuracy: 0.9393176008594821, test accuracy: 0.9303405572755418
layer: 5, train accuracy: 0.9420960989886267, test accuracy: 0.9281291463954002
layer: 6, train accuracy: 0.9442818508502204, test accuracy: 0.9336576735957541
layer: 7, train accuracy: 0.9459859963694291, test accuracy: 0.9360902255639098
layer: 8, train accuracy: 0.9441336642833327, test accuracy: 0.9307828394515701
layer: 9, train accuracy: 0.9365020560886156, test accuracy: 0.9294559929234851
layer: 10, train accuracy: 0.92909272774423, test accuracy: 0.9170720919946926
layer: 11, train accuracy: 0.9192753676879191, test accuracy: 0.8909774436090225
layer: 12, train accuracy: 0.89930722779

# Experiment 4: Control labels

In this experiment we test how much of the good performance in Experiments 2 and 3 actually comes from what the model being probed has learned about POS, and how much of it just comes from the probe model. To test this, we use a method from Hewitt and Liang (https://arxiv.org/pdf/1909.03368.pdf). We make a <i>control task</i> which is unrelated to the POS task and run the same probing procedure on the control task. We then measure the <i>selectivity</i> of each layer: the difference between its probed accuracy on the POS task and on the control task. If a layer has learned substantial things about the POS task in particular, it should be much better at the POS task than at the control task; i.e. it should have high selectivity.

Following Hewitt and Liang, we use the following control task for POS tagging. Each word identity will be assigned a random POS tag, with the distribution of POS tags weighted according to their actual appearance. Each word identity will always have the same tag every time it appears. We then train and test the layers on predicting this tag from the embedding. Note that this tag is a deterministic function of the word identity, so high selectivity means the embedding actually has forgotten something about the word identity.

How are the selectivity results different from the previous accuracy results? What does this tell us about the model's internal representations?


In [None]:
import random

# Fixed seed so the random control task, and the selectivity numbers derived
# from it, are the same on every run.
CONTROL_SEED = 0
control_rng = random.Random(CONTROL_SEED)

# Every distinct word that occurs in the training or test sentences.
vocabulary = set(
    word
      for sentence in (train_sentences + test_sentences)
      for word in sentence
)
# Flat list of gold training labels. Choosing uniformly from it draws each
# tag in proportion to how often it actually occurs.
all_labels = train_labels_all.tolist()
# Iterate the vocabulary in sorted order: set iteration order for strings
# changes between Python processes (hash randomisation), which would pair
# words with different random draws even under a fixed seed.
control_map = {word: control_rng.choice(all_labels) for word in sorted(vocabulary)}

# One control label per word, flattened in corpus order so the labels line up
# row-for-row with the concatenated representations.
control_train_labels = torch.tensor(
    [control_map[word] for sentence in train_sentences for word in sentence]
).to(device)
control_test_labels = torch.tensor(
    [control_map[word] for sentence in test_sentences for word in sentence]
).to(device)


In [None]:
num_layers = len(train_representations_all)
# Per-layer accuracies on the control task, indexed by layer.
control_train_accs, control_test_accs = [], []
for l in range(num_layers):
    # Same non-linear probe as in Experiment 3, so that the POS and control
    # accuracies are obtained with the same probing model.
    classifier, criterion, optimizer = build_nonlinear_classifier(emb_dim, num_labels, device)
    # get layer representation
    train_representations = train_representations_all[l]
    test_representations = test_representations_all[l]

    # train on the control labels instead of the gold POS labels
    train_loss, train_accuracy = train(2, train_representations, control_train_labels,
          model, tokenizer, sep, model_name, device,
          classifier, criterion, optimizer)
    control_train_accs.append(train_accuracy)
    # test against the control labels of the test sentences
    test_loss, test_accuracy = evaluate(test_representations, control_test_labels,
         model, tokenizer, sep, model_name, device,
         classifier, criterion)
    control_test_accs.append(test_accuracy)
    print("layer: {}, train accuracy: {}, test accuracy: {}".format(l, train_accuracy, test_accuracy))

layer: 0, train accuracy: 0.860223020783166, test accuracy: 0.8951791242812914
layer: 1, train accuracy: 0.8554069573593154, test accuracy: 0.8834586466165414
layer: 2, train accuracy: 0.8440706849924055, test accuracy: 0.8761609907120743
layer: 3, train accuracy: 0.832956692475827, test accuracy: 0.8735072976559045
layer: 4, train accuracy: 0.8150631645241359, test accuracy: 0.8509509066784608
layer: 5, train accuracy: 0.7936872522505835, test accuracy: 0.8321539141972578
layer: 6, train accuracy: 0.769273515355833, test accuracy: 0.8018575851393189
layer: 7, train accuracy: 0.7430074463749861, test accuracy: 0.7801857585139319
layer: 8, train accuracy: 0.713221946430556, test accuracy: 0.7463511720477665
layer: 9, train accuracy: 0.6814359278331419, test accuracy: 0.7016806722689075
layer: 10, train accuracy: 0.6490942096098988, test accuracy: 0.6700574966828837
layer: 11, train accuracy: 0.6296817693476087, test accuracy: 0.6306943830163645
layer: 12, train accuracy: 0.5910421220316

In [None]:
# Selectivity per layer = POS test accuracy minus control-task test accuracy.
# test_accs holds whatever the most recently run POS experiment stored; when
# the notebook is run top to bottom that is the non-linear probe of
# Experiment 3.
for l in range(num_layers):
    print("layer: {}, test selectivity: {}".format(l, test_accs[l] - control_test_accs[l]))

layer: 0, test selectivity: -0.046660769570986305
layer: 1, test selectivity: -0.02277753206545785
layer: 2, test selectivity: 0.051747014595311835
layer: 3, test selectivity: 0.054621848739495715
layer: 4, test selectivity: 0.07938965059708092
layer: 5, test selectivity: 0.09597523219814241
layer: 6, test selectivity: 0.1318000884564352
layer: 7, test selectivity: 0.15590446704997785
layer: 8, test selectivity: 0.18443166740380368
layer: 9, test selectivity: 0.22777532065457762
layer: 10, test selectivity: 0.24701459531180892
layer: 11, test selectivity: 0.26028306059265804
layer: 12, test selectivity: 0.29301194161875277
