In [1]:
import os
import torch
import torch.nn.functional as F
from pytorch_transformers import BertTokenizer, cached_path
from training.transformer_utils.model import TransformerWithClfHeadAndAdapters

#### Load model and config dicts

In [2]:
model_path = "models/transformer"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Resolve the two artefacts stored next to each other in the model directory
training_args_file = cached_path(os.path.join(model_path, "model_training_args.bin"))
weights_file = cached_path(os.path.join(model_path, "model_weights.pth"))

# The saved training arguments carry the two config objects the model is built from
config = torch.load(training_args_file)
model = TransformerWithClfHeadAndAdapters(config["config"], config["config_ft"])
model = model.to(device)

# Map the stored weights onto the selected device, then load them into the model
state_dict = torch.load(weights_file, map_location=device)
model.load_state_dict(state_dict)

# Cased BERT tokenizer (lower-casing switched off)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

#### Define special tokens

In [3]:
# Vocabulary IDs of the classifier token and the padding token, in that order
clf_token, pad_token = (tokenizer.vocab[tok] for tok in ('[CLS]', '[PAD]'))

#### Show maximum sequence length for trained model

In [4]:
# Upper bound on sequence length, read from the trained model's configuration
model_config = config['config']
max_length = model_config.num_max_positions
max_length

256

#### Text example

In [5]:
def encode(inputs):
    """Map each token in `inputs` to its ID using the notebook's tokenizer.

    Tokens are converted one at a time via `tokenizer.convert_tokens_to_ids`,
    and the results are returned as a new list in the same order.
    """
    return [tokenizer.convert_tokens_to_ids(token) for token in inputs]

In [6]:
# Single example sentence that is tokenized and classified in the cells below
text = "Effective but too-tepid biopic."
text

'Effective but too-tepid biopic.'

This step converts the text's tokens into IDs that can be used for model training/inference.

In [7]:
# Keep at most max_length - 1 tokens so that the classifier token appended
# below still fits within the model's maximum sequence length.
inputs = tokenizer.tokenize(text)[:max_length - 1]
ids = [*encode(inputs), clf_token]
print(inputs)
print(ids)

['Effect', '##ive', 'but', 'too', '-', 'te', '##pid', 'bio', '##pic', '.']
[27007, 2109, 1133, 1315, 118, 21359, 25786, 25128, 20437, 119, 101]


#### Evaluate
When evaluating the model on a test sample, dropout must be disabled and gradient tracking (needed only for backpropagation) should be turned off.

In [8]:
# Put the model in evaluation mode; the trailing ';' suppresses the cell's output
model.eval(); # Disable dropout

In [9]:
with torch.no_grad():   # inference only, so no gradients are tracked
    tensor = torch.tensor(ids, dtype=torch.long, device=device)
    tensor_reshaped = tensor.unsqueeze(0)          # shape [1, seq length]
    tensor_in = tensor_reshaped.t().contiguous()   # shape [seq length, 1]

    # Boolean masks marking where the classifier token and padding tokens sit
    clf_mask = tensor_in == clf_token
    pad_mask = tensor_reshaped == pad_token

    logits = model(tensor_in, clf_tokens_mask=clf_mask, padding_mask=pad_mask)

The input tensor is reshaped to the layout the model expects, and the classification-token mask and padding mask are built from it and passed to the model alongside it.

In [10]:
# Show the same IDs at each stage: flat vector, row vector, and the column
# vector that is actually fed to the model.
stages = (
    ("ids in tensor form: ", tensor),
    ("tensor reshaped: ", tensor_reshaped),
    ("tensor input to model: ", tensor_in),
)
for caption, stage_tensor in stages:
    print(caption, stage_tensor)

ids in tensor form:  tensor([27007,  2109,  1133,  1315,   118, 21359, 25786, 25128, 20437,   119,
          101])
tensor reshaped:  tensor([[27007,  2109,  1133,  1315,   118, 21359, 25786, 25128, 20437,   119,
           101]])
tensor input to model:  tensor([[27007],
        [ 2109],
        [ 1133],
        [ 1315],
        [  118],
        [21359],
        [25786],
        [25128],
        [20437],
        [  119],
        [  101]])


#### Convert logits to class probabilities
The logits are input to a softmax function, detached from the computation graph, and converted to probabilities stored in a numpy array.

In [11]:
# `logits` holds one row per input example (a single row here), so pick that
# row explicitly. Indexing is used rather than a max-reduction over dim 0,
# which would blend rows together if more than one example were passed, and
# it avoids assigning to `_`, which IPython reserves for the last cell output.
example_logits = logits[0]

# Softmax over the class axis turns the logits into probabilities; the result
# is detached, moved to host memory and exposed as a NumPy array.
val = F.softmax(example_logits, dim=-1).detach().cpu().numpy()

In [12]:
# Print each quantity next to its Python type to show the tensor -> array conversion
for row in (
    ("Raw logits: ", logits, type(logits)),
    ("Class probabilities: ", val, type(val)),
):
    print(*row)

Raw logits:  tensor([[-0.4908,  2.7907,  2.2662, -0.6466, -3.1275]]) <class 'torch.Tensor'>
Class probabilities:  [0.02257462 0.60087246 0.3556181  0.0193185  0.00161635] <class 'numpy.ndarray'>


#### Convert to class label

To train the transformer in PyTorch we zero-indexed the labels.
Now we increment the predicted most likely label by 1 to match with the original class label definition for SST-5.

In [13]:
# argmax yields the zero-based class index; shift by one to recover the
# original one-based class label.
zero_based_label = int(val.argmax())
pred = zero_based_label + 1
print("Class prediction for text example: ", pred)

Class prediction for text example:  2
