## Setup 

In [1]:
%load_ext autoreload
%autoreload 2
%load_ext line_profiler

In [2]:
## Imports and environment variables 
import os
os.environ["TOKENIZERS_PARALLELISM"]  = "true"  # set to false if not working

# Core imports 
import torch, numpy as np, pandas as pd, gc,sys, logging, warnings
from torch.utils.data import DataLoader, RandomSampler
from torch.distributions import Categorical
from datasets import load_dataset, load_metric, load_from_disk, DatasetDict
from transformers import (AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, 
                          AutoTokenizer, AdamW, SchedulerType, get_scheduler)
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import pytorch_cos_sim
from collections import defaultdict
from accelerate import Accelerator, notebook_launcher
from cachetools import cached, LRUCache
from types import MethodType
from timeit import default_timer as timer
import utils; from utils import *   # local script 
from tqdm.auto import tqdm
import itertools
import copy 
import wandb
from undecorated import undecorated


# Dev imports (not needed for final script)
import seaborn as sns
from IPython.display import Markdown
from pprint import pprint
from IPython.core.debugger import set_trace
from GPUtil import showUtilization
import torchsnooper

In [3]:
logging.basicConfig(format='%(message)s') 
logger = logging.getLogger("main_logger")
logger.setLevel(logging.INFO)

In [285]:
# options for the pp_model 
# 1. tuner007/pegasus_paraphrase
# 2. tdopierre/ProtAugment-ParaphraseGenerator
# 3. eugenesiow/bart-paraphrase

## PEGASUS model
pp_name = "tuner007/pegasus_paraphrase"
pp_tokenizer_pegasus = AutoTokenizer.from_pretrained(pp_name)
pp_model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(pp_name, local_files_only=True)
generate_with_grad = undecorated(pp_model_pegasus.generate)
pp_model_pegasus.generate_with_grad = MethodType(generate_with_grad, pp_model_pegasus)

## BART model
pp_name = "eugenesiow/bart-paraphrase"
pp_tokenizer_bart = AutoTokenizer.from_pretrained(pp_name)
pp_model_bart = AutoModelForSeq2SeqLM.from_pretrained(pp_name, local_files_only=True)
generate_with_grad = undecorated(pp_model_bart.generate)
pp_model_bart.generate_with_grad = MethodType(generate_with_grad, pp_model_bart)

## Select which one to use as default
pp_tokenizer = pp_tokenizer_pegasus
pp_model = pp_model_pegasus

## Functions 

In [292]:
def get_pp_logp(translated): 
    """log(p(pp|orig)) basically.
    works for greedy search, will need tweaking for other types probably"""
    seq_without_first_tkn = translated.sequences[:, 1:]
    attention_mask = pp_model._prepare_attention_mask_for_generation(
        seq_without_first_tkn, pp_tokenizer.pad_token_id, pp_tokenizer.eos_token_id
    )
    scores_log_softmax = torch.stack(translated.scores, 1).log_softmax(2)
    seq_token_log_probs = torch.gather(scores_log_softmax,2,seq_without_first_tkn[:,:,None]).squeeze(-1)
    del scores_log_softmax
    # account for nan values by setting them to 0 (maybe a bit of a hack)
    # will also handle inf and -inf values too by default
    seq_token_log_probs = torch.nan_to_num(seq_token_log_probs)
    # account for the padding tokens at the end 
    seq_token_log_probs = seq_token_log_probs * attention_mask
    seq_log_prob = seq_token_log_probs.sum(-1)
#     if np.any(np.isnan(seq_log_prob.detach().cpu()).tolist()): 
#         warnings.warn(f"Warning: NAN's detected in pp_logp calclulations.\n seq_token_log_probs: {seq_token_log_probs}")
    return seq_log_prob

def get_tokens_from_token_ids_batch(tokenizer, ids_batch):
    l = []
    for i in range(ids_batch.shape[0]): 
        l.append(tokenizer.convert_ids_to_tokens(ids_batch[i,:]))
    return l

def assert_start_and_end_tokens_are_correct(tokenizer, orig_token_ids, pp_token_ids):
    """Make sure input sequences (orig) and output sequences (pp) start and end with the 
    right special tokens (depends on tokenizer)"""
    def get_start_end_special_token_ids(tokenizer): 
        """The token id's that input/output sequences should start and end with"""
        d = {}
        if pp_tokenizer.name_or_path in ['eugenesiow/bart-paraphrase', 'tdopierre/ProtAugment-ParaphraseGenerator']: 
            d["input_start_id"] =  tokenizer.bos_token_id
            d["input_end_id"] =  [tokenizer.pad_token_id, tokenizer.eos_token_id]
            d["output_start_id"] =  tokenizer.eos_token_id 
            d["output_end_id"] =  [tokenizer.pad_token_id, tokenizer.eos_token_id]
        elif pp_tokenizer.name_or_path == "tuner007/pegasus_paraphrase":
            d["input_start_id"] =  None
            d["input_end_id"] =  [tokenizer.pad_token_id, tokenizer.eos_token_id] 
            d["output_start_id"] =  tokenizer.pad_token_id
            d["output_end_id"] =  [tokenizer.pad_token_id, tokenizer.eos_token_id]
        else: 
            raise Exception("unrecognised tokenizer")
        return d
    start_end_token_d = get_start_end_special_token_ids(pp_tokenizer)
    
    # Input
    if start_end_token_d['input_start_id'] is not None: 
        assert torch.all(orig_token_ids[:,0] == start_end_token_d['input_start_id'])
    # can probs rewrite this to make it nicer but it's fine for now
    assert torch.all(torch.logical_or(orig_token_ids[:,-1] == start_end_token_d['input_end_id'][0], 
                                      orig_token_ids[:,-1] == start_end_token_d['input_end_id'][1]))
    
    # Output
    assert torch.all(pp_token_ids[:,0] == start_end_token_d['output_start_id'])
    assert torch.all(torch.logical_or(pp_token_ids[:,-1] == start_end_token_d['output_end_id'][0], 
                                      pp_token_ids[:,-1] == start_end_token_d['output_end_id'][1]))

def check_scores_for_posinf_nan_and_unexpected_neginf(scores_stacked): 
    """Check we don't have any postive inf or nan, and that all negative inf values are expected"""
    idx_posinf = torch.nonzero(torch.isposinf(scores_stacked))
    idx_neginf = torch.nonzero(torch.isneginf(scores_stacked))
    idx_nan    = torch.nonzero(torch.isnan(scores_stacked))
    
    # Check we don't get any positive inf or nan
    assert len(idx_posinf) == 0
    assert len(idx_nan)    == 0 
    
    # We expect to see negative inf for the eos_token when we have not reached min_length. 
    # But we shouldn't expect it for any other tokens
    assert torch.all(idx_neginf[:,2] == pp_tokenizer.eos_token_id)
    # Rough check that all idx before min_length are -inf for all elements in batch
    # We do min_length - 1 because sequences are allowed to have length min_length so that idx 
    # shouldn't be set to -inf
    # Not a 100% test but very likely to identify
    assert idx_neginf.shape[0] == (pp_model_params["min_length"] -1) * batch_size  
    # Check that no elements after min_length are -inf
    assert torch.all(idx_neginf[:,1] < (pp_model_params["min_length"] -1 ))

def check_scores_log_softmax_sums_and_shape(scores_log_softmax):
    sums = scores_log_softmax.exp().sum(2)
    # check that the axes is right
    # we want to sum over token probabilities at each generation step, so we 
    # should end up with a shape [batch_size, generated_length]
    assert sums.shape[0] == batch_size  
    assert sums.shape[1] == generated_length - 1
    # check that they sum to 1 along the generated_length axis
    assert torch.allclose(sums, torch.ones(sums.size()), atol = 1e-4)

## Generating paraphrases

In [318]:
#### INPUT #####
orig_l = [
    "Hello my name is zfldlfoqd", 
    "The cat is brown and it looks cute!"
]
batch_size = len(orig_l)
orig_tokens = pp_tokenizer(orig_l, return_tensors='pt', padding=True, pad_to_multiple_of=4)
input_length = orig_tokens['input_ids'].size()[1]
orig_l_tokens_list = get_tokens_from_token_ids_batch(pp_tokenizer, orig_tokens['input_ids'])


##### PARAPHRASE #####
pp_model_params = {
    "num_beams": 1, 
    "num_return_sequences": 1, 
    "num_beam_groups": 1, 
    "diversity_penalty": 0.,   # must be a float
    "temperature": 1.5,
    "length_penalty" : 1,
    "min_length" : 5
}
pp_output = pp_model.generate_with_grad(**orig_tokens, **pp_model_params, do_sample=False, 
                                      return_dict_in_generate=True,
                                      output_scores=True,
                                    remove_invalid_values=False)
generated_length = pp_output.sequences.shape[1]
pp_l             = pp_tokenizer.batch_decode(pp_output.sequences, skip_special_tokens=True)
pp_l_with_tokens = pp_tokenizer.batch_decode(pp_output.sequences, skip_special_tokens=False)
pp_l_list_of_tokens = pp_tokenizer.convert_ids_to_tokens(pp_output.sequences[0,:])
pp_l_tokens_list = get_tokens_from_token_ids_batch(pp_tokenizer, pp_output.sequences)


assert_start_and_end_tokens_are_correct(pp_tokenizer, orig_token_ids=orig_tokens['input_ids'],
                                        pp_token_ids= pp_output.sequences)



###### SCORES ########
scores_stacked = torch.stack(pp_output.scores, 1)
# The second argument to stack (i.e. dim) determines which axis the tensors are stacked along. 
# It determines the axis that becomes generated_length - 1
# dim=0 gives shape [generated_length-1, batch_size, vocab_size]
# dim=1 gives shape [batch_size, generated_length-1, vocab_size]
# dim=2 gives shape [batch_size, vocab_size, generated_length-1]
# Our scores_stacked is stacked on dim 1 so it should be second 
assert scores_stacked.shape == torch.Size([batch_size, (generated_length - 1), pp_tokenizer.vocab_size])
check_scores_for_posinf_nan_and_unexpected_neginf(scores_stacked)


# These scores are logits 
# see some of the docs on this page https://huggingface.co/docs/transformers/v4.16.2/en/main_classes/output#transformers.modeling_outputs.Seq2SeqModelOutput
# so we got to take softmax over them 
# but if we take regular softmax then we run into numerical errors
# so instead we take log_softmax
scores_log_softmax = torch.log_softmax(scores_stacked, 2)
check_scores_log_softmax_sums_and_shape(scores_log_softmax)


# Entropy and similar stats 
ent = Categorical(logits = scores_stacked).entropy()
assert ent.shape == torch.Size([batch_size, generated_length - 1])
scores_softmax = scores_log_softmax.exp()
k=3
tkn_kmaxprob, tkn_kmaxidx = torch.topk(scores_softmax, k=k, dim=2)
# The third dimension indexes top1, top2, top3 etc 
assert tkn_kmaxprob[:,:,0].shape == torch.Size([batch_size, generated_length - 1])
# I'd naively expect True everywhere for tkn_kmaxidx[:,:,0] == pp_output.sequences[:, 1:] but it turns 
# out this is not the case because padding tokens seem to have prob 0 and eos tokens are outputted 
# instead by the token generation process and then later replaced by pad


In [320]:
get_tokens_from_token_ids_batch(tkn_kmaxidx[:,:,0])

TypeError: get_tokens_from_token_ids_batch() missing 1 required positional argument: 'ids_batch'

In [317]:
torch.topk(scores_softmax, k=k, dim=2).values.shape
# [batch_size, generated_length-1, k]

torch.Size([2, 11, 3])

In [311]:
tkn_kmaxprob[0]

tensor([[2.8896e-01, 2.6108e-01, 6.9979e-02],
        [9.3538e-01, 4.9477e-03, 1.6081e-03],
        [8.3225e-01, 1.7378e-02, 3.8617e-03],
        [3.8864e-01, 1.8792e-02, 8.6064e-03],
        [3.5333e-01, 3.8666e-02, 1.8665e-02],
        [6.3073e-01, 1.9265e-02, 1.1664e-02],
        [8.0162e-01, 3.7079e-02, 7.5843e-03],
        [8.2441e-01, 4.3555e-03, 2.6471e-03],
        [8.6826e-01, 3.0741e-02, 7.2997e-03],
        [4.7315e-01, 1.6260e-01, 1.4396e-01],
        [9.0402e-01, 3.6067e-04, 8.6157e-05]], grad_fn=<SelectBackward0>)

In [321]:
get_tokens_from_token_ids_batch(pp_tokenizer, tkn_kmaxidx[:,:,0])

[['▁My', '▁name', '▁is', '▁Z', 'FL', 'dl', 'fo', 'q', 'd', '.', '</s>'],
 ['▁The',
  '▁cat',
  '▁is',
  '▁brown',
  '.',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>',
  '</s>']]

In [322]:
get_tokens_from_token_ids_batch(pp_tokenizer, tkn_kmaxidx[:,:,1])

[['▁I', '▁Name', "'", '▁z', 'fl', 'DL', 'FO', '</s>', 'D', '</s>', '.'],
 ['▁A',
  '▁brown',
  '▁looks',
  '▁cute',
  '▁and',
  '.',
  '▁is',
  '▁looks',
  '▁is',
  '.',
  '.']]

In [323]:
get_tokens_from_token_ids_batch(pp_tokenizer, tkn_kmaxidx[:,:,2])

[['▁Hello', 'name', '▁was', 'Z', 'liff', 'D', '▁FO', '.', '.', '▁and', ','],
 ['▁There',
  '▁animal',
  '▁has',
  '▁adorable',
  '</s>',
  '▁',
  '.',
  '▁is',
  '▁looks',
  '▁the',
  '▁is']]

In [272]:
tkn_maxprob

tensor([[0.3364, 0.9830, 0.9765, 0.6290, 0.9962, 0.9999, 0.9977, 0.9974, 0.9993,
         0.9800, 0.5818, 0.1753],
        [0.9161, 0.9887, 0.9813, 0.9776, 0.9055, 0.7203, 0.9604, 0.8809, 0.9890,
         0.9279, 0.6757, 0.6300]], grad_fn=<MaxBackward0>)

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True],
        [ True,  True,  True,  True,  True,  True, False, False, False, False,
         False]])

tensor([[ 2387,   766,    16,   992,   506,  4779, 13491,   139,  1343,   417,
             4,     2],
        [  133,  4758,    16,  6219,     8,  1326, 11962,   328,     2,     1,
             1,     1]])

In [276]:
i = 0 
tmp = scores_log_softmax[i]
seq_without_first_tkn = pp_output.sequences[i, 1:]
logger.info("Now calculating sequence probabilities")
seq_token_probs = torch.gather(tmp,2,seq_without_first_tkn[:,:,None]).squeeze(-1)
seq_prob = seq_token_probs.prod(-1).item()
logger.info(f"Sequence probability: {seq_prob}")
# Get the 2nd and 3rd most likely tokens at each st
topk_ids = torch.topk(tmp,3,dim=2).indices[:,:,1:]
topk_tokens_probs = torch.gather(tmp,2,topk_ids).squeeze(-1)
toks2 = pp_tokenizer.convert_ids_to_tokens(topk_ids[:,:,0].squeeze())
toks3 = pp_tokenizer.convert_ids_to_tokens(topk_ids[:,:,1].squeeze())
tok_probs2 = topk_tokens_probs[:,:,0].squeeze()
tok_probs3 = topk_tokens_probs[:,:,1].squeeze()

logger.info(f"Probabilities of getting the top 3 tokens at each step:")
tokens = pp_tokenizer.convert_ids_to_tokens(seq_without_first_tkn.squeeze())
for (p, t, p2,t2,p3,t3)  in zip(seq_token_probs.squeeze(), tokens, tok_probs2, toks2, tok_probs3, toks3): 
    logger.info(f"{t}: {round(p.item(),3)}  {t2}: {round(p2.item(),3)}  {t3}: {round(p3.item(),3)}") 

Now calculating sequence probabilities


IndexError: too many indices for tensor of dimension 1

In [277]:
tmp

tensor([[-14.0859, -17.9504,     -inf,  ..., -18.2348, -18.0437, -16.5071],
        [-30.0271, -21.9822,     -inf,  ..., -21.9696, -21.9111, -20.4842],
        [-24.9215, -22.6132,     -inf,  ..., -22.1891, -21.9841, -24.1049],
        ...,
        [-19.5661, -22.5410,  -9.6751,  ..., -21.6367, -21.7456, -22.1351],
        [-18.2801, -17.6360,  -2.3842,  ..., -17.3905, -17.2129, -17.8064],
        [-17.3701, -17.1515,  -1.7413,  ..., -17.5883, -17.5943, -16.4318]],
       grad_fn=<SelectBackward0>)

In [271]:
logger.info("\n######################################################################\n")
logger.info(f"Tokenizer has these special tokens:{pp_tokenizer.all_special_tokens}")
logger.info(f"The bos token is {pp_tokenizer.bos_token} and has id {pp_tokenizer.bos_token_id}")
logger.info(f"The eos token is {pp_tokenizer.eos_token} and has id {pp_tokenizer.eos_token_id}")
logger.info(f"The pad token is {pp_tokenizer.pad_token} and has id {pp_tokenizer.pad_token_id}")
logger.info(f"The unk token is {pp_tokenizer.unk_token} and has id {pp_tokenizer.unk_token_id}")
logger.info("\n######################################################################\n")
logger.info(f"Original text: {orig_l}")
logger.info(f"This is tokenised to get a dict with keys input_ids and attention_mask ")
logger.info(f"Tokens:")
logger.info(f"The input_ids look like this: {orig_tokens['input_ids']}")
logger.info(f"The tokens are: {orig_l_tokens_list}")
logger.info(f"This has shape {orig_tokens['input_ids'].shape} or [batch_size, input_length], which also\
 might be padded to hit a padding multiple (so input_length is not just the longest example length in the batch).")
logger.info(f"The attention_mask looks like this: {orig_tokens['attention_mask']}")
logger.info(f"This has shape {orig_tokens['attention_mask'].shape} or [batch_size, input_length]")
logger.info("\n######################################################################\n")
logger.info(f"Paraphrases: {pp_l}")
logger.info(f"Paraphrases with special tokens: {pp_l_with_tokens}")
logger.info(f"List of pp tokens:{pp_l_tokens_list}")
logger.info(f"Paraphrase token sequences: {pp_output.sequences}")
logger.info(f"Shape of pp token sequences:{pp_output.sequences.shape} or [batch_size, generated_length]")
logger.info("Each pp should start with the bos_token and end with either eos token or pad token.")
logger.info("Writing assert to confirm this.")

logger.info("\n######################################################################\n")
logger.info(f"Scores is a tuple of length {len(pp_output.scores)} which is one less than the generated_length, or \
the number of tokens in the pp token sequences (this has shape {pp_output.sequences.shape}")
logger.info(f"Each score is a tensor of shape {pp_output.scores[0].shape} or [batch_size, vocab_size]")
#logger.info(f"Full shape:{[o.shape for o in pp_output.scores]}")
logger.info(f"We stack them to get a tensor of shape {scores_stacked.shape} or [batch_size, generated_length, vocab_size]")
logger.info(f"Scores are really logits so we have to take softmax to get probabilities.")


# seq_without_first_tkn = translated.sequences[:, 1:]
# logger.info("Now calculating sequence probabilities")
# seq_token_probs = torch.gather(scores_softmax,2,seq_without_first_tkn[:,:,None]).squeeze(-1)
# seq_prob = seq_token_probs.prod(-1).item()
# logger.info(f"Sequence probability: {seq_prob}")

# # Get the 2nd and 3rd most likely tokens at each st
# topk_ids = torch.topk(scores_softmax,3,dim=2).indices[:,:,1:]
# topk_tokens_probs = torch.gather(scores_softmax,2,topk_ids).squeeze(-1)
# toks2 = pp_tokenizer.convert_ids_to_tokens(topk_ids[:,:,0].squeeze())
# toks3 = pp_tokenizer.convert_ids_to_tokens(topk_ids[:,:,1].squeeze())
# tok_probs2 = topk_tokens_probs[:,:,0].squeeze()
# tok_probs3 = topk_tokens_probs[:,:,1].squeeze()

# logger.info(f"Probabilities of getting the top 3 tokens at each step:")
# tokens = pp_tokenizer.convert_ids_to_tokens(seq_without_first_tkn.squeeze())
# for (p, t, p2,t2,p3,t3)  in zip(seq_token_probs.squeeze(), tokens, tok_probs2, toks2, tok_probs3, toks3): 
#     logger.info(f"{t}: {round(p.item(),3)}  {t2}: {round(p2.item(),3)}  {t3}: {round(p3.item(),3)}") 



######################################################################

Tokenizer has these special tokens:['<s>', '</s>', '<unk>', '<pad>', '<mask>']
The bos token is <s> and has id 0
The eos token is </s> and has id 2
The pad token is <pad> and has id 1
The unk token is <unk> and has id 3

######################################################################

Original text: ['Hello my name is zfldlfoqd', 'The cat is brown and it looks cute!']
This is tokenised to get a dict with keys input_ids and attention_mask 
Tokens:
The input_ids look like this: tensor([[    0, 31414,   127,   766,    16,   992,   506,  4779, 13491,   139,
          1343,   417,     2,     1,     1,     1],
        [    0,   133,  4758,    16,  6219,     8,    24,  1326, 11962,   328,
             2,     1,     1,     1,     1,     1]])
The tokens are: [['<s>', 'Hello', 'Ġmy', 'Ġname', 'Ġis', 'Ġz', 'f', 'ld', 'lf', 'o', 'q', 'd', '</s>', '<pad>', '<pad>', '<pad>'], ['<s>', 'The', 'Ġcat', 'Ġis', 'Ġbrown', 'Ġand

In [127]:

def print_info_on_generated_text():
    """
        Prints a bunch of statistics around the generated text. Useful for debugging purposes.
        So far only works for greedy search.
    """

    tgt_text = pp_tokenizer.batch_decode(translated.sequences, skip_special_tokens=True)
    tgt_text_with_tokens = pp_tokenizer.batch_decode(translated.sequences, skip_special_tokens=False)
    logger.info(f"Generated text: {tgt_text}")
    logger.info(f"Generated text with special tokens: {tgt_text_with_tokens}")
    logger.info(f"Shape of translated.sequences:{translated.sequences.shape}")
    logger.info(f"translated.sequences:{translated.sequences}")
    logger.info(f"Scores is a tuple of length {len(translated.scores)} \
    and each score is a tensor of shape {translated.scores[0].shape}")
    scores_stacked = torch.stack(translated.scores, 1)
    logger.info(f"Stacking the scores into a tensor of shape {scores_stacked.shape}")
    scores_softmax = torch.softmax(scores_stacked, 2)
    logger.info(f"Now taking softmax. This shouldn't change the shape, but just to check,\
    its shape is {scores_softmax.shape}")
    probsums = scores_softmax.sum(axis=2)
    logger.info(f"These are probabilities now and so they should all sum to 1 (or close to it) in the axis \
    corresponding to each time step. We can check the sums here: {probsums}, but it's a long tensor \
    of shape {probsums.shape} and hard to see, so summing over all these values and removing 1 \
    from each gives {torch.sum(probsums - 1)} \
    which should be close to 0.")
    seq_without_first_tkn = translated.sequences[:, 1:]
    logger.info("Now calculating sequence probabilities")
    seq_token_probs = torch.gather(scores_softmax,2,seq_without_first_tkn[:,:,None]).squeeze(-1)
    seq_prob = seq_token_probs.prod(-1).item()
    logger.info(f"Sequence probability: {seq_prob}")

    # Get the 2nd and 3rd most likely tokens at each st
    topk_ids = torch.topk(scores_softmax,3,dim=2).indices[:,:,1:]
    topk_tokens_probs = torch.gather(scores_softmax,2,topk_ids).squeeze(-1)
    toks2 = pp_tokenizer.convert_ids_to_tokens(topk_ids[:,:,0].squeeze())
    toks3 = pp_tokenizer.convert_ids_to_tokens(topk_ids[:,:,1].squeeze())
    tok_probs2 = topk_tokens_probs[:,:,0].squeeze()
    tok_probs3 = topk_tokens_probs[:,:,1].squeeze()

    logger.info(f"Probabilities of getting the top 3 tokens at each step:")
    tokens = pp_tokenizer.convert_ids_to_tokens(seq_without_first_tkn.squeeze())
    for (p, t, p2,t2,p3,t3)  in zip(seq_token_probs.squeeze(), tokens, tok_probs2, toks2, tok_probs3, toks3): 
        logger.info(f"{t}: {round(p.item(),3)}  {t2}: {round(p2.item(),3)}  {t3}: {round(p3.item(),3)}") 

In [106]:
scores_stacked

tensor([[[  2.1281,  -1.7363,     -inf,  ...,  -2.0207,  -1.8297,  -0.2931],
         [-12.2530,  -4.2081,     -inf,  ...,  -4.1955,  -4.1370,  -2.7101],
         [ -7.8899,  -5.5816,     -inf,  ...,  -5.1575,  -4.9525,  -7.0733],
         ...,
         [ -0.6762,  -3.6511,   9.2148,  ...,  -2.7468,  -2.8557,  -3.2452],
         [ -5.2623,  -4.6181,  10.6337,  ...,  -4.3726,  -4.1950,  -4.7886],
         [ -4.3985,  -4.1799,  11.2302,  ...,  -4.6168,  -4.6227,  -3.4602]],

        [[ -3.3283,  -1.4830,     -inf,  ...,  -1.2559,  -1.7851,  -0.3878],
         [ -5.4642,  -3.7450,     -inf,  ...,  -3.5129,  -4.9730,  -2.3698],
         [ -1.4317,  -4.9811,     -inf,  ...,  -4.1592,  -5.8414,  -4.4424],
         ...,
         [ -0.0560,  -2.8219,  15.0593,  ...,  -3.0489,  -4.4363,  -2.9208],
         [  1.1428,  -3.6452,  12.2761,  ...,  -3.3208,  -3.9210,  -2.6603],
         [ -0.7912,  -3.7815,  11.9712,  ...,  -3.3675,  -3.8684,  -3.5179]]],
       grad_fn=<StackBackward0>)

## Tokenizer differences 

### Types 

Both the "eugenesiow/bart-paraphrase" model and the "tdopierre/ProtAugment-ParaphraseGenerator" are BART tokenizers and have type BartTokenizerFast. The implementation is identical to RobertaTokenizerFast according to the docs, which in turn was derived from GPT-2. They use byte-level Byte Pair Encoding.  

The "tuner007/pegasus_paraphrase" model is a Pegasus tokenizer has type PegasusTokenizerFast. This uses Unigram. 

### Tokenization differences

#### Spaces 

The BART tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not:

In [227]:
tokens = pp_tokenizer_bart(['hello there',' hello there'], return_tensors='pt')
get_tokens_from_token_ids_batch(pp_tokenizer_bart, tokens['input_ids'])

[['<s>', 'hello', 'Ġthere', '</s>'], ['<s>', 'Ġhello', 'Ġthere', '</s>']]

The Pegasus tokenizer doesn't do this

In [228]:
tokens = pp_tokenizer_pegasus(['hello there',' hello there'], return_tensors='pt')
get_tokens_from_token_ids_batch(pp_tokenizer_pegasus, tokens['input_ids'])

[['▁hello', '▁there', '</s>'], ['▁hello', '▁there', '</s>']]

### Representing tokens 

The tokenizers represent tokens differently.  
The BART models use Ġ to indicate start of a word for a token. Its generated tokens look like `['<s>', 'Hello', 'Ġmy', 'Ġname', 'Ġis', 'Ġz', 'f', 'ld', 'lf', 'o', 'q', 'd', '</s>', '<pad>', '<pad>', '<pad>']`  
The Pegasus model uses \_ to indicate start of a word for a token. Its generated tokens look like `['▁Hello', '▁my', '▁name', '▁is', '▁z', 'fl', 'dl', 'fo', 'q', 'd', '</s>', '<pad>']`

### Special tokens

#### BART

In [195]:
logger.info(f"Tokenizer has these special tokens:{pp_tokenizer_bart.all_special_tokens}")
logger.info(f"The bos token is {pp_tokenizer_bart.bos_token} and has id {pp_tokenizer_bart.bos_token_id}")
logger.info(f"The eos token is {pp_tokenizer_bart.eos_token} and has id {pp_tokenizer_bart.eos_token_id}")
logger.info(f"The pad token is {pp_tokenizer_bart.pad_token} and has id {pp_tokenizer_bart.pad_token_id}")
logger.info(f"The unk token is {pp_tokenizer_bart.unk_token} and has id {pp_tokenizer_bart.unk_token_id}")

Tokenizer has these special tokens:['<s>', '</s>', '<unk>', '<pad>', '<mask>']
The bos token is <s> and has id 0
The eos token is </s> and has id 2
The pad token is <pad> and has id 1
The unk token is <unk> and has id 3


#### PEGASUS

In [196]:
logger.info(f"Tokenizer has these special tokens:{pp_tokenizer_pegasus.all_special_tokens}")
logger.info(f"The bos token is {pp_tokenizer_pegasus.bos_token} and has id {pp_tokenizer_pegasus.bos_token_id}")
logger.info(f"The eos token is {pp_tokenizer_pegasus.eos_token} and has id {pp_tokenizer_pegasus.eos_token_id}")
logger.info(f"The pad token is {pp_tokenizer_pegasus.pad_token} and has id {pp_tokenizer_pegasus.pad_token_id}")
logger.info(f"The unk token is {pp_tokenizer_pegasus.unk_token} and has id {pp_tokenizer_pegasus.unk_token_id}")

Tokenizer has these special tokens:['</s>', '<unk>', '<pad>', '<mask_2>', '<mask_1>', '<unk_2>', '<unk_3>', '<unk_4>', '<unk_5>', '<unk_6>', '<unk_7>', '<unk_8>', '<unk_9>', '<unk_10>', '<unk_11>', '<unk_12>', '<unk_13>', '<unk_14>', '<unk_15>', '<unk_16>', '<unk_17>', '<unk_18>', '<unk_19>', '<unk_20>', '<unk_21>', '<unk_22>', '<unk_23>', '<unk_24>', '<unk_25>', '<unk_26>', '<unk_27>', '<unk_28>', '<unk_29>', '<unk_30>', '<unk_31>', '<unk_32>', '<unk_33>', '<unk_34>', '<unk_35>', '<unk_36>', '<unk_37>', '<unk_38>', '<unk_39>', '<unk_40>', '<unk_41>', '<unk_42>', '<unk_43>', '<unk_44>', '<unk_45>', '<unk_46>', '<unk_47>', '<unk_48>', '<unk_49>', '<unk_50>', '<unk_51>', '<unk_52>', '<unk_53>', '<unk_54>', '<unk_55>', '<unk_56>', '<unk_57>', '<unk_58>', '<unk_59>', '<unk_60>', '<unk_61>', '<unk_62>', '<unk_63>', '<unk_64>', '<unk_65>', '<unk_66>', '<unk_67>', '<unk_68>', '<unk_69>', '<unk_70>', '<unk_71>', '<unk_72>', '<unk_73>', '<unk_74>', '<unk_75>', '<unk_76>', '<unk_77>', '<unk_78>'

### Special token usage with input and output sequences 

#### BART 

They use this format 
```
single sequence: <s> X </s>
pair of sequences: <s> A </s></s> B </s>
```

#### PEGASUS

Format: 
```
- single sequence: ``X </s>``
- pair of sequences: ``A B </s>`` (not intended use)
```

BOS token is never used 

#### Differences 

 Tokenizers also use different tokens when representing input sequences and generating output sentences. Here is a quick summary:

In [190]:
tokens_d = {
    "bart": {
        "special_tokens": pp_tokenizer_bart.all_special_tokens, 
        "input_start": pp_tokenizer_bart.bos_token,
        "input_end": [pp_tokenizer_bart.pad_token, pp_tokenizer_bart.eos_token], 
        "output_start": pp_tokenizer_bart.eos_token, 
        "output_end": [pp_tokenizer_bart.pad_token, pp_tokenizer_bart.eos_token]
    }, 
    "pegasus": {
        "special_tokens": pp_tokenizer_pegasus.all_special_tokens,
        "input_start": None,
        "input_end": [pp_tokenizer_pegasus.pad_token, pp_tokenizer_pegasus.eos_token], 
        "output_start": pp_tokenizer_pegasus.pad_token,  
        "output_end": [pp_tokenizer_pegasus.pad_token, pp_tokenizer_pegasus.eos_token], 
    }
}
tokens_d

{'bart': {'special_tokens': ['<s>', '</s>', '<unk>', '<pad>', '<mask>'],
  'input_start': '<s>',
  'input_end': ['<pad>', '</s>'],
  'output_start': '</s>',
  'output_end': ['<pad>', '</s>']},
 'pegasus': {'special_tokens': ['</s>',
   '<unk>',
   '<pad>',
   '<mask_2>',
   '<mask_1>',
   '<unk_2>',
   '<unk_3>',
   '<unk_4>',
   '<unk_5>',
   '<unk_6>',
   '<unk_7>',
   '<unk_8>',
   '<unk_9>',
   '<unk_10>',
   '<unk_11>',
   '<unk_12>',
   '<unk_13>',
   '<unk_14>',
   '<unk_15>',
   '<unk_16>',
   '<unk_17>',
   '<unk_18>',
   '<unk_19>',
   '<unk_20>',
   '<unk_21>',
   '<unk_22>',
   '<unk_23>',
   '<unk_24>',
   '<unk_25>',
   '<unk_26>',
   '<unk_27>',
   '<unk_28>',
   '<unk_29>',
   '<unk_30>',
   '<unk_31>',
   '<unk_32>',
   '<unk_33>',
   '<unk_34>',
   '<unk_35>',
   '<unk_36>',
   '<unk_37>',
   '<unk_38>',
   '<unk_39>',
   '<unk_40>',
   '<unk_41>',
   '<unk_42>',
   '<unk_43>',
   '<unk_44>',
   '<unk_45>',
   '<unk_46>',
   '<unk_47>',
   '<unk_48>',
   '<unk_49>',
 

### Token indexing

In [198]:
def print_tokens_from_ids(tokenizer, start_id=100, end_id=200):
    ids = list(range(start_id,end_id))
    print(*list(zip(ids, tokenizer.convert_ids_to_tokens(ids))))

#### BART 

Having a look at generated tokens makes me suspect that they are indexed in whatever order they are encountered in the source text they are trained on. It seems like a rough frequency of english tokens but there are also tokens that are definitely out of order. 

The first few are reserved for special tokens, and the other low numbers (e.g. up to 100) are pretty common suffixes and words

In [199]:
print_tokens_from_ids(pp_tokenizer_bart, 0,50)

(0, '<s>') (1, '<pad>') (2, '</s>') (3, '<unk>') (4, '.') (5, 'Ġthe') (6, ',') (7, 'Ġto') (8, 'Ġand') (9, 'Ġof') (10, 'Ġa') (11, 'Ġin') (12, '-') (13, 'Ġfor') (14, 'Ġthat') (15, 'Ġon') (16, 'Ġis') (17, 'âĢ') (18, "'s") (19, 'Ġwith') (20, 'ĠThe') (21, 'Ġwas') (22, 'Ġ"') (23, 'Ġat') (24, 'Ġit') (25, 'Ġas') (26, 'Ġsaid') (27, 'Ļ') (28, 'Ġbe') (29, 's') (30, 'Ġby') (31, 'Ġfrom') (32, 'Ġare') (33, 'Ġhave') (34, 'Ġhas') (35, ':') (36, 'Ġ(') (37, 'Ġhe') (38, 'ĠI') (39, 'Ġhis') (40, 'Ġwill') (41, 'Ġan') (42, 'Ġthis') (43, ')') (44, 'ĠâĢ') (45, 'Ġnot') (46, 'Ŀ') (47, 'Ġyou') (48, 'ľ') (49, 'Ġtheir')


Looking at 100 to 200 you can see some words (e.g. Trump at 140, or 2017 at 193) that aren't common enough to be that high. This makes me suspect that words are in encounter order in the text. 

In [89]:
print_tokens_from_ids(pp_tokenizer_bart, 100,200)

(100, 'I') (101, 'Ġlike') (102, 'a') (103, 'Ġsome') (104, 'S') (105, 'Ã«') (106, 'Ġthem') (107, 'Ġyears') (108, "'") (109, 'Ġdo') (110, 'Ġyour') (111, 'Ġ-') (112, 'Ġ1') (113, '"') (114, 'Ġif') (115, 'Ġcould') (116, '?') (117, 'Ġno') (118, 'i') (119, 'm') (120, 'Ġget') (121, 'ĠU') (122, 'Ġnow') (123, 'Ġhim') (124, 'Ġback') (125, 'ĠBut') (126, 'ĠâĢĵ') (127, 'Ġmy') (128, "Ġ'") (129, 'Ġonly') (130, 'Ġthree') (131, ';') (132, 'Ġ2') (133, 'The') (134, '1') (135, 'Ġpercent') (136, 'Ġagainst') (137, 'Ġbefore') (138, 'Ġcompany') (139, 'o') (140, 'ĠTrump') (141, 'Ġhow') (142, 'Ġbecause') (143, 'Ġany') (144, 'Ġmost') (145, 'Ġbeing') (146, 'Ġmake') (147, 'Ġwhere') (148, 'Ġduring') (149, 'Ġthrough') (150, 'Ġwhile') (151, '000') (152, 'ĠThis') (153, 'Ġmillion') (154, 'ing') (155, 'Ġ3') (156, 'Ġmade') (157, 'Ġwell') (158, 'Ġ10') (159, 'Ġdown') (160, 'Ġoff') (161, 'Ġsays') (162, 'Ġme') (163, 'ĠB') (164, 'Ġgoing') (165, 'Ġteam') (166, 'ĠWe') (167, 'Ġthose') (168, 'Ġgovernment') (169, 'Ġway') (170, 'We'

Tokens towards the end are gibberish or mispellings encountered in the input. The fifth last token is something labelled <|endoftext|> and I don't know what that is. Then there is a bunch of tokens like "madeupword0001". The last token is the mask token and then token indicies after that return None. 

In [95]:
print_tokens_from_ids(pp_tokenizer_bart, pp_tokenizer_bart.vocab_size-20, pp_tokenizer_bart.vocab_size+10)

(50245, 'ĠSetTextColor') (50246, 'Ġfixme') (50247, 'ĠãĤµãĥ¼ãĥĨãĤ£') (50248, 'ĠãĤµãĥ¼ãĥĨãĤ£ãĥ¯ãĥ³') (50249, 'ĠÂłĠÂłĠÂłĠÂłĠÂłĠÂłĠÂłĠÂł') (50250, 'ĠAdinida') (50251, 'ItemTracker') (50252, 'ĠDevOnline') (50253, 'ĠÂłÂł') (50254, '<?') (50255, '*=-') (50256, 'ÃĽÃĽ') (50257, 'ĠEntityItem') (50258, 'EngineDebug') (50259, 'ĠstrutConnector') (50260, '<|endoftext|>') (50261, 'madeupword0000') (50262, 'madeupword0001') (50263, 'madeupword0002') (50264, '<mask>') (50265, None) (50266, None) (50267, None) (50268, None) (50269, None) (50270, None) (50271, None) (50272, None) (50273, None) (50274, None)


#### PEGASUS

Special tokens make up the first hundred or so. After that there's a token \<n> that seems like some new line thing. 

In [203]:
print_tokens_from_ids(pp_tokenizer_pegasus, 0,120)

(0, '<pad>') (1, '</s>') (2, '<mask_1>') (3, '<mask_2>') (4, '<unk_2>') (5, '<unk_3>') (6, '<unk_4>') (7, '<unk_5>') (8, '<unk_6>') (9, '<unk_7>') (10, '<unk_8>') (11, '<unk_9>') (12, '<unk_10>') (13, '<unk_11>') (14, '<unk_12>') (15, '<unk_13>') (16, '<unk_14>') (17, '<unk_15>') (18, '<unk_16>') (19, '<unk_17>') (20, '<unk_18>') (21, '<unk_19>') (22, '<unk_20>') (23, '<unk_21>') (24, '<unk_22>') (25, '<unk_23>') (26, '<unk_24>') (27, '<unk_25>') (28, '<unk_26>') (29, '<unk_27>') (30, '<unk_28>') (31, '<unk_29>') (32, '<unk_30>') (33, '<unk_31>') (34, '<unk_32>') (35, '<unk_33>') (36, '<unk_34>') (37, '<unk_35>') (38, '<unk_36>') (39, '<unk_37>') (40, '<unk_38>') (41, '<unk_39>') (42, '<unk_40>') (43, '<unk_41>') (44, '<unk_42>') (45, '<unk_43>') (46, '<unk_44>') (47, '<unk_45>') (48, '<unk_46>') (49, '<unk_47>') (50, '<unk_48>') (51, '<unk_49>') (52, '<unk_50>') (53, '<unk_51>') (54, '<unk_52>') (55, '<unk_53>') (56, '<unk_54>') (57, '<unk_55>') (58, '<unk_56>') (59, '<unk_57>') (60, 

Unlike the BART models I can believe that these tokens are in order of frequency. I can't see anything that is obviously out of place. 

In [205]:
print_tokens_from_ids(pp_tokenizer_pegasus, 120,250)

(400, '▁easy') (401, '...') (402, '▁‘') (403, '▁show') (404, '▁children') (405, '▁project') (406, '▁care') (407, '▁market') (408, '▁money') (409, '▁Our') (410, '▁book') (411, '▁change') (412, '▁So') (413, '▁To') (414, '▁put') (415, 'y') (416, '▁say') (417, 'You') (418, '▁room') (419, '▁got') (420, 'er') (421, '▁create') (422, '▁course') (423, '▁large') (424, '▁together') (425, '▁food') (426, '▁health') (427, '▁community') (428, '▁open') (429, '▁away') (430, '▁until') (431, '▁program') (432, '▁often') (433, '▁possible') (434, '▁When') (435, '▁again') (436, '▁All') (437, '▁case') (438, '▁page') (439, '▁car') (440, '▁real') (441, '▁With') (442, '▁name') (443, '▁call') (444, '▁include') (445, 'ly') (446, '▁per') (447, '▁why') (448, '▁product') (449, '▁state') (450, '▁post') (451, '▁based') (452, '▁She') (453, '▁second') (454, 'n') (455, '▁event') (456, '▁group') (457, 'i') (458, '▁having') (459, '▁old') (460, '▁become') (461, '▁big') (462, '▁play') (463, '▁What') (464, '▁against') (465, '▁

There's nothing special at the end, just looks like isolated tokens and None values after the tokens finish. It's also worth noting that the Pegasus model has ~96100 tokens which is way more than the ~50270 of the BART models (almost double). 

In [207]:
print_tokens_from_ids(pp_tokenizer_pegasus, pp_tokenizer_pegasus.vocab_size-20, pp_tokenizer_pegasus.vocab_size+10)

(96083, '0:09') (96084, '▁Pietra') (96085, 'webhost') (96086, '8:48') (96087, '▁psychoanalytic') (96088, '1335') (96089, '▁Happies') (96090, '▁Tamale') (96091, '▁Seidel') (96092, '▁Muppet') (96093, '▁Quota') (96094, '▁polyphenol') (96095, 'utyrate') (96096, 'saari') (96097, '▁WASTE') (96098, '▁$6,500') (96099, '.06%') (96100, 'constitutional') (96101, '▁$6.4') (96102, 'ospermum') (96103, None) (96104, None) (96105, None) (96106, None) (96107, None) (96108, None) (96109, None) (96110, None) (96111, None) (96112, None)


## Questions 

### When does the model generate padding tokens?

For both models padding tokens are generated after the EOS token. Additionally for Pegasus generated text starts with the padding token. 

### Why do generated paraphrases start with the EOS token? 

This is only the case for BART models. For pegasus models they use a padding token to start generated paraphrases. 

I don't know what the BOS token isn't used for these things. Pegasus has an open issue [here](https://github.com/huggingface/transformers/issues/12474). 

Whatever the reason you should just do the default because that is what the preprocessing does and you will get the best results that way. 

### Does p(PAD) =1 after an eos token?

For both BART and Pegasus models it appears that probability of outputting a pad token is actually zero at all timesteps. Instead the model outputs the eos token over and over, and there must be some post-processing that takes place that replaces eos token with padding token. 
For Pegasus it appears it is the same behaviour. 
Example code: 

In [298]:
print(round_t(scores_softmax[:,:,pp_tokenizer.eos_token_id]))
print(round_t(scores_softmax[:,:,pp_tokenizer.pad_token_id]))

[[0.   0.   0.   0.   0.   0.   0.   0.   0.   0.16 0.9 ]
 [0.   0.   0.   0.   0.13 0.9  0.81 0.51 0.74 0.71 0.78]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


What is interesting is that there is probability assigned to other tokens other than eos and pad after a eos token is outputted. Again there must be some kind of postprocessing that takes care of this situation because I haven't really seen it in the wild. 

Some models (e.g. GPT2) don't even have a PAD token. Instead they use the eos token on repeat. See this [issue](https://github.com/huggingface/transformers/issues/8452#issuecomment-739008168). What is confusing is seeing this behaviour with models that have a padding token. 

### Do rows always sum to 1 when looking at token generation scores? 


Yes, they should. I put in an assert to check this. 
If you have a nan or an inf then they won't sum to 1. To confirm this

In [61]:
print(torch.isnan(torch.sum(torch.tensor([1,2,3, torch.nan]))))
print(torch.isinf(torch.sum(torch.tensor([1,2,3, torch.inf]))))

tensor(True)
tensor(True)


### Does first row sum to 0? (the one corresponding to the startoff token)


So there is no token scores that correspond to the first token (usually a bos or pad token). The scores are a tuple of length (`generated_length - 1`). So there shouldn't be a "zero" row really. 
I remember seeing something like this at some point so I'll keep an eye out for it. 

### How are logits containing nan or inf transformed with softmax and log_softmax?

We can explore this through some code examples.

#### Vanilla case  
First we look at the case without any nan or inf. 

In [144]:
logits = torch.tensor([1.4, -1, 3, 2])
print(logits)
print(torch.softmax(logits,0))
print(torch.log_softmax(logits,0))

tensor([ 1.4000, -1.0000,  3.0000,  2.0000])
tensor([0.1271, 0.0115, 0.6297, 0.2316])
tensor([-2.0625, -4.4625, -0.4625, -1.4625])


The softmax values are interpreted as probabilities, and the log softmax is just the log of the probabilities, done for numerical stability. We can just take exponents to return to probabilities if needed. 

In [146]:
print(torch.log_softmax(logits,0).exp())

tensor([0.1271, 0.0115, 0.6297, 0.2316])


#### Positive inf

Now let's see what happens if we introduce a positive inf. 

In [148]:
logits = torch.tensor([1.4, -1, 3, 2, torch.inf])
print(logits)
print(torch.softmax(logits,0))
print(torch.log_softmax(logits,0))

tensor([ 1.4000, -1.0000,  3.0000,  2.0000,     inf])
tensor([nan, nan, nan, nan, nan])
tensor([nan, nan, nan, nan, nan])


We get nan values in the softmax and log_softmax.  So if you see nans in the softmax, remember that an inf in the scores is one reason why it may happen.  

This is interesting because if we just assume inf is a large positive number, we'd expect a softmax with basically a 1 and all zeros, and a log softmax of a 0 and a lot of negatives. We can try it here: 

In [150]:
logits = torch.tensor([1.4, -1, 3, 2, 10000000000])
print(logits)
print(torch.softmax(logits,0))
print(torch.log_softmax(logits,0))

tensor([ 1.4000e+00, -1.0000e+00,  3.0000e+00,  2.0000e+00,  1.0000e+10])
tensor([0., 0., 0., 0., 1.])
tensor([-1.0000e+10, -1.0000e+10, -1.0000e+10, -1.0000e+10,  0.0000e+00])


Basically what we get. So this indicates that if we get a positive inf we might be able to mitigate this problem by clipping it to some kind of maximum value. 

#### Negative inf 

In [151]:
logits = torch.tensor([1.4, -1, 3, 2, -torch.inf])
print(logits)
print(torch.softmax(logits,0))
print(torch.log_softmax(logits,0))

tensor([ 1.4000, -1.0000,  3.0000,  2.0000,    -inf])
tensor([0.1271, 0.0115, 0.6297, 0.2316, 0.0000])
tensor([-2.0625, -4.4625, -0.4625, -1.4625,    -inf])


Negative inf behaves a bit differently. The softmax is unaffected and basically just assigns a prob of 0 to the corresponding entry. The log softmax carries the `-inf` through. 

Again clipping the -inf to a large negative value can mitigate this problem somewhat: 

In [153]:
logits = torch.tensor([1.4, -1, 3, 2, -10000000])
print(logits)
print(torch.softmax(logits,0))
print(torch.log_softmax(logits,0))

tensor([ 1.4000e+00, -1.0000e+00,  3.0000e+00,  2.0000e+00, -1.0000e+07])
tensor([0.1271, 0.0115, 0.6297, 0.2316, 0.0000])
tensor([-2.0625e+00, -4.4625e+00, -4.6253e-01, -1.4625e+00, -1.0000e+07])


#### nan values 

In [154]:
logits = torch.tensor([1.4, -1, 3, 2, torch.nan])
print(logits)
print(torch.softmax(logits,0))
print(torch.log_softmax(logits,0))

tensor([ 1.4000, -1.0000,  3.0000,  2.0000,     nan])
tensor([nan, nan, nan, nan, nan])
tensor([nan, nan, nan, nan, nan])


A nan in the logits propagates and affects the entire softmax and log_softmax tensors. The network basically gives up and says "no idea how to deal with this. 

This seems to be the case with most torch functions; e.g.  

In [159]:
print(torch.sum(logits,0))
print(torch.divide(logits,0.2))
print(logits + logits)

tensor(nan)
tensor([ 7., -5., 15., 10., nan])
tensor([ 2.8000, -2.0000,  6.0000,  4.0000,     nan])


### How do you interpret token entropy?

The token scores (when stacked) are a tensor of dimensions (batch_size, generated_length - 1, vocab_size). We take softmax to get a tensor of probability distributions across all possible tokens. We can also calculate the entropy of each of these probability distributions. 

Entropy is a measure of how "peaky" or "flat" a probability distribution is. It is the expected value of the self-information of an event, which is basically a measure of how "surprised" you would be if that event occured. 
If we have a discrete random variable $X$ with probability distribution $P(x) $, entropy is given by $$H(X) = \mathbb{E}_{ X\sim P} [I(x)] = -\mathbb{E}_{X \sim P} [\log P(x)] $$ which is practically calculated by $$H(X) =  -\sum_{x=-\infty}^\infty p(x) \log(p(x))$$

The lowest value of entropy is 0, which is when you have $P(x)=1$ for some event. 

In [82]:
Categorical(probs = torch.tensor([1,0,0,0])).entropy()

tensor(1.1921e-07)

High values of entropy occur when the probability distribution is very flat. 

In [85]:
print(Categorical(probs = torch.tensor([0.20,0.50,0.10,0.20])).entropy())  # spikier, lower entropy
print(Categorical(probs = torch.tensor([0.25,0.25,0.25,0.25])).entropy())  # flatter, higher entropy

tensor(1.2206)
tensor(1.3863)


In terms of tokens, we can show two realistic distributions below. The first has two likely tokens, one somewhat likely, and the rest unlikely. The second has many more likely tokens. 

In [93]:
l1 = [0.001,0.001,0.001,0.001,0.5,0.001,0.4,0.001,0.001,0.001,0.001,0.001,0.09]
l2 = [0.1,0.1,0.1,0.1,0.025,0.1,0.1,0.1,0.1,0.1,0.025,0.025,0.025]
print(Categorical(probs = torch.tensor(l1)).entropy())  # spikier, lower entropy
print(Categorical(probs = torch.tensor(l2)).entropy())  # flatter, higher entropy

tensor(0.9989)
tensor(2.4412)


There isn't a theoretical maximum for entropy, but for tokens you'll be governed by vocab size. Here we show some practical maximums for some different vocab sizes

In [105]:
v_size = 1000
print(Categorical(probs = torch.tensor([1/v_size for i in range(v_size)])).entropy())  # v small vocab
v_size = 10000
print(Categorical(probs = torch.tensor([1/v_size for i in range(v_size)])).entropy())  # small vocab
v_size = 50000
print(Categorical(probs = torch.tensor([1/v_size for i in range(v_size)])).entropy())  # ~BART vocab
v_size = 100000
print(Categorical(probs = torch.tensor([1/v_size for i in range(v_size)])).entropy())  # ~PEGASUS vocab

tensor(6.9078)
tensor(9.2103)
tensor(10.8198)
tensor(11.5129)


These can be a bit hard to interpret so maybe you should also look at some other token-level stats, like max_prob, second_max_prob, third_max_prob, mean, variance or other things like that. 

Some other things to note. First you still get an entropy value if your probability dist sums up to more than 1, so make sure to check this before doing entropy. Secondly if you have nan or inf in the probability values then you will get an error. This is true if you use either `probs` or `logits` in the Categorical function. 

In [112]:
print(Categorical(probs = torch.tensor([0.5,0.25,0.25,0.25])).entropy())       # sums to more than 1, gives result
#print(Categorical(probs = torch.tensor([0.5,0.25,0.25,torch.nan])).entropy())  # throws error
#print(Categorical(probs = torch.tensor([0.5,0.25,0.25,torch.inf])).entropy())  # throws error
#print(Categorical(logits = torch.tensor([0.5,0.25,0.25,torch.nan])).entropy())  # throws error

tensor(1.3322)


### How do you get nan and inf introduced into token scores?


My understanding is that you get -inf for the first (min_length - 1) steps when you introduce a min_length parameter for the generated sequences for the eos_token_id slot. This is to stop the token from appearing and truncating the sequence. 

You might also get -inf when setting other parameters to the `generate()` function (e.g. bad_words_ids or something similar).

Wrote an assert for this. 

Things to check 


* how do you get inf introduced into token scores?
* how big is the action space? 
  * initial estimate: on the order of vocab_size ^ sequence length (so bloody huge)
  * but only a very small proportion are valid actions
* how many pp has probs over: 1e-5, 1e-4, 1e-3, 1e-2, 1e-1. would be a good plot. x axis epoch, then ether do (a) for individual examples, or (b) as averages across examples
* log top X sampled sentences and their probs (maybe probs can be a graph)

* how does padding mask affect things?
* how does token-type-ids affect things? 
* which dist do you calculate KL divergence and entropy over? 
  * is it the token entropy’s at each generation step? 
  * is it the entropy of the generated paraphrase tokens? 
* given size of action space is this a good candidate for differential entropy? 
* when do you hit floating point threshold for token probabilities? When do nans and inf get introduced? 
* does using fp32 affect token calculations? 
* how does dropout affect generated probabilities? How does train/eval mode affect generated probs for a sentence? 
* how does layer-norm affect the probs? 

## Generation terms 

### Encoder_input_ids

In [232]:
### Decoder_input_ids

In [233]:
### token_type_ids

In [234]:
### attention_mask

In [235]:
### input_ids

In [None]:
### decoder_start_token_id

## Misc

### Log each generated word and the next few probs