In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from tqdm import tqdm

def calculate_perplexity_with_bert(texts, tokenizer, model, device='cpu'):
    """
    Calculate the perplexity for a list of texts using BERT in a masked language modeling fashion.
    
    Args:
    texts (list of str): The texts to calculate perplexity for.
    tokenizer: The tokenizer compatible with the model.
    model: The language model (e.g., BERT).
    device (str): The device to run the calculations on ('cpu' or 'cuda').

    Returns:
    list of float: The perplexity values for each text.
    """
    model.eval()  # Set the model to evaluation mode
    model.to(device)  # Ensure the model is on the correct device
    perplexities = []

    for text in tqdm(texts, desc="Calculating Perplexity"):
        inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length').to(device)
        
        # Create labels by copying input_ids and masking a random word
        labels = inputs['input_ids'].clone()
        # Randomly mask one token in the input (other strategies can also be used)
        rand_index = torch.randint(0, inputs['input_ids'].size(1), (1,))
        inputs['input_ids'][0, rand_index] = tokenizer.mask_token_id

        with torch.no_grad():  # Disable gradient calculation
            outputs = model(**inputs, labels=labels)
            
            
            loss = outputs.loss
            perplexity = torch.exp(loss).item()  # Calculate perplexity
            perplexities.append(perplexity)
    
    return perplexities



In [7]:
device = 'cuda'
def calculate_perplexity(texts, tokenizer, model):
    model.eval()
    model.to(device)
    perplexities = []
    for text in tqdm(texts, desc="Calculating Perplexity"):
        inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length').to(device)
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss
            perplexity = torch.exp(loss).item()
            perplexities.append(perplexity)
    return perplexities


In [8]:
# Example usage
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Example texts
texts = ["Hello, how are you?", "The quick brown fox jumps over the lazy dog."]

# Calculate perplexities
perplexities = calculate_perplexity(texts, tokenizer, model)

print(perplexities)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Calculating Perplexity: 100%|█████████████████████| 2/2 [00:00<00:00, 14.25it/s]

[172285008.0, 24890076.0]





In [15]:
model.eval()
model.to(device)
perplexities = []
for text in tqdm(texts, desc="Calculating Perplexity"):
    inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length').to(device)
    print(inputs['input_ids'])
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs['input_ids'])
        print(outputs)
        loss = outputs.loss
        perplexity = torch.exp(loss).item()
        perplexities.append(perplexity)

Calculating Perplexity: 100%|█████████████████████| 2/2 [00:00<00:00, 29.30it/s]

tensor([[ 101, 7592, 1010, 2129, 2024, 2017, 1029,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]], device='cuda:0')
MaskedLMOutput(loss=tensor(18.9647, device='cuda:0'), logits=tensor([[[ -8.1436,  -8.0535,  -8.0647,  ...,  -7.036




In [11]:
perplexities

[172285008.0, 24890076.0]

In [17]:
inputs['input_ids']

tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [18]:
inputs['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

In [20]:
outputs = model(**inputs, labels=inputs['input_ids'])

In [22]:
outputs['loss']

tensor(17.0300, device='cuda:0', grad_fn=<NllLossBackward0>)

In [24]:
torch.exp(loss).item()

24890076.0