Chapter 3. Comparison between humans and Transformer language models in plausibility processing patterns

In [1]:
import math
import sys

import numpy as np
import pandas as pd
import torch

In [2]:
def calculate_surprisal(sentence, target_word, model, tokenizer):
    """
    Calculate the surprisal (in bits) of a target word given its left context.

    The sentence is encoded with ``tokenizer``; each token id is decoded back
    to a string (whitespace removed) and the first token that equals
    ``target_word`` exactly is taken as the target. Only the token ids that
    precede the target are fed to ``model``, and the surprisal is the negative
    base-2 log-probability the model assigns to the target token id at the
    last prefix position.

    Parameters:
        sentence (str): The sentence within which the word exists.
        target_word (str): The specific word whose surprisal is to be calculated.
            It must correspond to a single token; a word that the tokenizer
            splits into several pieces is never matched.
        model (torch.nn.Module): Model called as ``model(ids)`` whose first
            output element is indexed as ``[batch, position, vocab]`` scores.
        tokenizer: Object providing ``encode(str)`` and ``decode(int)``.

    Returns:
        float or None: The surprisal of the target word, or ``None`` when it
        cannot be computed (the target is not found as a single token, or the
        target is the very first token so there is no prefix to condition on).
    """
    # Tokenize the sentence and recover a whitespace-free string per token id.
    indexed_tokens = tokenizer.encode(sentence)
    tokens = [tokenizer.decode(index).strip().replace(' ', '') for index in indexed_tokens]

    if target_word not in tokens:
        return None

    # First occurrence of the target word within the token sequence.
    target_word_location = tokens.index(target_word)

    # An empty prefix cannot be fed to the model (tokenizers that do not
    # prepend a special start token can put the target at position 0).
    if target_word_location == 0:
        return None

    # Everything to the left of the target, as a batch of size one.
    prefix_tensor = torch.tensor([indexed_tokens[:target_word_location]])

    # Make predictions with the model while excluding gradient computation.
    with torch.no_grad():
        output = model(prefix_tensor)
        # log_softmax avoids the underflow of softmax followed by log, which
        # would turn very unlikely targets into an infinite surprisal.
        log_probabilities = torch.nn.functional.log_softmax(output[0], dim=-1)
        target_log_probability = log_probabilities[0, -1, indexed_tokens[target_word_location]]

    # Convert natural-log probability to bits.
    return -float(target_log_probability) / math.log(2)

In [3]:
## Loading Models
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AlbertTokenizer, AlbertForMaskedLM
from transformers import BertTokenizer, BertLMHeadModel
from transformers import RobertaTokenizer, RobertaForMaskedLM

# Each model is paired with the tokenizer from the same checkpoint.
# `output_attentions=True` is a model option, so it is passed to the model
# loader for every architecture (it was previously given to the ALBERT
# tokenizer instead of the ALBERT model).

# GPT-2
tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_attentions=True)

# ALBERT
tokenizer_Albert = AlbertTokenizer.from_pretrained('albert-base-v2')
model_Albert = AlbertForMaskedLM.from_pretrained('albert-base-v2', output_attentions=True)

# BERT
# NOTE(review): transformers warns that BertLMHeadModel used standalone expects
# `is_decoder=True`; left unchanged here because it may alter the model's
# outputs - confirm which behaviour is intended.
tokenizer_BERT = BertTokenizer.from_pretrained('bert-base-uncased')
model_BERT = BertLMHeadModel.from_pretrained('bert-base-uncased', output_attentions=True)

# RoBERTa
tokenizer_roBERTa = RobertaTokenizer.from_pretrained('roberta-base')
model_roBERTa = RobertaForMaskedLM.from_pretrained('roberta-base', output_attentions=True)


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMaskedLM: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`


In [4]:
# Load the stimuli and extract the unique item IDs.
cunnings2018 = pd.read_csv("cunnings2018.csv")
unique_ids = np.unique(cunnings2018["id"])

# The four values of the 'cond' column; each item has one sentence per value.
CONDITIONS = ('pl_pl', 'pl_impl', 'impl_pl', 'impl_impl')

# Output column -> (model, tokenizer) used to fill it. The order here fixes
# the order in which the new columns are appended to the DataFrame.
SURPRISAL_MODELS = {
    'surprisal_Albert': (model_Albert, tokenizer_Albert),
    'surprisal_BERT': (model_BERT, tokenizer_BERT),
    'surprisal_gpt2': (model_gpt2, tokenizer_gpt2),
    'surprisal_roBERTa': (model_roBERTa, tokenizer_roBERTa),
}

# For every item, compute the surprisal of the critical verb in each of its
# four condition sentences with each model, and write the value back to the
# row it was computed from. Doing the write-back inside the same loop that
# selects the sentence guarantees a condition's score can never land on another
# condition's row (the ALBERT 'pl_impl' / 'impl_pl' scores used to be swapped).
for item_id in unique_ids:
    is_item = cunnings2018['id'] == item_id

    # The verb is read from the item's 'pl_impl' row and used for all four
    # sentences. `.item()` raises if there is not exactly one such row.
    verb = cunnings2018.loc[is_item & (cunnings2018['cond'] == 'pl_impl'), 'verb'].item()

    for cond in CONDITIONS:
        row_mask = is_item & (cunnings2018['cond'] == cond)
        sentence = cunnings2018.loc[row_mask, 'sent'].item()

        for column, (language_model, language_tokenizer) in SURPRISAL_MODELS.items():
            surprisal = calculate_surprisal(sentence, verb, language_model, language_tokenizer)
            # calculate_surprisal returns None when the verb cannot be located
            # as a single token; store NaN so the column stays numeric.
            cunnings2018.loc[row_mask, column] = np.nan if surprisal is None else surprisal


In [None]:
# Persist the stimuli table, now including the per-model surprisal columns.
# NOTE(review): with pandas' default arguments the row index is written as an
# extra leading column - confirm downstream readers expect that.
cunnings2018.to_csv('surprisals.csv')