# The Device

In [1]:
import torch

# Choose the compute device once: first CUDA device when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

We will use the GPU: Quadro RTX 8000


# utils

In [126]:
# One-time setup: uncomment to clone GlobEnc so that the `GlobEnc.src` imports below resolve.
# ! git clone https://github.com/mohsenfayyaz/GlobEnc

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Cloning into 'GlobEnc'...
remote: Enumerating objects: 161, done.[K
remote: Counting objects: 100% (161/161), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 161 (delta 88), reused 86 (delta 35), pack-reused 0 (from 0)[K
Receiving objects: 100% (161/161), 9.04 MiB | 3.29 MiB/s, done.
Resolving deltas: 100% (88/88), done.


In [13]:
import torch
import numpy as np
from transformers import AutoTokenizer
from GlobEnc.src.modeling.modeling_bert import BertForSequenceClassification, BertForMaskedLM
from GlobEnc.src.modeling.modeling_electra import ElectraForSequenceClassification
from GlobEnc.src.modeling.modeling_roberta import RobertaForSequenceClassification, RobertaForMaskedLM
from GlobEnc.src.attention_rollout import AttentionRollout

In [14]:
import numpy as np

# Task identifiers this notebook knows how to preprocess and evaluate.
BLIMP_TASKS = ["ana", "dna", "dnaa", "rpsv", "darn", "NA"]

# Number of output labels per task (two for every task listed above).
NUM_LABELS = {task_name: 2 for task_name in BLIMP_TASKS}

# Grammatical number -> integer label.
blimp_to_label = dict(singular=0, plural=1)

# Short model name -> checkpoint identifier handed to `from_pretrained`.
MODEL_PATH = dict(
    bert='bert-base-uncased',
    roberta='roberta-base',
    electra='google/electra-base-generator',
    deberta='microsoft/deberta-v3-base',
)

def blimp_to_features(data, tokenizer, max_length, input_masking, mlm, max_cues=10, target_device=None):
    """Turn word-level examples into padded, tensorized model inputs.

    Args:
        data: Iterable of examples. Each example is indexed by the keys
            ``'sentence_good'`` (iterated item by item and tokenized one item at a
            time), ``'cue_indices'`` (word positions of the cue words) and
            ``'target_index'`` (word position of the target word).
        tokenizer: Object providing ``encode(word, add_special_tokens=False)`` and
            the ``cls_token_id``, ``sep_token_id``, ``pad_token_id`` and
            ``mask_token_id`` attributes.
        max_length: Length to pad every sequence to, or ``None`` for no padding.
            Sequences already longer than ``max_length`` are left unpadded (they are
            not truncated).
        input_masking: If truthy, the first sub-token of the target word is replaced
            by the mask token.
        mlm: Not used by this function; callers pass it by keyword.
        max_cues: Width the ``cue_indices`` list is padded to with ``-1`` so that all
            rows have the same length. Lists that are already longer are left as is.
        target_device: Device the tensors are moved to. Defaults to the notebook-level
            ``device``.

    Returns:
        A list with one feature dict per example, or the dict itself when ``data``
        holds exactly one example. Each dict contains ``input_ids``,
        ``attention_mask``, ``token_type_ids`` (tensors), ``target_index`` (int,
        token position including the leading CLS token) and ``cue_indices``
        (list of token positions padded with ``-1``).

    Raises:
        ValueError: If an example's ``target_index`` does not match any word position.
    """
    if target_device is None:
        target_device = device  # notebook-level device selected in the first cell

    all_features = []
    for example in data:
        text = example['sentence_good']
        tokens = []
        cue_indices = []
        # Reset for every example: otherwise a missing target would silently reuse
        # the index computed for the previous example.
        target_index = None

        # Tokenize word by word, remembering the first sub-token position of
        # each cue word and of the target word.
        for w_ind, word in enumerate(text):
            ids = tokenizer.encode(word, add_special_tokens=False)
            if w_ind in example['cue_indices']:
                cue_indices.append(len(tokens))
            if w_ind == example['target_index']:
                target_index = len(tokens)
            tokens.extend(ids)

        if target_index is None:
            raise ValueError(
                f"target_index {example['target_index']!r} does not match any word position "
                f"in a sentence of {len(text)} words"
            )

        tokens = [tokenizer.cls_token_id] + tokens + [tokenizer.sep_token_id]
        # Shift by one because the CLS token was prepended.
        cue_indices = [x + 1 for x in cue_indices]
        target_index += 1
        if input_masking:
            tokens[target_index] = tokenizer.mask_token_id

        # Use one padding amount for all three tensors so their lengths always agree,
        # including when the sequence is longer than max_length.
        length = len(tokens)
        pad_len = 0 if max_length is None else max(0, max_length - length)

        inputs = {}
        inputs['input_ids'] = torch.tensor(tokens + [tokenizer.pad_token_id] * pad_len).to(target_device)
        inputs['attention_mask'] = torch.tensor([1] * length + [0] * pad_len).to(target_device)
        inputs['token_type_ids'] = torch.tensor([0] * (length + pad_len)).to(target_device)
        inputs['target_index'] = target_index

        # Batched as a 2-D tensor, all rows need the same length, so pad with -1.
        inputs['cue_indices'] = cue_indices + (max_cues - len(cue_indices)) * [-1]

        all_features.append(inputs)
    return all_features[0] if len(all_features) == 1 else all_features

# Task name -> feature-building function. Every task currently shares one routine.
PREPROCESS_FUNC = dict.fromkeys(
    ('ana', 'dna', 'dnaa', 'rpsv', 'darn', 'NA'),
    blimp_to_features,
)

In [15]:
# --- Run configuration -------------------------------------------------------
SEED = 42                      # passed to set_seed() and to the training-set shuffle

# What to run
MODEL_NAME = "roberta"         # key into MODEL_PATH
TASK = 'NA'                    # must be one of BLIMP_TASKS
FIXED = False                  # True -> tag says "pretrained", False -> "finetuned"
MLM = True                     # adds "_MLM" to the tag and selects the MLM evaluation branch
INPUT_MASKING = True           # replace the target token with the mask token
SELECTED_GPU = 0               # not referenced by the cells shown in this notebook

# Data shaping
MAX_LENGTH = 32                # padding length handed to the preprocessing function
PER_DEVICE_BATCH_SIZE = 1      # later cells index batches assuming a single sample

# Optimisation
NUM_TRAIN_EPOCHS = 1
LEARNING_RATE = 3e-5
LR_SCHEDULER_TYPE = 'linear'
WARMUP_RATIO = 0.1             # fraction of the training steps used for warm-up

import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader
from torch.nn import CrossEntropyLoss

from datasets import (
    load_dataset,
    load_from_disk,
    load_metric,
)

from transformers import (
    AutoConfig,
    AutoTokenizer,
    AdamW,
    get_scheduler,
    default_data_collator,
    set_seed,
)
set_seed(SEED)

# Load Dataset
# Fail loudly for unsupported tasks: printing and calling exit() does not stop the
# cell cleanly in a notebook, and the following lines would then hit undefined names.
if TASK not in BLIMP_TASKS:
    raise NotImplementedError(f"Task {TASK!r} is not implemented yet!")

data_path = f"./BLIMP Dataset/{MODEL_NAME}/"
data = load_from_disk(data_path)
train_data = data['train']
eval_data = data['test']

train_data = train_data.shuffle(seed=SEED)
num_labels = NUM_LABELS[TASK]

# Download Tokenizer & Model
config = AutoConfig.from_pretrained(MODEL_PATH[MODEL_NAME], num_labels=num_labels)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH[MODEL_NAME])

# NOTE: MODEL_PATH also lists "electra" and "deberta", but no masked-LM class is
# wired up for them in this cell.
if MODEL_NAME == "bert":
    model = BertForMaskedLM.from_pretrained(MODEL_PATH[MODEL_NAME], config=config)
elif MODEL_NAME == "roberta":
    model = RobertaForMaskedLM.from_pretrained(MODEL_PATH[MODEL_NAME], config=config)
else:
    # Raise instead of printing: otherwise `model.to(device)` below fails with a
    # confusing NameError.
    raise ValueError(f"model doesn't exist: {MODEL_NAME!r}")

model.to(device)

# Preprocessing
train_dataset = PREPROCESS_FUNC[TASK](train_data, tokenizer, MAX_LENGTH, input_masking=INPUT_MASKING, mlm=MLM)
eval_dataset = PREPROCESS_FUNC[TASK](eval_data, tokenizer, MAX_LENGTH, input_masking=INPUT_MASKING, mlm=MLM)

train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=PER_DEVICE_BATCH_SIZE)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=PER_DEVICE_BATCH_SIZE)

num_update_steps_per_epoch = len(train_dataloader)
max_train_steps = NUM_TRAIN_EPOCHS * num_update_steps_per_epoch

# Optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_scheduler(
    name=LR_SCHEDULER_TYPE,
    optimizer=optimizer,
    num_warmup_steps=WARMUP_RATIO * max_train_steps,
    num_training_steps=max_train_steps,
)

# metric & Loss
metric = load_metric("accuracy")
loss_fct = CrossEntropyLoss()

# Tag used in the checkpoint file name.
tag = "forseqclassification_"
tag += "pretrained" if FIXED else "finetuned"
if MLM:
    tag += "_MLM"

Loading cached shuffled indices for dataset at BLIMP Dataset/roberta/train/cache-3c138be14de5eff0.arrow


# Evaluation

In [158]:
# Restore the weights saved for this configuration. `map_location=device` remaps the
# stored tensors onto the device chosen in the first cell, so a checkpoint saved from
# a GPU still loads when the notebook is running on the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
model.load_state_dict(torch.load(f'{MODEL_NAME}_full_{tag}_epoch{NUM_TRAIN_EPOCHS}.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Accuracy of the loaded model on the evaluation set.
# Put the model in eval mode so dropout cannot make the scores non-deterministic.
model.eval()
for batch in eval_dataloader:
    if MLM:
        # NOTE(review): `blimp_to_features` as defined in this notebook does not add
        # 'good_token_id' / 'bad_token_id' to the features; confirm where they come from.
        good_token_id = batch.pop('good_token_id').to(device)
        bad_token_id = batch.pop('bad_token_id').to(device)
    batch.pop('cue_indices')  # removed so it is not forwarded to the model
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits

    if MLM:
        # Compare the score of the grammatical token against the ungrammatical one;
        # column 0 holds the "good" token, so the reference label is always 0.
        good_logits = logits[torch.arange(logits.size(0)), good_token_id]
        bad_logits = logits[torch.arange(logits.size(0)), bad_token_id]
        logits_of_interest = torch.stack([good_logits, bad_logits], dim=1)
        labels = torch.zeros(logits_of_interest.shape[0], dtype=torch.int64, device=device)
        predictions = torch.argmax(logits_of_interest, dim=-1)
        metric.add_batch(predictions=predictions, references=labels)
    else:
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch['labels'])

eval_metric = metric.compute()
# `epoch` is not defined in this notebook; report the epoch of the loaded checkpoint.
print(f"epoch {NUM_TRAIN_EPOCHS}: {eval_metric}")

# GlobEnc

The implementations were sourced from https://github.com/mohsenfayyaz/GlobEnc.

In [35]:
# Computing GlobEnc importance scores per instance for all layers
GE_per_layer_scores = list()

# Inference only: eval mode keeps the scores deterministic, and no_grad avoids
# building an autograd graph for every forward pass.
model.eval()
with torch.no_grad():
    for batch_sample in tqdm(eval_dataloader):
        batch_lengths = batch_sample['attention_mask'].sum(-1)
        logits, attentions, norms = model(
            batch_sample['input_ids'].to(device),
            batch_sample['attention_mask'].to(device),
            batch_sample['token_type_ids'].to(device),
            output_attentions=True,
            output_norms=True,
            return_dict=False,
        )
        num_layers = len(attentions)
        # Element 4 of each layer's norms is stacked across layers.
        # NOTE(review): presumably the N-Enc norm, going by the variable name; confirm against GlobEnc.
        norm_nenc = torch.stack([norms[i][4] for i in range(num_layers)]).squeeze().cpu().detach().numpy()

        # .item() requires exactly one sample per batch; keep only the non-padded positions.
        seq_len = batch_lengths.item()
        GE_per_layer_scores.append(norm_nenc[:, :seq_len, :seq_len])
    

  0%|          | 0/2208 [00:00<?, ?it/s]

# Alignment Metrics

Here, we compute two alignment metrics: Dot Product and Average Precision. First, let's define a function to compute Average Precision.

In [None]:
# Cleaning the tensors from -1 padding
def CI_cleaner(CI):
    """Return the leading part of a 1-D cue-index tensor, up to the first -1 pad.

    Args:
        CI: 1-D tensor of cue indices, right-padded with -1.

    Returns:
        The slice of ``CI`` before the first -1. If ``CI`` contains no -1 (the
        cue list filled the whole row), ``CI`` is returned unchanged instead of
        raising an IndexError.
    """
    pad_positions = torch.where(CI == -1)[0]
    if pad_positions.numel() == 0:
        return CI
    return CI[:pad_positions[0].item()]

# Calculating Precision
def precision(TP, FP):
    """Return TP / (TP + FP), or 0.0 when there are no predictions at all.

    Args:
        TP: Number of true positives.
        FP: Number of false positives.
    """
    predicted = TP + FP
    if predicted == 0:
        # Nothing predicted yet: avoid dividing by zero.
        return 0.0
    return TP / predicted

# Calculating Recall
def recall(TP, FN):
    """Return TP / (TP + FN), or 0.0 when there are no relevant items at all.

    Args:
        TP: Number of true positives.
        FN: Number of false negatives.
    """
    relevant = TP + FN
    if relevant == 0:
        # No relevant items: avoid dividing by zero.
        return 0.0
    return TP / relevant

# Calculating Average Precision
def avg_precision(topk, CI):
    """Average precision of a ranking against a set of relevant indices.

    Walks the ranking from best to worst and accumulates
    ``(recall_now - recall_before) * precision_now`` at every rank.

    Args:
        topk: Ranked candidate indices, best first (sequence or 1-D tensor).
        CI: Relevant (cue) indices, without padding (sequence or 1-D tensor).

    Returns:
        The average precision. Returns 0.0 when ``CI`` is empty (recall is
        undefined there and would otherwise divide by zero) and 0 when ``topk``
        is empty.
    """
    num_relevant = len(CI)
    if num_relevant == 0:
        return 0.0

    AP = 0
    hits = 0
    previous_recall = 0  # recall before the first rank
    for rank, candidate in enumerate(topk, start=1):
        if candidate in CI:
            hits += 1
        # TP + FN always equals len(CI) and TP + FP always equals the current rank.
        current_recall = hits / num_relevant
        AP += (current_recall - previous_recall) * (hits / rank)
        previous_recall = current_recall

    return AP

# Toy inputs for checking avg_precision by hand: the single cue (3) sits at rank 3
# of `topk`; the -1 entries in `CI` are padding.
topk = torch.tensor([1, 0, 3, 4, 2])
CI = torch.tensor([3] + 3 * [-1])
# CI = CI_cleaner(CI)
# print(avg_precision(topk, CI))

In [None]:
# Per-layer alignment between GlobEnc scores and the annotated cue tokens.
diagram_layers = range(12)
APs_GE = {f'layer{layer}': list() for layer in diagram_layers}

sum_GE_scores = 0

model.eval()
for i, batch_sample in enumerate(tqdm(eval_dataloader)):

    # [0]: because we only have one sample in each batch
    CI = CI_cleaner(torch.as_tensor(batch_sample['cue_indices'][0]))

    batch_lengths = batch_sample['attention_mask'].sum(axis=-1)
    seq_len = int(batch_lengths[0])
    mask_index = int(batch_sample['target_index'])  # mask_index = target_index

    # Contribution of every token to the representation of the target token,
    # for each layer. Shape: [num_layers, seq_len]
    GE_importance = torch.from_numpy(GE_per_layer_scores[i][:, mask_index])

    ### Average Precision
    # Rank all positions per layer, then drop the target position itself from the ranking.
    GE_importance_topk = torch.topk(GE_importance, k=seq_len, largest=True, dim=1).indices
    keep = GE_importance_topk != mask_index
    GE_importance_topk_filtered = GE_importance_topk[keep].view(GE_importance_topk.size(0), -1)

    for layer, layer_importance in enumerate(GE_importance_topk_filtered):
        APs_GE[f'layer{layer}'].append(avg_precision(layer_importance, CI))

    ### Dot Product
    # Remove the target column, then normalize each layer's scores to sum to 1.
    GE_scores = torch.cat((GE_importance[:, :mask_index], GE_importance[:, mask_index + 1:]), dim=1)
    GE_scores = GE_scores / GE_scores.sum(dim=-1, keepdim=True)

    # Removing the target column shifts only the positions after it by one.
    # Shift each cue index individually: shifting all of them whenever the last cue
    # lies after the target would mis-index cues located before the target.
    CI_shifted = torch.where(CI > mask_index, CI - 1, CI)

    # Total mass on the cue tokens (one or several). Shape: [num_layers, 1]
    CI_scores = GE_scores[:, CI_shifted].sum(dim=1, keepdim=True)

    sum_GE_scores += CI_scores

print("### Dot Product ###")
print(sum_GE_scores / len(eval_dataloader))

print("### Average Precision ###")
temp_list = [
    sum(APs_GE[f'layer{layer}']) / len(APs_GE[f'layer{layer}'])
    for layer in diagram_layers
]

print(temp_list)


In [None]:
# Recorded results, kept for reference. The section headers below match the strings
# printed by the alignment-metrics cell; each list holds 12 values (one per layer in
# `diagram_layers`).
# NOTE(review): presumably pasted from earlier runs with the pre-trained and the
# fine-tuned checkpoints respectively; confirm before citing these numbers.

# GlobEnc pre-trained
### Dot Product ###
[0.1049, 0.1069, 0.1134, 0.1020, 0.1128, 0.1199, 0.1050, 0.1192, 0.1213, 0.1047, 0.1144, 0.1303]
### Average Precision ###
[0.29353671136159953, 0.3428355231122479, 0.3661429698928716, 0.2674434673360707, 0.31858515302635293, 0.34495581585853935,
 0.2842268998074423, 0.34539070185299914, 0.3265184481704264, 0.2922642782829635, 0.3191821681466628, 0.352063390024211]

# GlobEnc Finetuned
### Dot Product ###
[0.1055, 0.1067, 0.1150, 0.1006, 0.1124, 0.1160, 0.1051, 0.1037, 0.1312, 0.0940, 0.1273, 0.0919]
### Average Precision ###
[0.29497986424836653, 0.34115687026569663, 0.3699041272101137, 0.2659901258731381, 0.30648929408720155, 0.32037714502900505,
 0.28131746623844195, 0.28891366741031027, 0.36654652620175787, 0.2702167037045549, 0.3625181160677095, 0.23063080852453038]