In [25]:
# NOTE: Adjust these variables to your setup: checkpoints_out_dir, dataset_subset, entropy_analysis_path
import torch

# dataset: name and configuration passed to `load_dataset`
dataset_name = 'clinc_oos'
dataset_subset = 'small'

# model: directory of the saved checkpoint that the tokenizer and model are loaded from
checkpoints_out_dir = '../checkpoints/clinc_small/checkpoint-15200'

# device: use the first CUDA GPU when one is visible, otherwise fall back to the CPU so
# the notebook still runs (more slowly) on machines without a GPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# pipeline
pipeline_task = 'text-classification'

# entropy analysis path: CSV file the per-example losses are written to
entropy_analysis_path = '../predictions/entropy/clinc_small.csv'

In [26]:
# Fetch the test split and expose its "intent" column under the name "label",
# which is the column the later cells read the gold labels from.
from datasets import load_dataset

test_data = load_dataset(
    dataset_name,
    dataset_subset,
    split = 'test',
).rename_column("intent", "label")

Found cached dataset clinc_oos (/work/pi_adrozdov_umass_edu/vpamidimukka_umass_edu/hf_cache/datasets/clinc_oos/small/1.0.0/abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the classifier are restored from the same saved checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoints_out_dir)

# Load the weights and place them on the configured device in one step
model = AutoModelForSequenceClassification.from_pretrained(checkpoints_out_dir).to(device)

# Switch to evaluation mode; kept as the cell's last expression so the notebook
# still displays the module structure
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [28]:
import torch
from torch.utils.data import DataLoader, TensorDataset

# Number of examples per batch
batch_size = 16

# Tokenize the whole test split in a single call, returning PyTorch tensors
test_encodings = tokenizer(test_data['text'], truncation=True, padding=True, return_tensors='pt')

# Gold labels as a tensor so they can travel alongside the encodings
label_tensor = torch.tensor(test_data["label"])

# Each dataset item is (input_ids, attention_mask, label); no shuffle argument is
# passed to the DataLoader
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    label_tensor,
)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [29]:
from torch.nn.functional import cross_entropy

# Per-example accumulators, filled in the order the dataloader yields examples
losses = []
predicted_labels = []
true_labels = []

# Nothing here needs gradients, so the entire pass runs with autograd disabled
with torch.no_grad():
    for batch in test_dataloader:
        # A batch is (input_ids, attention_mask, labels); move all three to the device
        input_ids, attention_mask, batch_true_labels = (t.to(device) for t in batch)

        # Forward pass; the predicted class is the index of the largest logit
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        batch_predicted_labels = logits.argmax(dim=1)

        # reduction='none' keeps one cross-entropy value per sentence rather than
        # collapsing the batch to a single number
        batch_loss = cross_entropy(logits, batch_true_labels, reduction='none')

        # Convert to plain Python lists before accumulating
        losses += batch_loss.tolist()
        predicted_labels += batch_predicted_labels.tolist()
        true_labels += batch_true_labels.tolist()

In [30]:
from pathlib import Path

import pandas as pd

# Save the per-example cross-entropy losses, together with the text and the
# true/predicted labels, as a CSV file.

# Read the text column once; indexing test_data['text'][i] inside a loop would
# re-read the whole column for every row.
texts = list(test_data['text'])

# Row i of the table must describe test example i. The accumulators were filled in
# dataloader order, and the DataLoader was created without a shuffle argument, so
# positions line up with the dataset as long as the lengths agree.
assert len(texts) == len(true_labels) == len(predicted_labels) == len(losses), \
    "predictions do not cover the test set one-to-one"

id2label = model.config.id2label

# Build the frame column-by-column so each column keeps its own dtype: the label
# indices stay integers instead of being coerced to floats by a mixed-type
# row-wise constructor followed by a transpose. Column order is the CSV layout.
df = pd.DataFrame({
    'Test_Data_Index': list(range(len(losses))),
    'Text': texts,
    'True Label': [id2label[idx] for idx in true_labels],
    'True_Label_Index': true_labels,
    'Predicted_Label_Index': predicted_labels,
    'Predicted Label': [id2label[idx] for idx in predicted_labels],
    'Entropy Loss': losses,
})

# Make sure the output directory exists before writing
Path(entropy_analysis_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(entropy_analysis_path, index = False)