# Cross encoder Transformer

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import load_dataset

model = AutoModelForSequenceClassification.from_pretrained(
    'cross-encoder/nli-roberta-base')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-roberta-base')

# Load datasets
train_dataset = load_dataset("snli", split='train')
test_dataset = load_dataset("snli", split='test')
val_dataset = load_dataset("snli", split='validation')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


In [2]:
# Filter out examples with label -1
train_dataset_filtered = train_dataset.filter(
    lambda example: example['label'] != -1)
test_dataset_filtered = test_dataset.filter(
    lambda example: example['label'] != -1)
val_dataset_filtered = val_dataset.filter(
    lambda example: example['label'] != -1)

Loading cached processed dataset at /Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-3fb44ea69c4768d5.arrow
Loading cached processed dataset at /Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-96d7b47e8248bd0b.arrow
Loading cached processed dataset at /Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-0183042e7957df42.arrow


In [3]:
# Tokenize, pad, and encode filtered sets
train_encodings_filtered = tokenizer(train_dataset_filtered['premise'], train_dataset_filtered['hypothesis'],
                                     padding=True, truncation=True, max_length=128, return_tensors='pt')
test_encodings_filtered = tokenizer(test_dataset_filtered['premise'], test_dataset_filtered['hypothesis'],
                                    padding=True, truncation=True, max_length=128, return_tensors='pt')
val_encodings_filtered = tokenizer(val_dataset_filtered['premise'], val_dataset_filtered['hypothesis'],
                                   padding=True, truncation=True, max_length=128, return_tensors='pt')

In [4]:
# Convert labels to numerical values
label_encoder = LabelEncoder()
train_labels_filtered_encoded = label_encoder.fit_transform(
    train_dataset_filtered['label'])
test_labels_filtered_encoded = label_encoder.transform(
    test_dataset_filtered['label'])
val_labels_filtered_encoded = label_encoder.transform(
    val_dataset_filtered['label'])

In [13]:
test_labels_filtered_encoded

array([1, 0, 2, ..., 2, 0, 1])

Testing the model on snli dataset

In [16]:
#First we see the configuration of the id2label to map our labels. 

from transformers import AutoModelForSequenceClassification, AutoConfig

config = AutoConfig.from_pretrained('cross-encoder/nli-roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-roberta-base')

print(config.id2label)


{0: 'contradiction', 1: 'entailment', 2: 'neutral'}


In [17]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Map ground truth labels to label names
label_mapping = { 0: 'contradiction', 1: 'entailment', 2: 'neutral'}
ground_truth_labels = [label_mapping[label] for label in test_labels_filtered_encoded]

# Define batch size and number of workers for data loader
batch_size = 16
num_workers = 2

# Define data loader
test_loader = DataLoader(list(zip(test_dataset_filtered['premise'], test_dataset_filtered['hypothesis'], ground_truth_labels)),
                         batch_size=batch_size, shuffle=False, num_workers=num_workers)

# Move model to device and enable data parallelism if using multiple GPUs
if torch.cuda.device_count() > 1:
    print(f'Using {torch.cuda.device_count()} GPUs')
    model = torch.nn.DataParallel(model)
model.to(device)

# Evaluate model on test set
model.eval()
with torch.no_grad():
    total_correct = 0
    total_samples = 0
    for batch in test_loader:
        inputs = tokenizer(batch[0], batch[1], padding=True,
                           truncation=True, max_length=128, return_tensors="pt")
        labels = batch[2]
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = model(**inputs)
        predicted_labels = [label_mapping[prediction]
                            for prediction in torch.argmax(outputs.logits, dim=1).tolist()]
        total_correct += sum(1 for i in range(len(predicted_labels))
                             if predicted_labels[i] == labels[i])
        total_samples += len(predicted_labels)

# Compute accuracy
accuracy = total_correct / total_samples
print(f'Total samples: {total_samples}')
print(f'Total correct: {total_correct}')
print(f'Accuracy: {accuracy:.2f}')


Using device: cpu
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
