# Cross-encoder Transformer and fine-tuning

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import load_dataset

model = AutoModelForSequenceClassification.from_pretrained(
    'cross-encoder/nli-roberta-base')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-roberta-base')

# Load datasets
train_dataset = load_dataset("snli", split='train')
test_dataset = load_dataset("snli", split='test')
val_dataset = load_dataset("snli", split='validation')

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
Found cached dataset snli (/Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


Preprocessing

In [4]:
# Filter out examples with label -1
train_dataset_filtered = train_dataset.filter(
    lambda example: example['label'] != -1)
test_dataset_filtered = test_dataset.filter(
    lambda example: example['label'] != -1)
val_dataset_filtered = val_dataset.filter(
    lambda example: example['label'] != -1)

Loading cached processed dataset at /Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-3fb44ea69c4768d5.arrow
Loading cached processed dataset at /Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-96d7b47e8248bd0b.arrow
Loading cached processed dataset at /Users/sarrabenyahia/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b/cache-0183042e7957df42.arrow


In [5]:
# Tokenize, pad, and encode filtered sets
train_encodings_filtered = tokenizer(
    train_dataset_filtered['premise'], train_dataset_filtered['hypothesis'], padding=True, truncation=True, max_length=128)
test_encodings_filtered = tokenizer(
    test_dataset_filtered['premise'], test_dataset_filtered['hypothesis'], padding=True, truncation=True, max_length=128)
val_encodings_filtered = tokenizer(
    val_dataset_filtered['premise'], val_dataset_filtered['hypothesis'], padding=True, truncation=True, max_length=128)

In [6]:
# Convert labels to numerical values
label_encoder = LabelEncoder()
train_labels_filtered_encoded = label_encoder.fit_transform(
    train_dataset_filtered['label'])
test_labels_filtered_encoded = label_encoder.transform(
    test_dataset_filtered['label'])
val_labels_filtered_encoded = label_encoder.transform(
    val_dataset_filtered['label'])

In [8]:
# Convert inputs and labels to list of dictionaries
train_inputs = {'input_ids': train_encodings_filtered['input_ids'],
                'attention_mask': train_encodings_filtered['attention_mask']}
train_data = []
for i in range(len(train_encodings_filtered['input_ids'])):
    train_data.append({key: torch.tensor(val[i])
                      for key, val in train_inputs.items()})
train_data = [{'input_ids': input['input_ids'], 'attention_mask': input['attention_mask'],
               'labels': label} for input, label in zip(train_data, train_labels_filtered_encoded)]

val_inputs = {'input_ids': val_encodings_filtered['input_ids'],
              'attention_mask': val_encodings_filtered['attention_mask']}
val_data = []
for i in range(len(val_encodings_filtered['input_ids'])):
    val_data.append({key: torch.tensor(val[i])
                    for key, val in val_inputs.items()})
val_data = [{'input_ids': input['input_ids'], 'attention_mask': input['attention_mask'],
             'labels': label} for input, label in zip(val_data, val_labels_filtered_encoded)]

test_inputs = {'input_ids': test_encodings_filtered['input_ids'],
               'attention_mask': test_encodings_filtered['attention_mask']}
test_data = []
for i in range(len(test_encodings_filtered['input_ids'])):
    test_data.append({key: torch.tensor(val[i])
                     for key, val in test_inputs.items()})
test_data = [{'input_ids': input['input_ids'], 'attention_mask': input['attention_mask'],
              'labels': label} for input, label in zip(test_data, test_labels_filtered_encoded)]

Finetuning the model

In [9]:
from sklearn.metrics import accuracy_score
import random

# Train the model on a smaller sample of the training data
random.seed(42)
train_data_sample = random.sample(train_data, k=10000)

# Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=500,
    load_best_model_at_end=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,  # increase the eval batch size
    num_train_epochs=1,
    metric_for_best_model='eval_loss',  # use eval_loss as the evaluation metric
    warmup_steps=0.1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_sample,
    eval_dataset=val_data,
    compute_metrics=lambda pred: {"accuracy": accuracy_score(
        pred.label_ids, pred.predictions.argmax(axis=1))}
)

trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1250
  Number of trainable parameters = 124647939


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 40%|████      | 500/1250 [18:07<27:23,  2.19s/it] ***** Running Evaluation *****
  Num examples = 9842
  Batch size = 32


{'loss': 0.4862, 'learning_rate': 3.0002400192015362e-05, 'epoch': 0.4}


                                                  
 40%|████      | 500/1250 [25:11<27:23,  2.19s/it]Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json


{'eval_loss': 0.3121122717857361, 'eval_accuracy': 0.8962609225767121, 'eval_runtime': 424.5116, 'eval_samples_per_second': 23.184, 'eval_steps_per_second': 0.726, 'epoch': 0.4}


Model weights saved in ./results/checkpoint-500/pytorch_model.bin
 80%|████████  | 1000/1250 [42:24<05:41,  1.36s/it]   ***** Running Evaluation *****
  Num examples = 9842
  Batch size = 32


{'loss': 0.3774, 'learning_rate': 1.0000800064005119e-05, 'epoch': 0.8}


                                                   
 80%|████████  | 1000/1250 [49:31<05:41,  1.36s/it]Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json


{'eval_loss': 0.3315350413322449, 'eval_accuracy': 0.90367811420443, 'eval_runtime': 427.1176, 'eval_samples_per_second': 23.043, 'eval_steps_per_second': 0.721, 'epoch': 0.8}


Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
100%|██████████| 1250/1250 [56:53<00:00,  2.15s/it]   

Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results/checkpoint-500 (score: 0.3121122717857361).
100%|██████████| 1250/1250 [56:54<00:00,  2.73s/it]

{'train_runtime': 3414.2151, 'train_samples_per_second': 2.929, 'train_steps_per_second': 0.366, 'train_loss': 0.41317313232421876, 'epoch': 1.0}





TrainOutput(global_step=1250, training_loss=0.41317313232421876, metrics={'train_runtime': 3414.2151, 'train_samples_per_second': 2.929, 'train_steps_per_second': 0.366, 'train_loss': 0.41317313232421876, 'epoch': 1.0})

Testing the model 

In [10]:
# Evaluate the model on test dataset
eval_result = trainer.evaluate(test_data)
print(eval_result)

***** Running Evaluation *****
  Num examples = 9824
  Batch size = 32
100%|██████████| 307/307 [04:45<00:00,  1.08it/s]

{'eval_loss': 0.32171836495399475, 'eval_accuracy': 0.8927117263843648, 'eval_runtime': 286.1031, 'eval_samples_per_second': 34.337, 'eval_steps_per_second': 1.073, 'epoch': 1.0}





In [11]:
# Extract the metric names and values from the evaluation result
metric_names = list(eval_result.keys())
metric_values = list(eval_result.values())

# Print the metric names and values
for name, value in zip(metric_names, metric_values):
    print('{}: {:.3f}'.format(name, value))

eval_loss: 0.322
eval_accuracy: 0.893
eval_runtime: 286.103
eval_samples_per_second: 34.337
eval_steps_per_second: 1.073
epoch: 1.000


Testing on the validation set 

In [13]:
# Evaluate the model on val dataset
eval_result_val = trainer.evaluate(val_data)
print(eval_result_val)

***** Running Evaluation *****
  Num examples = 9842
  Batch size = 32
100%|██████████| 308/308 [07:07<00:00,  1.39s/it]

{'eval_loss': 0.3121122717857361, 'eval_accuracy': 0.8962609225767121, 'eval_runtime': 429.2229, 'eval_samples_per_second': 22.93, 'eval_steps_per_second': 0.718, 'epoch': 1.0}





Visualization of the results

Let's test with sentences outside of the SNLI dataset

In [49]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    'results_final_non_overfitted_accuracy_metrics')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-roberta-base')

# Define the two example sentences to test
sentence1 = "The cat chased the mouse."
sentence2 = "The mouse was chased by the cat"

# Encode the two sentences
encoding = tokenizer(sentence1, sentence2, padding=True,
                     truncation=True, max_length=128, return_tensors='pt')

# Feed the encoding into the model and get the predicted label
outputs = model(**encoding)
predicted_label = torch.argmax(outputs.logits).item()

# Print the predicted label
print("The predicted label for the pair of sentences is:",
      predicted_label)  # Should be entailment -> 0

loading configuration file results_final_non_overfitted_accuracy_metrics/config.json
Model config RobertaConfig {
  "_name_or_path": "results_final_non_overfitted_accuracy_metrics",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "contradiction",
    "1": "entailment",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_ve

The predicted label for the pair of sentences is: 0


Saving the model

In [None]:
# Save the fine-tuned model
output_dir = "./results_final_non_overfitted_accuracy_metrics"
trainer.save_model(output_dir)

In [50]:
# Define the two example sentences to test
sentence1 = "The man is playing soccer."
sentence2 = "The woman is cooking dinner."

# Encode the two sentences
encoding = tokenizer(sentence1, sentence2, padding=True,
                     truncation=True, max_length=128, return_tensors='pt')

# Feed the encoding into the model and get the predicted label
outputs = model(**encoding)
predicted_label = torch.argmax(outputs.logits).item()

# Print the predicted label
print("The predicted label for the pair of sentences is:",
      predicted_label)  # Should be contradiction -> 2

The predicted label for the pair of sentences is: 2


In [51]:
sentence1 = "The bird is flying in the sky."
sentence2 = "The sky is blue."

encoding = tokenizer(sentence1, sentence2, padding=True,
                     truncation=True, max_length=128, return_tensors='pt')

outputs = model(**encoding)
predicted_label = torch.argmax(outputs.logits).item()

print("The predicted label for the pair of sentences is:",
      predicted_label)  # Should be neutral -> 1

The predicted label for the pair of sentences is: 1


In [52]:
sentence1 = "The boy is jumping on the trampoline."
sentence2 = "The girl is reading a book."

encoding = tokenizer(sentence1, sentence2, padding=True,
                     truncation=True, max_length=128, return_tensors='pt')

outputs = model(**encoding)
predicted_label = torch.argmax(outputs.logits).item()

# Print the predicted label
print("The predicted label for the pair of sentences is:",
      predicted_label)  # Should be contradiction -> 2

The predicted label for the pair of sentences is: 2
