In [1]:

!pip install datasets==3.1.0
!pip install transformers==4.45.2 sentence-transformers==2.7.0

Collecting datasets==3.1.0
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.1.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==3.1.0)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets==3.1.0)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets==3.1.0)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloadin

In [2]:
import csv
import gzip
import logging
import math
import os
from datetime import datetime

import torch
from torch.utils.data import DataLoader

from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEF1Evaluator, CESoftmaxAccuracyEvaluator
from sentence_transformers.evaluation import SequentialEvaluator
from sentence_transformers.readers import InputExample

import numpy as np

from sentence_transformers import InputExample
from tqdm.autonotebook import tqdm
from sentence_transformers.evaluation import SentenceEvaluator




A Cross-Encoder is a type of neural network used for sentence pair classification tasks, such as:
- Text similarity scoring
- Reranking search results
- Natural Language Inference (NLI)

Unlike Bi-Encoders, which encode each input separately, Cross-Encoders process both input texts together, allowing deeper interaction between them.<br>

What is "BAAI/bge-reranker-base"? <br>

This is a pretrained model from BAAI (Beijing Academy of Artificial Intelligence), specifically designed for reranking tasks.


In [3]:
model = CrossEncoder("BAAI/bge-reranker-base")

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

Download the question answer data file.

In [4]:
!wget https://raw.githubusercontent.com/reza-rahim/published/refs/heads/main/Fine_tuningEmbeddings/data/generated_qa_pairs_detail.csv

--2025-02-03 04:01:23--  https://raw.githubusercontent.com/reza-rahim/published/refs/heads/main/Fine_tuningEmbeddings/data/generated_qa_pairs_detail.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80469 (79K) [text/plain]
Saving to: ‘generated_qa_pairs_detail.csv’


2025-02-03 04:01:23 (4.17 MB/s) - ‘generated_qa_pairs_detail.csv’ saved [80469/80469]



Reads a CSV file containing questions and answers.
Extracts "Question" and "Answer" from each row.
Stores them in a dictionary with numeric string keys.
The dictionary qa_dict can later be used for quick lookups of Q&A pairs.


In [29]:
# Initialize an index counter for question-answer pairs
ind = 0

# Create an empty dictionary to store question-answer pairs
qa_dict = {}

# Open the CSV file in read mode
with open('generated_qa_pairs_detail.csv', mode='r') as file:
    # Read the CSV file as a dictionary where column names are the keys
    csv_reader = csv.DictReader(file)

    # Iterate through each row in the CSV file
    for row in csv_reader:
        # Extract the "Question" and "Answer" fields and store them in a dictionary
        data = {"q": row['Question'], "a": row['Answer']}

        # Store the question-answer pair in qa_dict using the index as a string key
        qa_dict[str(ind)] = data

        # Increment the index for the next entry
        ind += 1


Creates a mix of entailment and contradiction pairs to train an NLI model.
Helps the model distinguish real question-answer pairs from unrelated ones.
Balances the dataset by ensuring both entailment and contradiction labels exist.

In [37]:
import random

import numpy as np

label2int = {"contradiction": 0, "entailment": 1}

train_samples = []
# Initialize an empty list to store training samples
train_samples = []

# Iterate over each question-answer pair in the dictionary
for key in qa_dict:

    # Extract and clean the question and answer by removing double quotes
    q = qa_dict[key]['q'].replace("\"", "")
    a = qa_dict[key]['a'].replace("\"", "")

    # Add a correct (entailment) question-answer pair to the training set
    train_samples.append(InputExample(texts=[q, a], label=label2int["entailment"]))

    # If the current key index is in the first half of the dataset
    if int(key) < ind / 2:

        # Select a random answer from the second half of the dataset
        random_number = random.randint(ind // 2, ind - 1)
        n = qa_dict[str(random_number)]['a'].replace("\"", "")
        # Treat this as another valid (entailment) pair
        train_samples.append(InputExample(texts=[q, n], label=label2int["contradiction"]))

    # If the current key index is in the second half of the dataset
    else:
        # Select a random answer from the first half of the dataset
        random_number = random.randint(0, (ind // 2) - 1)
        n = qa_dict[str(random_number)]['a'].replace("\"", "")
        # Treat this as a contradiction (incorrect question-answer pair)
        train_samples.append(InputExample(texts=[q, n], label=label2int["contradiction"]))


In [55]:
print(train_samples[0])

<InputExample> label: 1, texts: What is the publication date of the NVIDIA Corporation Annual Report 2024?;  The publication date of the NVIDIA Corporation Annual Report 2024 is February 21st, 2024.


In [56]:
print(train_samples[1])

<InputExample> label: 0, texts: What is the publication date of the NVIDIA Corporation Annual Report 2024?;  The breakdown of other long-term liabilities for NVIDIA Corporation as of January 29, 2023 is: Income tax payable ($1,361 million), Deferred income tax ($462 million), Deferred revenue ($573 million), Licenses payable ($80 million), and Other ($65 million), totaling $2,541 million.



### Define a custom evaluator class for Mean Squared Error (MSE) accuracy evaluation

In [57]:
from __future__ import annotations


# Import logging and set up a logger for debugging and tracking execution
logger = logging.getLogger(__name__)


# Define a custom evaluator class for Mean Squared Error (MSE) accuracy evaluation
class MSEAccuracyEvaluator(SentenceEvaluator):

    def __init__(self, dataloader: DataLoader, name: str = "", write_csv: bool = True, show_progress_bar: bool = True):
        """
        Initializes the evaluator.

        Parameters:
        - dataloader: DataLoader object containing the evaluation dataset.
        - name: (Optional) Name of the dataset (used for logging and file naming).
        - write_csv: Whether to write evaluation results to a CSV file.
        - show_progress_bar: Whether to display a progress bar during evaluation.
        """

        self.dataloader = dataloader
        self.name = name
        self.show_progress_bar = show_progress_bar

        # Define the CSV filename for storing evaluation results
        self.csv_file = "AccuracyEvaluator" + ("_" + name if name else "") + "_results.csv"
        self.csv_headers = ["epoch", "steps", "Accuracy"]
        self.write_csv = write_csv

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        """
        Runs the evaluation on the given model.

        Parameters:
        - model: The model to be evaluated.
        - output_path: (Optional) Directory to save evaluation results.
        - epoch: The current training epoch.
        - steps: The number of steps completed in the current epoch.

        Returns:
        - mse: The computed Mean Squared Error (MSE) score.
        """

        # Set the model to evaluation mode (disables dropout layers, etc.)
        model.model.eval()
        total = 0  # Counter for the number of batches
        loss_total = 0  # Accumulator for total loss

        # Determine the log message format based on epoch and step information
        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}:"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps:"
        else:
            out_txt = ":"

        # Log the start of evaluation
        logger.info("AccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt)

        # Define the loss function (Mean Squared Error) and activation function (Sigmoid)
        loss_fnc = torch.nn.MSELoss()
        activation_fnc = torch.nn.Sigmoid()

        # Use the model's smart batching function to handle input data batching
        self.dataloader.collate_fn = model.smart_batching_collate

        logger.info("Evaluation on the " + self.name + " dataset" + out_txt)

        # Iterate through the dataset and evaluate the model
        for features, labels in tqdm(self.dataloader, desc="Evaluation", smoothing=0.05, disable=not self.show_progress_bar):
            with torch.no_grad():  # Disable gradient calculation to save memory and speed up evaluation
                model_predictions = model.model(**features, return_dict=True)  # Get model predictions
                logits = activation_fnc(model_predictions.logits)  # Apply sigmoid activation

                # Reshape logits if model has a single output label
                if model.config.num_labels == 1:
                    logits = logits.view(-1)

                # Compute the MSE loss
                loss_value = loss_fnc(logits, labels)

            total += 1  # Increment batch counter
            loss_total += loss_value.cpu().item()  # Accumulate loss

        # Compute the final MSE score
        mse = loss_total / total

        # Print the MSE score
        print("MSE: {:.4f} ({}/{})\n".format(mse, loss_total, total))

        # Save evaluation results to a CSV file if output_path is provided
        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)

            # If the CSV file does not exist, create it and write headers
            if not os.path.isfile(csv_path):
                with open(csv_path, newline='', mode="w", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow(self.csv_headers)
                    writer.writerow([epoch, steps, mse])
            else:
                # Append the results to the existing CSV file
                with open(csv_path, newline='', mode="a", encoding="utf-8") as f:
                    writer = csv.writer(f)
                    writer.writerow([epoch, steps, mse])

        return mse  # Return the computed MSE score


In [58]:
train_batch_size = 16
num_epochs = 8

### Create Training DataLoader:


In [59]:
# We wrap train_samples, which is a list of InputExample, in a pytorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)


### Create Validation DataLoader:
### Initialize the Evaluator:

In [60]:
val_dataloader = DataLoader(train_samples, shuffle=False, batch_size=train_batch_size)

evaluator = MSEAccuracyEvaluator(val_dataloader,show_progress_bar=True , write_csv=True )

In [61]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logger.info(f"Warmup-steps: {warmup_steps}")

In [62]:
# Train the model
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=100,
    warmup_steps=warmup_steps,

)

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0253 (0.5817233053576203/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0030 (0.07011540474832678/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0000 (0.00018798727631041956/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0000 (4.902798561134958e-06/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0000 (2.3226694234601553e-06/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0000 (1.5129179600226195e-06/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0000 (1.2275457971222536e-06/23)



Iteration:   0%|          | 0/23 [00:00<?, ?it/s]

Evaluation:   0%|          | 0/23 [00:00<?, ?it/s]

MSE: 0.0000 (1.1415124709923674e-06/23)



In [63]:
model.save('./output/model')

In [64]:
finetune_model = CrossEncoder("./output/model")

In [65]:
sentences = [["What is the publication date of the NVIDIA Corporation Annual Report 2024?", "The publication date of the NVIDIA Corporation Annual Report 2024 is February 21st, 2024."]]
finetune_model.predict(sentences)

array([0.99996173], dtype=float32)

In [66]:
sentences = [["What is the publication date of the NVIDIA Corporation Annual Report 2024?", "The filing dates of the 10-K reports for NVIDIA Corporation in 2004 are May 20th, March 29th, and April 25th."]]
finetune_model.predict(sentences)

array([0.00584958], dtype=float32)

In [67]:
sentences = [["What is the publication date of the NVIDIA Corporation Annual Report 2024?", "GPUs are used to simulate human intelligence, enabling a deeper understanding of the physical world, and are essential for deep learning algorithms due to their parallel processing capabilities, supported by thousands of computing cores."]]
finetune_model.predict(sentences)

array([4.2816377e-05], dtype=float32)

In [68]:
finetune_model.push_to_hub("rezarahim/bge-reranker-base-finetuned")

README.md:   0%|          | 0.00/5.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rezarahim/bge-reranker-base-finetuned/commit/01e6e384160c95d032853ff68c70a21391275786', commit_message='Upload CrossEncoder', commit_description='', oid='01e6e384160c95d032853ff68c70a21391275786', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rezarahim/bge-reranker-base-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='rezarahim/bge-reranker-base-finetuned'), pr_revision=None, pr_num=None)