#### Saturday, June 8, 2024

[Training and Finetuning Embedding Models with Sentence Transformers v3](https://huggingface.co/blog/train-sentence-transformers)

This notebook was manually created from the above document. 

**Environment:** `mamba activate ftllm`

This notebook was copied from 'sentence-transformers/train-sentence-transformers.ipynb' so that the original notebook retains its output. This copy is used to experiment with some of the settings and then compare the results with that first notebook.

In [1]:
# Expose only GPU 0 (the 4090) to CUDA; the variable is set before torch is imported.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Use CUDA when it is available, otherwise fall back to the CPU.
device_kind = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_kind)
device

device(type='cuda')

The next two cells were generated by ChatGPT (GPT-4o).

In [2]:
import torch

# Detect whether the first visible GPU can do FP16 (half-precision) arithmetic.
# The result is stored in `fp16`, which stays False when no CUDA device is found.
fp16 = False
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    compute_capability = torch.cuda.get_device_capability(0)  # (major, minor) tuple
    print(f"GPU Name: {gpu_name}")
    print(f"Compute Capability: {compute_capability}")
    # Native half-precision arithmetic requires compute capability 5.3 or higher,
    # so compare the full (major, minor) tuple instead of the major version only
    # (a major-only check would wrongly accept 5.0 and 5.2 devices).
    if compute_capability >= (5, 3):
        print("Your GPU supports FP16 (half-precision).")
        fp16 = True
    else:
        print("Your GPU does not support FP16 (half-precision).")
else:
    print("No CUDA-compatible GPU found.")


GPU Name: NVIDIA GeForce RTX 4090
Compute Capability: (8, 9)
Your GPU supports FP16 (half-precision).


In [3]:
import torch

# Detect BF16 (bfloat16) support on the first visible GPU; the result lands in `bf16`.
bf16 = False
if not torch.cuda.is_available():
    print("No CUDA-compatible GPU found.")
else:
    gpu_name = torch.cuda.get_device_name(0)
    compute_capability = torch.cuda.get_device_capability(0)
    print(f"GPU Name: {gpu_name}")
    print(f"Compute Capability: {compute_capability}")
    # Ampere architecture and above, i.e. compute capability (8, 0) or higher
    bf16 = compute_capability >= (8, 0)
    if bf16:
        print("Your GPU supports BF16 (bfloat16).")
    else:
        print("Your GPU does not support BF16 (bfloat16).")


GPU Name: NVIDIA GeForce RTX 4090
Compute Capability: (8, 9)
Your GPU supports BF16 (bfloat16).


In [4]:
# Both NCCL settings below are required; without them, initializing
# SentenceTransformerTrainingArguments further down fails with:
#   NotImplementedError: Using RTX 4000 series doesn't support faster communication
#   broadband via P2P or IB. Please set `NCCL_P2P_DISABLE="1"` and `NCCL_IB_DISABLE="1"`
#   or use `accelerate launch` which will do this automatically.
os.environ.update({"NCCL_P2P_DISABLE": "1", "NCCL_IB_DISABLE": "1"})

In [5]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

In [6]:
# 2. (Optional) Model card data attached to the model
card_data = SentenceTransformerModelCardData(
    language="en",
    license="apache-2.0",
    model_name="MPNet base trained on AllNLI triplets",
)

# 1. Load the model to finetune
model = SentenceTransformer("microsoft/mpnet-base", model_card_data=card_data)

# 7m 31.9s (recorded run time, presumably for this cell)

No sentence-transformers model found with name microsoft/mpnet-base. Creating a new one with mean pooling.
Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# 3. Load the AllNLI triplet dataset to finetune on
dataset = load_dataset("sentence-transformers/all-nli", "triplet")

# Train on the first 100,000 rows of "train"; "dev" and "test" are used in full.
train_dataset, eval_dataset, test_dataset = (
    dataset["train"].select(range(100_000)),
    dataset["dev"],
    dataset["test"],
)

# 37.3s (recorded run time, presumably for this cell)

In [8]:
# 4. Define the loss function, bound to the model being finetuned
loss = MultipleNegativesRankingLoss(model=model)

In [9]:
# Batch sizes used to override the training-argument defaults (both defaulted to 16).
# Larger values that were tried and turned out too big: 1024, 512, 256.
train_batch_size = eval_batch_size = 128


Override a few of the below defaults.

In [10]:
# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    # The "-128" suffix matches the directory the final model is saved to, so the
    # checkpoints of this experiment are kept apart from the original notebook's
    # "models/mpnet-base-all-nli-triplet" directory (save_total_limit rotates
    # checkpoints inside output_dir).
    output_dir="models/mpnet-base-all-nli-triplet-128",
    # Optional training parameters:
    num_train_epochs=1,
    # Override! The tutorial used 16 for both batch sizes.
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_ratio=0.1,
    # Override! The tutorial used fp16=True / bf16=False.
    # Setting both flags to True raises
    #   ValueError: At most one of fp16 and bf16 can be True, but not both
    # so use the capabilities detected earlier, preferring BF16 and falling back
    # to FP16 only when BF16 is not available. On the 4090 this resolves to
    # fp16=False, bf16=True.
    fp16=fp16 and not bf16,
    bf16=bf16,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    # Optional tracking/debugging parameters:
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # `evaluation_strategy` is kept because it is what this notebook ran with.
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    # Used in W&B if `wandb` is installed; suffixed so this run can be told apart
    # from the original notebook's run.
    run_name="mpnet-base-all-nli-triplet-128",
)


In [11]:
# 6. (Optional) Create an evaluator & evaluate the base model
# Map each evaluator argument to the matching column of the dev split.
dev_triplets = {
    "anchors": eval_dataset["anchor"],
    "positives": eval_dataset["positive"],
    "negatives": eval_dataset["negative"],
}
dev_evaluator = TripletEvaluator(name="all-nli-dev", **dev_triplets)

# Score the model before training; the metrics are shown as the cell output.
dev_evaluator(model)

{'all-nli-dev_cosine_accuracy': 0.6210510328068044,
 'all-nli-dev_dot_accuracy': 0.45337181044957475,
 'all-nli-dev_manhattan_accuracy': 0.6831713244228432,
 'all-nli-dev_euclidean_accuracy': 0.62226609963548,
 'all-nli-dev_max_accuracy': 0.6831713244228432}

In [12]:
# 7. Create a trainer (training itself is started in the next cell)
trainer_kwargs = dict(
    model=model,
    args=args,
    loss=loss,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=dev_evaluator,
)
trainer = SentenceTransformerTrainer(**trainer_kwargs)

In [13]:
# Run the training; the returned value is displayed as the cell output.
train_output = trainer.train()
train_output

# Recorded run times (presumably from two separate runs of this cell):
# 12m 41.0s
# 58m 20.0s

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrobkayinto[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,All-nli-dev Cosine Accuracy,All-nli-dev Dot Accuracy,All-nli-dev Manhattan Accuracy,All-nli-dev Euclidean Accuracy,All-nli-dev Max Accuracy
100,2.7507,1.232818,0.827461,0.174362,0.826245,0.8226,0.827461
200,1.6931,1.155465,0.845535,0.154162,0.84356,0.843104,0.845535
300,1.695,1.14455,0.859204,0.136847,0.853129,0.854648,0.859204
400,1.5986,1.003252,0.875759,0.116039,0.872418,0.871962,0.875759
500,1.5415,0.94553,0.883354,0.109812,0.878038,0.878797,0.883354
600,1.5021,0.860337,0.900516,0.094623,0.893682,0.89353,0.900516
700,1.3398,1.024029,0.893378,0.1048,0.88791,0.889581,0.893378


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=782, training_loss=1.6624924447530371, metrics={'train_runtime': 761.0282, 'train_samples_per_second': 131.401, 'train_steps_per_second': 1.028, 'total_flos': 0.0, 'train_loss': 1.6624924447530371, 'epoch': 1.0})

In [14]:
# (Optional) Evaluate the trained model on the test set, after training completes
# Map each evaluator argument to the matching column of the test split.
test_triplets = {
    "anchors": test_dataset["anchor"],
    "positives": test_dataset["positive"],
    "negatives": test_dataset["negative"],
}
test_evaluator = TripletEvaluator(name="all-nli-test", **test_triplets)

# The metrics are shown as the cell output.
test_evaluator(model)

{'all-nli-test_cosine_accuracy': 0.8955969133000454,
 'all-nli-test_dot_accuracy': 0.10062036616734756,
 'all-nli-test_manhattan_accuracy': 0.8875775457709184,
 'all-nli-test_euclidean_accuracy': 0.8895445604478741,
 'all-nli-test_max_accuracy': 0.8955969133000454}

In [15]:
# 8. Save the trained model
# The tutorial saved to "models/mpnet-base-all-nli-triplet/final"; this run uses
# a "-128" suffixed directory instead.
final_model_dir = "models/mpnet-base-all-nli-triplet-128/final"
model.save_pretrained(final_model_dir)

In [None]:
# 9. (Optional) Push it to the Hugging Face Hub -- intentionally skipped, so the call below stays commented out.
# model.push_to_hub("mpnet-base-all-nli-triplet")