#### Saturday, June 8, 2024

[Training and Finetuning Embedding Models with Sentence Transformers v3](https://huggingface.co/blog/train-sentence-transformers)

This notebook was manually created from the above document. 

*** mamba activate ftllm ***

In [1]:
# only target the 4090 ...
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
# We need these next two statements, otherwise we get ...
# NotImplementedError: Using RTX 4000 series doesn't support faster communication broadband via P2P or IB. Please set `NCCL_P2P_DISABLE=\"1\"` and `NCCL_IB_DISABLE=\"1\" or use `accelerate launch` which will do this automatically."
# ... when we try to initialize SentenceTransformerTrainingArguments further on down ... 
os.environ["NCCL_P2P_DISABLE"]="1"
os.environ["NCCL_IB_DISABLE"]="1"

In [3]:
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

In [4]:
# 1. Load a model to finetune with 2. (Optional) model card data
model = SentenceTransformer(
    "microsoft/mpnet-base",
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="MPNet base trained on AllNLI triplets",
    )
)

# 7m 31.9s

No sentence-transformers model found with name microsoft/mpnet-base. Creating a new one with mean pooling.
Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 3. Load a dataset to finetune on
dataset = load_dataset("sentence-transformers/all-nli", "triplet")
train_dataset = dataset["train"].select(range(100_000))
eval_dataset = dataset["dev"]
test_dataset = dataset["test"]

# 37.3s

In [6]:
# 4. Define a loss function
loss = MultipleNegativesRankingLoss(model)

In [None]:
bf16 = torch.cuda.is_bf16_supported()

In [7]:
# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/mpnet-base-all-nli-triplet",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if GPU can't handle FP16
    bf16=False,  # Set to True if GPU supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicates
    # Optional tracking/debugging parameters:
    # eval_strategy="steps",
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="mpnet-base-all-nli-triplet",  # Used in W&B if `wandb` is installed
)


In [8]:
# 6. (Optional) Create an evaluator & evaluate the base model
dev_evaluator = TripletEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="all-nli-dev",
)
dev_evaluator(model)

{'all-nli-dev_cosine_accuracy': 0.6210510328068044,
 'all-nli-dev_dot_accuracy': 0.45337181044957475,
 'all-nli-dev_manhattan_accuracy': 0.6831713244228432,
 'all-nli-dev_euclidean_accuracy': 0.62226609963548,
 'all-nli-dev_max_accuracy': 0.6831713244228432}

In [9]:
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)

In [10]:
trainer.train()

# 58m 20.0s

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrobkayinto[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,All-nli-dev Cosine Accuracy,All-nli-dev Dot Accuracy,All-nli-dev Manhattan Accuracy,All-nli-dev Euclidean Accuracy,All-nli-dev Max Accuracy
100,2.6438,1.087817,0.771871,0.28144,0.793894,0.777491,0.793894
200,0.9332,0.838646,0.802552,0.219775,0.808627,0.802855,0.808627
300,1.2663,0.827029,0.810753,0.194866,0.805286,0.805741,0.810753
400,0.8073,0.84885,0.803615,0.195778,0.803919,0.797539,0.803919
500,0.716,1.023592,0.784933,0.219927,0.788275,0.781288,0.788275
600,0.9718,1.268849,0.778554,0.246051,0.779313,0.778706,0.779313
700,0.8126,1.36464,0.777035,0.225243,0.780529,0.776883,0.780529
800,1.0304,1.444258,0.751215,0.266859,0.752126,0.750152,0.752126
900,1.0763,1.103864,0.78068,0.214611,0.77825,0.779617,0.78068
1000,1.0168,1.137608,0.770352,0.226154,0.768834,0.768682,0.770352


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=6250, training_loss=0.7685923744735121, metrics={'train_runtime': 3500.0427, 'train_samples_per_second': 28.571, 'train_steps_per_second': 1.786, 'total_flos': 0.0, 'train_loss': 0.7685923744735121, 'epoch': 1.0})

In [11]:
# (Optional) Evaluate the trained model on the test set, after training completes
test_evaluator = TripletEvaluator(
    anchors=test_dataset["anchor"],
    positives=test_dataset["positive"],
    negatives=test_dataset["negative"],
    name="all-nli-test",
)
test_evaluator(model)

{'all-nli-test_cosine_accuracy': 0.9140565894991678,
 'all-nli-test_dot_accuracy': 0.08533817521561507,
 'all-nli-test_manhattan_accuracy': 0.9073990013617794,
 'all-nli-test_euclidean_accuracy': 0.9080042366469965,
 'all-nli-test_max_accuracy': 0.9140565894991678}

In [12]:
# 8. Save the trained model
model.save_pretrained("models/mpnet-base-all-nli-triplet/final")

In [None]:
# 9. (Optional) Push it to the Hugging Face Hub ... Nope!
# model.push_to_hub("mpnet-base-all-nli-triplet")