<a href="https://colab.research.google.com/github/raviakasapu/LLM-Training-Docs/blob/main/02_Training_a_sentence_transformer_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install sentence-transformers

In [2]:
%%capture
!pip install wandb

In [3]:
%%capture
!pip install torch

In [4]:
import torch
# Pick the compute device: the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
%%capture
!pip install datasets

In [7]:
from datasets import load_dataset

from sentence_transformers import (
    SentenceTransformer,
    models,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)

# The sentence encoder is assembled from two modules that run in sequence.

# Module 1: an existing pretrained language model that yields token embeddings.
word_embedding_model = models.Transformer(model_name_or_path='distilroberta-base')

# Module 2: a pooling layer over those token embeddings, sized to the
# transformer's embedding width and otherwise left at its default settings.
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension()
)

# Chain both modules into one SentenceTransformer via the `modules` argument.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [28]:
from datasets import load_dataset

# Download the "triplet" configuration of the AllNLI dataset from the Hub.
dataset = load_dataset(path="sentence-transformers/all-nli", name="triplet")
# Alternative dataset id noted earlier but not used here: "embedding-data/sentence-compression"


README.md:   0%|          | 0.00/5.15k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/38.4M [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/782k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/810k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/557850 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/6584 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6609 [00:00<?, ? examples/s]

In [30]:
# Quick sanity check: size of the training split and its first record,
# emitted as two lines from a single print call.
print(
    f"- The dataset has {dataset['train'].num_rows} examples.",
    f"- Examples look like this: {dataset['train'][0]}",
    sep="\n",
)

- The dataset has 557850 examples.
- Examples look like this: {'anchor': 'A person on a horse jumps over a broken down airplane.', 'positive': 'A person is outdoors, on a horse.', 'negative': 'A person is at a diner, ordering an omelette.'}


In [31]:
from torch.utils.data import DataLoader

# Bind each split of the loaded dataset to its own name; the "dev" split is
# the one later handed to the trainer as the evaluation set.
train_dataset, eval_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "dev", "test")
)


In [40]:
from sentence_transformers.evaluation import TripletEvaluator

# Triplet evaluator over the dev split. Each dataset column ("anchor",
# "positive", "negative") is passed under the pluralised keyword the evaluator
# expects ("anchors", "positives", "negatives"); `name` prefixes the metric keys.
dev_evaluator = TripletEvaluator(
    name="all-nli-dev",
    **{
        f"{column}s": eval_dataset[column]
        for column in ("anchor", "positive", "negative")
    },
)

In [41]:
# Triplet evaluator over the held-out test split. Each dataset column ("anchor",
# "positive", "negative") is passed under the pluralised keyword the evaluator
# expects ("anchors", "positives", "negatives"); `name` prefixes the metric keys.
test_evaluator = TripletEvaluator(
    name="all-nli-test",
    **{
        f"{column}s": test_dataset[column]
        for column in ("anchor", "positive", "negative")
    },
)

In [44]:
# Score `model` on the dev triplets; the returned metrics dict is displayed as the cell output.
# NOTE(review): in file order this runs before trainer.train(), so it presumably reports the
# pre-fine-tuning baseline — confirm against the actual execution order.
dev_evaluator(model)


{'all-nli-dev_cosine_accuracy': 0.887302577495575}

In [43]:
# Score `model` on the test triplets; the returned metrics dict is displayed as the cell output.
# NOTE(review): in file order this runs before trainer.train(), so it presumably reports the
# pre-fine-tuning baseline — confirm against the actual execution order.
test_evaluator(model)

{'all-nli-test_cosine_accuracy': 0.9048267602920532}

In [32]:
from sentence_transformers import losses
from sentence_transformers.training_args import BatchSamplers

# Triplet objective over (anchor, positive, negative) inputs. The distance
# metric and margin are written out explicitly; they are the values the
# library documents as its defaults, so behaviour matches a bare
# `TripletLoss(model=model)` call.
train_loss = losses.TripletLoss(
    model=model,
    distance_metric=losses.TripletDistanceMetric.EUCLIDEAN,
    triplet_margin=5,
)

In [53]:
# Training configuration for the fine-tuning run: one epoch, batches of 16,
# with evaluation, checkpointing and logging every 100 optimisation steps.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/distilroberta-base-sentence-transformer",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    # FP16 mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing while the arguments are constructed.
    # Set to False by hand if your GPU reports that it cannot run FP16.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True (and fp16 to False) if you have a GPU that supports BF16
    # Kept from the original configuration. Avoiding duplicate samples in a batch
    # matters for losses that use in-batch negatives (e.g. MultipleNegativesRankingLoss);
    # this notebook trains with TripletLoss, which takes its negatives from the dataset.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # Reload the best checkpoint (by evaluation loss, the default criterion) when
    # training finishes, so the model saved afterwards is the best one rather than
    # whatever the final step produced. Requires matching eval/save schedules, as set above.
    load_best_model_at_end=True,
    logging_steps=100,
    run_name="distilroberta-base-sentence-transformer",  # Will be used in W&B if `wandb` is installed
)

In [54]:
# Wire the model, objective, configuration and data into a single trainer.
trainer = SentenceTransformerTrainer(
    # What is trained, and with which objective and settings.
    model=model,
    loss=train_loss,
    args=args,
    # Data used for optimisation and for the periodic evaluation passes.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Extra metric computed alongside the evaluation passes (dev triplets).
    evaluator=dev_evaluator,
)

In [55]:
# Run the fine-tuning loop with the trainer built above; the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,All-nli-dev Cosine Accuracy
100,0.5108,0.647751,0.947448
200,0.6647,0.644781,0.947904
300,0.6496,0.637041,0.948056
400,0.558,0.632136,0.948967
500,0.5273,0.629992,0.94836
600,0.5374,0.623408,0.949119
700,0.5382,0.621595,0.947297
800,0.5819,0.61898,0.94836
900,0.5833,0.611584,0.948967
1000,0.4391,0.6126,0.9476


TrainOutput(global_step=34866, training_loss=0.36080751075519246, metrics={'train_runtime': 11057.7559, 'train_samples_per_second': 50.449, 'train_steps_per_second': 3.153, 'total_flos': 0.0, 'train_loss': 0.36080751075519246, 'epoch': 1.0})

In [56]:
# Save the model currently held by the trainer; no target path is passed here.
# This is the best checkpoint only when `load_best_model_at_end=True` is set in the
# training arguments; otherwise it is the weights from the final training step.
trainer.save_model()

In [57]:
 # Push the trainer's model to the Hugging Face Hub under the given repository name.
 # NOTE(review): this needs Hub credentials to be configured beforehand — no login step is visible in this notebook.
trainer.model.push_to_hub("distilroberta-base-sentence-transformer_finetuned")

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

'https://huggingface.co/ravi259/distilroberta-base-sentence-transformer_finetuned/commit/f2b52c85af1afaa7d84e0a83b161c44f9dda91c2'