<a href="https://colab.research.google.com/github/rahiakela/genai-research-and-practice/blob/main/hands-on-large-language-models/10_fine_tuning_text_embedding_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Setup <img src="https://colab.google/static/images/icons/colab.png" width=100>

If you are viewing this notebook on Google Colab (or any other cloud vendor), you need to **uncomment and run** the following codeblock to install the dependencies for this chapter:

---

💡 **NOTE**: We will want to use a GPU to run the examples in this notebook. In Google Colab, go to
**Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4**.

---


In [1]:
%%capture
!pip install -q accelerate>=0.27.2 peft>=0.9.0 bitsandbytes>=0.43.0 trl>=0.7.11 sentencepiece>=0.1.99
!pip install -q mteb>=1.1.2 datasets>=2.18.0

In [2]:
%%capture

!pip install transformers==4.45.2 sentence-transformers==3.1.1

## **Supervised**

In [None]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,0.1573
200,0.1105
300,0.1199
400,0.1188
500,0.1083
600,0.1011
700,0.1196
800,0.0986
900,0.1041
1000,0.1052


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.10938053045681632, metrics={'train_runtime': 134.8494, 'train_samples_per_second': 370.784, 'train_steps_per_second': 11.591, 'total_flos': 0.0, 'train_loss': 0.10938053045681632, 'epoch': 1.0})

In [None]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.8495124079681606,
 'spearman_cosine': 0.8488879958339021,
 'pearson_manhattan': 0.8516629484867085,
 'spearman_manhattan': 0.8481690834154256,
 'pearson_euclidean': 0.8525580678201216,
 'spearman_euclidean': 0.8488879958339021,
 'pearson_dot': 0.8495124038161317,
 'spearman_dot': 0.8488866168293903,
 'pearson_max': 0.8525580678201216,
 'spearman_max': 0.8488879958339021}

In [None]:
# Evaluate the pre-trained model
original_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
evaluator(original_model)

{'pearson_cosine': 0.8696194518832261,
 'spearman_cosine': 0.8671631197908374,
 'pearson_manhattan': 0.8670399003909525,
 'spearman_manhattan': 0.8663946139224048,
 'pearson_euclidean': 0.8678715924178552,
 'spearman_euclidean': 0.8671631197908374,
 'pearson_dot': 0.8696194534675574,
 'spearman_dot': 0.8671631197908374,
 'pearson_max': 0.8696194534675574,
 'spearman_max': 0.8671631197908374}

⚠️ **VRAM Clean-up**
* `Restart` the notebook in order to clean-up memory if you move on to the next training example.

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Let's use latest model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# Loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
trainer.train()

dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,0.1433
200,0.0999
300,0.1025
400,0.1081
500,0.1008
600,0.0973
700,0.1064
800,0.0895
900,0.0988
1000,0.0997


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.09953958715144748, metrics={'train_runtime': 214.782, 'train_samples_per_second': 232.794, 'train_steps_per_second': 7.277, 'total_flos': 0.0, 'train_loss': 0.09953958715144748, 'epoch': 1.0})

In [None]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.8554223157032554,
 'spearman_cosine': 0.8553449301403165,
 'pearson_manhattan': 0.8567339288073884,
 'spearman_manhattan': 0.8541200777410832,
 'pearson_euclidean': 0.8577956371036369,
 'spearman_euclidean': 0.8553449301403165,
 'pearson_dot': 0.8554223171890211,
 'spearman_dot': 0.8553449301403165,
 'pearson_max': 0.8577956371036369,
 'spearman_max': 0.8553449301403165}

In [None]:
# Evaluate the pre-trained model
original_model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
evaluator(original_model)

{'pearson_cosine': 0.8769093039264211,
 'spearman_cosine': 0.8749513564823086,
 'pearson_manhattan': 0.8751285197515561,
 'spearman_manhattan': 0.8745437405426454,
 'pearson_euclidean': 0.8757108511901099,
 'spearman_euclidean': 0.8749513564823086,
 'pearson_dot': 0.8769093062424029,
 'spearman_dot': 0.8749513564823086,
 'pearson_max': 0.8769093062424029,
 'spearman_max': 0.8749513564823086}

## **Augmented SBERT**

**Step 1:** Fine-tune a cross-encoder

In [None]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

# Prepare a small set of 10000 documents for the cross-encoder
dataset = load_dataset("glue", "mnli", split="train").select(range(10_000))
mapping = {2: 0, 1: 0, 0:1}

# Data Loader
gold_examples = [
    InputExample(texts=[row["premise"], row["hypothesis"]], label=mapping[row["label"]])
    for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)

# Pandas DataFrame for easier data handling
gold = pd.DataFrame(
    {
    'sentence1': dataset['premise'],
    'sentence2': dataset['hypothesis'],
    'label': [mapping[label] for label in dataset['label']]
    }
)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Train a cross-encoder on the gold dataset
cross_encoder = CrossEncoder('bert-base-uncased', num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

**Step 2:** Create new sentence pairs

In [5]:
# Prepare the silver dataset by predicting labels with the cross-encoder
silver = load_dataset("glue", "mnli", split="train").select(range(10_000, 50_000))
pairs = list(zip(silver['premise'], silver['hypothesis']))

**Step 3:** Label new sentence pairs with the fine-tuned cross-encoder (silver dataset)

In [None]:
import numpy as np

# Label the sentence pairs using our fine-tuned cross-encoder
output = cross_encoder.predict(pairs, apply_softmax=True, show_progress_bar=True)
silver = pd.DataFrame(
    {
        "sentence1": silver["premise"],
        "sentence2": silver["hypothesis"],
        "label": np.argmax(output, axis=1)
    }
)

**Step 4:** Train a bi-encoder (SBERT) on the extended dataset (gold + silver dataset)

In [7]:
# Combine gold dataset + silver dataset
data = pd.concat([gold, silver], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [None]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="augmented_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

In [10]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
100,0.2139
200,0.1549
300,0.1434
400,0.1424
500,0.1365
600,0.1357
700,0.1359
800,0.1332
900,0.1348
1000,0.13


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.13995841155049135, metrics={'train_runtime': 470.9657, 'train_samples_per_second': 106.161, 'train_steps_per_second': 3.319, 'total_flos': 0.0, 'train_loss': 0.13995841155049135, 'epoch': 1.0})

In [11]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.6902064385460234,
 'spearman_cosine': 0.7069155163196936,
 'pearson_manhattan': 0.697413582703346,
 'spearman_manhattan': 0.6995833600141427,
 'pearson_euclidean': 0.6970010996048188,
 'spearman_euclidean': 0.6993577655524077,
 'pearson_dot': 0.6543270859073973,
 'spearman_dot': 0.6582544578271456,
 'pearson_max': 0.697413582703346,
 'spearman_max': 0.7069155163196936}

In [12]:
trainer.accelerator.clear()

[]

**Step 5**: Evaluate without silver dataset

In [None]:
# Combine gold + silver
data = pd.concat([gold], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="gold_only_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

In [14]:
trainer.train()

Step,Training Loss
100,0.2268
200,0.1714
300,0.16


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=313, training_loss=0.1852452541692569, metrics={'train_runtime': 75.5263, 'train_samples_per_second': 132.404, 'train_steps_per_second': 4.144, 'total_flos': 0.0, 'train_loss': 0.1852452541692569, 'epoch': 1.0})

In [15]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.6209081847030504,
 'spearman_cosine': 0.6476127013728358,
 'pearson_manhattan': 0.6525141117001739,
 'spearman_manhattan': 0.6616018035535549,
 'pearson_euclidean': 0.6507260750725726,
 'spearman_euclidean': 0.6601421566387636,
 'pearson_dot': 0.5484143063839936,
 'spearman_dot': 0.5462674143515209,
 'pearson_max': 0.6525141117001739,
 'spearman_max': 0.6616018035535549}

Compared to using both the silver and gold datasets, using only the gold dataset reduces the performance of the model!

⚠️ **VRAM Clean-up**
* `Restart` the notebook in order to clean-up memory if you move on to the next training example.

In [16]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

## **Unsupervised Learning**

**Tranformer-based Denoising AutoEncoder (TSDAE)**

In [None]:
# Download additional tokenizer
import nltk
nltk.download('punkt')

In [None]:
from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

# Create a flat list of sentences
mnli = load_dataset("glue", "mnli", split="train").select(range(25_000))
flat_sentences = mnli["premise"] + mnli["hypothesis"]

# Add noise to our input data
damaged_data = DenoisingAutoEncoderDataset(list(set(flat_sentences)))

# Create dataset
train_dataset = {"damaged_sentence": [], "original_sentence": []}
for data in tqdm(damaged_data):
    train_dataset["damaged_sentence"].append(data.texts[0])
    train_dataset["original_sentence"].append(data.texts[1])
train_dataset = Dataset.from_dict(train_dataset)

In [None]:
train_dataset[0]

In [None]:
# # Choose a different deletion ratio
# flat_sentences = list(set(flat_sentences))
# damaged_data = DenoisingAutoEncoderDataset(
#     flat_sentences,
#     noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s, del_ratio=0.6)
# )

In [None]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [None]:
from sentence_transformers import models, SentenceTransformer

# Create your embedding model
word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:
from sentence_transformers import losses

# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model, tie_encoder_decoder=True
)
train_loss.decoder = train_loss.decoder.to("cuda")

In [None]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="tsdae_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

In [None]:
# Evaluate our trained model
evaluator(embedding_model)

In [None]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()