<a href="https://colab.research.google.com/github/singhvis29/Hands_On_LLM_WR/blob/main/Ch_10_Creating_Text_Embedding_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %%capture
# !pip install -q accelerate>=0.27.2 peft>=0.9.0 bitsandbytes>=0.43.0 transformers>=4.38.2 trl>=0.7.11 sentencepiece>=0.1.99
# !pip install -q sentence-transformers>=3.0.0 mteb>=1.1.2 datasets>=2.18.0 mteb

## Creating Embedding Model


### Data

In [3]:
from datasets import load_dataset

# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Model

In [5]:
from sentence_transformers import SentenceTransformer

# Use a base model
embedding_model = SentenceTransformer('bert-base-uncased')



In [6]:
from sentence_transformers import losses

# Define the loss function. In soft-max loss, we will also need to explicitly set the number of labels.
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

### Evaluation

In [7]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
)

### Training

In [8]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

In [9]:
from sentence_transformers.trainer import SentenceTransformerTrainer

# Train embedding model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Currently logged in as: [33msinghvis929[0m ([33msinghvis929-cvs-health[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])
  return forward_call(*args, **kwargs)


Step,Training Loss
100,1.0791
200,0.9442
300,0.8844
400,0.8464
500,0.828
600,0.8367
700,0.8193
800,0.7964
900,0.78
1000,0.7735


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=1563, training_loss=0.8168740641666542, metrics={'train_runtime': 356.2622, 'train_samples_per_second': 140.346, 'train_steps_per_second': 4.387, 'total_flos': 0.0, 'train_loss': 0.8168740641666542, 'epoch': 1.0})

In [10]:
# Evaluate our trained model
evaluator(embedding_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.4967024909662864, 'spearman_cosine': 0.557356293890196}

### MTEB

In [11]:
# !pip install mteb

In [16]:
# from mteb import MTEB

# # Choose evaluation task
# evaluation = MTEB(tasks=["Banking77Classification"])

# # Calculate results
# results = evaluation.run(embedding_model)
# results

In [17]:
# Empty and delete trainer/model
trainer.accelerator.clear()
del trainer, embedding_model

# Garbage collection and empty cache
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [18]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

### Loss Functions

#### Cosine Similarity Loss

In [19]:
from datasets import Dataset, load_dataset

# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

# (neutral/contradiction)=0 and (entailment)=1
mapping = {2: 0, 1: 0, 0:1}
train_dataset = Dataset.from_dict({
    "sentence1": train_dataset["premise"],
    "sentence2": train_dataset["hypothesis"],
    "label": [float(mapping[label]) for label in train_dataset["label"]]
})

In [20]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [21]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  return forward_call(*args, **kwargs)


Step,Training Loss
100,0.2325
200,0.1706
300,0.1724
400,0.1599
500,0.153
600,0.1599
700,0.1512
800,0.1572
900,0.1495
1000,0.1479


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=1563, training_loss=0.15793544149368297, metrics={'train_runtime': 383.0362, 'train_samples_per_second': 130.536, 'train_steps_per_second': 4.081, 'total_flos': 0.0, 'train_loss': 0.15793544149368297, 'epoch': 1.0})

In [22]:
# Evaluate our trained model
evaluator(embedding_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.7354101595017892, 'spearman_cosine': 0.7376301175873421}

#### Multiple Negatives Ranking Loss

In [1]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [3]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

# # Load MNLI dataset from GLUE
mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli = mnli.filter(lambda x: True if x['label'] == 0 else False)

# Prepare data and add a soft negative
train_dataset = {"anchor": [], "positive": [], "negative": []}
soft_negatives = mnli["hypothesis"]

In [5]:
soft_negatives_list = list(soft_negatives) # Convert Column to a list
random.shuffle(soft_negatives_list) # Shuffle the list

for row, soft_negative in tqdm(zip(mnli, soft_negatives_list)): # Use the shuffled list
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negative)
train_dataset = Dataset.from_dict(train_dataset)
len(train_dataset)

16875it [00:01, 12815.49it/s]


16875

In [19]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from datasets import load_dataset

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')

# Convert sentence columns to lists to avoid TypeError with numpy integers in the evaluator
sentences1_list = list(val_sts["sentence1"])
sentences2_list = list(val_sts["sentence2"])
scores_list = [score/5 for score in val_sts["label"]] # Keep scores as a list

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=sentences1_list,
    sentences2=sentences2_list,
    scores=scores_list,
    main_similarity="cosine"
)

In [15]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msinghvis929[0m ([33msinghvis929-cvs-health[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


  return forward_call(*args, **kwargs)


Step,Training Loss
100,0.3312
200,0.1006
300,0.079
400,0.067
500,0.0725


  return forward_call(*args, **kwargs)


TrainOutput(global_step=528, training_loss=0.1275872692014232, metrics={'train_runtime': 567.7353, 'train_samples_per_second': 29.723, 'train_steps_per_second': 0.93, 'total_flos': 0.0, 'train_loss': 0.1275872692014232, 'epoch': 1.0})

In [18]:
# Evaluate our trained model

# Revert the change: do not convert sentences to lists.
# The EmbeddingSimilarityEvaluator expects dataset objects or similar.
# evaluator.sentences1 = list(evaluator.sentences1)
# evaluator.sentences2 = list(evaluator.sentences2)

evaluator(embedding_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.8104705915373014, 'spearman_cosine': 0.8122392153392219}

## Fine-Tuning

In [20]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

### Supervised

In [24]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')

# Convert sentence columns to lists to avoid TypeError with numpy integers in the evaluator
sentences1_list = list(val_sts["sentence1"])
sentences2_list = list(val_sts["sentence2"])
scores_list = [score/5 for score in val_sts["label"]] # Keep scores as a list

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=sentences1_list,
    sentences2=sentences2_list,
    scores=scores_list,
    main_similarity="cosine"
)

In [22]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])
  return forward_call(*args, **kwargs)


Step,Training Loss
100,0.1573
200,0.1105
300,0.1199
400,0.1188
500,0.1083
600,0.1011
700,0.1196
800,0.0987
900,0.1041
1000,0.1052


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=1563, training_loss=0.10937975250751791, metrics={'train_runtime': 115.2813, 'train_samples_per_second': 433.722, 'train_steps_per_second': 13.558, 'total_flos': 0.0, 'train_loss': 0.10937975250751791, 'epoch': 1.0})

In [25]:
# Evaluate our trained model
evaluator(embedding_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.849510960782187, 'spearman_cosine': 0.8489011249816341}

In [26]:
# Evaluate the pre-trained model
original_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
evaluator(original_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.8696194541131999, 'spearman_cosine': 0.8671631190200253}