<a href="https://colab.research.google.com/github/singhvis29/Hands_On_LLM_WR/blob/main/Ch_10_Creating_Text_Embedding_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %%capture
# NOTE: each version specifier is quoted; unquoted, the shell would treat ">" as an
# output redirection and pip would never see the version constraint.
# !pip install -q "accelerate>=0.27.2" "peft>=0.9.0" "bitsandbytes>=0.43.0" "transformers>=4.38.2" "trl>=0.7.11" "sentencepiece>=0.1.99"
# !pip install -q "sentence-transformers>=3.0.0" "mteb>=1.1.2" "datasets>=2.18.0"

## Creating Embedding Model


### Data

In [3]:
from datasets import load_dataset

# Build the training set in one chained expression:
#   1. load the "train" split of the GLUE "mnli" config,
#   2. keep only the first 50,000 rows,
#   3. drop the "idx" column.
# Label encoding (as noted in the original notebook): 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns("idx")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
# Display one training record (index 2) to inspect the fields each example carries.
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Model

In [5]:
from sentence_transformers import SentenceTransformer

# Wrap the 'bert-base-uncased' checkpoint as a SentenceTransformer; this is the
# starting point that the training cells below fine-tune.
embedding_model = SentenceTransformer(model_name_or_path="bert-base-uncased")



In [6]:
from sentence_transformers import losses

# Softmax loss over sentence pairs. It has to be told how many classes to predict
# (3, matching the three label values 0/1/2 in the training data) and the size of
# the sentence embeddings, which is read from the model itself.
train_loss = losses.SoftmaxLoss(
    num_labels=3,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    model=embedding_model,
)

### Evaluation

In [7]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Evaluation data: the validation split of GLUE "stsb".
val_sts = load_dataset("glue", "stsb", split="validation")

# Compare embedding similarity (cosine) of each sentence pair against its gold score.
# Every label is divided by 5 — presumably to rescale the STS-B scores into [0, 1];
# confirm against the dataset card.
evaluator = EmbeddingSimilarityEvaluator(
    main_similarity="cosine",
    scores=list(map(lambda raw_score: raw_score / 5, val_sts["label"])),
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
)

### Training

In [8]:
import torch
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Training configuration for the softmax-loss run:
#   - checkpoints/logs are written under "base_embedding_model"
#   - a single epoch with batches of 32 for both training and evaluation
#   - 100 warmup steps, training loss logged every 100 steps
# Mixed precision (fp16) is only requested when a CUDA device is present, so the
# cell also runs (in full precision) on a CPU-only runtime instead of failing
# while building the arguments. On a GPU runtime this is identical to fp16=True.
# NOTE(review): `eval_steps` is set but no evaluation strategy is; per the
# Hugging Face TrainingArguments docs it only applies with a "steps" strategy —
# confirm whether periodic evaluation during training was intended.
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),
    eval_steps=100,
    logging_steps=100,
)

In [9]:
from sentence_transformers.trainer import SentenceTransformerTrainer

# Wire together the pieces defined in the earlier cells (model, training
# arguments, training data, loss and evaluator), then fine-tune.
# `trainer.train()` is kept as the final expression so its return value is
# shown as the cell output.
trainer = SentenceTransformerTrainer(
    args=args,
    model=embedding_model,
    loss=train_loss,
    train_dataset=train_dataset,
    evaluator=evaluator,
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Currently logged in as: [33msinghvis929[0m ([33msinghvis929-cvs-health[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])
  return forward_call(*args, **kwargs)


Step,Training Loss
100,1.0791
200,0.9442
300,0.8844
400,0.8464
500,0.828
600,0.8367
700,0.8193
800,0.7964
900,0.78
1000,0.7735


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=1563, training_loss=0.8168740641666542, metrics={'train_runtime': 356.2622, 'train_samples_per_second': 140.346, 'train_steps_per_second': 4.387, 'total_flos': 0.0, 'train_loss': 0.8168740641666542, 'epoch': 1.0})

In [10]:
# Evaluate our trained model: call the evaluator on it and let the notebook
# display the returned metrics (last expression of the cell).
evaluator(embedding_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.4967024909662864, 'spearman_cosine': 0.557356293890196}

### MTEB

In [11]:
# !pip install mteb

In [16]:
# Optional MTEB benchmark, left disabled in this notebook. To run it, uncomment
# the lines below; the `mteb` package must be installed first.
# from mteb import MTEB

# # Choose evaluation task
# evaluation = MTEB(tasks=["Banking77Classification"])

# # Calculate results
# results = evaluation.run(embedding_model)
# results

In [17]:
import gc

import torch

# Empty and delete trainer/model so the next experiment starts from a clean slate.
trainer.accelerator.clear()

# `train_loss` was constructed with `embedding_model`, so it presumably keeps the
# model reachable; it is deleted together with the trainer and the model so that
# the garbage collector can actually reclaim the weights. Later cells build a
# fresh loss before using the name again.
del trainer, embedding_model, train_loss

# Garbage collection and empty cache
gc.collect()
torch.cuda.empty_cache()

In [18]:
# Second cleanup pass: repeats the garbage collection and CUDA cache flush that
# close the preceding cell. Safe to re-run at any point between experiments.
import gc
import torch

# Collect unreachable Python objects first, then release cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

### Loss Functions

#### Cosine Similarity Loss

In [19]:
from datasets import Dataset, load_dataset

# Reload the same MNLI slice as before: first 50,000 rows of the GLUE "mnli"
# training split, without the "idx" column.
# Original label encoding: 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset = (
    load_dataset("glue", "mnli", split="train")
    .select(range(50_000))
    .remove_columns("idx")
)

# Collapse the three classes into a binary target:
# entailment -> 1, neutral / contradiction -> 0
mapping = {0: 1, 1: 0, 2: 0}

# Rebuild the dataset with the column names sentence1 / sentence2 / label,
# storing the binary target as a float.
train_dataset = Dataset.from_dict(
    {
        "sentence1": train_dataset["premise"],
        "sentence2": train_dataset["hypothesis"],
        "label": [float(mapping[mnli_label]) for mnli_label in train_dataset["label"]],
    }
)

In [20]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Same evaluation setup as for the softmax-loss model: sentence pairs from the
# GLUE "stsb" validation split, scored by cosine similarity of their embeddings.
val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    # Each gold label is divided by 5 before being handed to the evaluator.
    scores=[gold / 5 for gold in val_sts["label"]],
    main_similarity="cosine",
)

In [21]:
import torch
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model: start again from the untuned 'bert-base-uncased' checkpoint so
# this run is independent of the earlier softmax-loss experiment.
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function: cosine-similarity loss, trained here on the
# sentence1 / sentence2 / float-label dataset prepared for this section.
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments. Same schedule as the softmax-loss run, with a
# separate output directory.
# Mixed precision (fp16) is only requested when a CUDA device is present, so the
# cell also runs (in full precision) on a CPU-only runtime instead of failing
# while building the arguments. On a GPU runtime this is identical to fp16=True.
# NOTE(review): `eval_steps` is set but no evaluation strategy is; per the
# Hugging Face TrainingArguments docs it only applies with a "steps" strategy —
# confirm whether periodic evaluation during training was intended.
args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),
    eval_steps=100,
    logging_steps=100,
)

# Train model. `trainer.train()` is the last expression so its return value is
# displayed as the cell output.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  return forward_call(*args, **kwargs)


Step,Training Loss
100,0.2325
200,0.1706
300,0.1724
400,0.1599
500,0.153
600,0.1599
700,0.1512
800,0.1572
900,0.1495
1000,0.1479


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


TrainOutput(global_step=1563, training_loss=0.15793544149368297, metrics={'train_runtime': 383.0362, 'train_samples_per_second': 130.536, 'train_steps_per_second': 4.081, 'total_flos': 0.0, 'train_loss': 0.15793544149368297, 'epoch': 1.0})

In [22]:
# Evaluate the model trained with cosine-similarity loss using the same
# evaluator; the returned metrics are displayed as the cell output.
evaluator(embedding_model)

  return forward_call(*args, **kwargs)


{'pearson_cosine': 0.7354101595017892, 'spearman_cosine': 0.7376301175873421}