# Creating Text Embedding Models

## SBERT


### Generating Contrastive Examples


In [None]:
# Dataset : GLUE [Task -> MNLI]
from datasets import load_dataset


In [None]:
# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction

train_dataset = load_dataset(
    "glue", "mnli", split = "train"
).select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

In [None]:
train_dataset[2]

### Train Model

In [None]:
# Using BERT base model as pretrained transformer that serves as embeddings individual words
import torch
from sentence_transformers import SentenceTransformer

# Use a base model
embedding_model = SentenceTransformer('bert-base-uncased')


In [None]:
# Loss function - Softmax
# Using BERT base model as pretrained transformer that serves as embeddings individual words

from sentence_transformers import losses

# Define the loss function. In softmax loss we will also need to explicitly set the number of labels.
train_loss = losses.SoftmaxLoss(
    model = embedding_model,
    sentence_embedding_dimension = embedding_model.get_sentence_embedding_dimension(),
    num_labels = 3
)

In [None]:
# Perform evaluation using Semantic Textual Similarity Benchmark (STSB)
# Process STSB data to make sure all values are between 0 and 1

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for STSB
val_sts = load_dataset("glue", "stsb", split = "validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = val_sts["sentence1"],
    sentences2 = val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity = "cosine"
)

In [None]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# defining the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = "base_embedding_model",
    num_train_epochs = 1,
    per_device_train_batch_size = 32,
    warmup_steps = 100,
    fp16 = False,
    eval_steps = 100,
    logging_steps = 100,
)

In [None]:
# Now that we have defined our data, embedding model, loss, and evaluator, we can
# start training our model. We can do that using SentenceTransformerTrainer

from sentence_transformers.trainer import SentenceTransformerTrainer

# Train embedding model
trainer = SentenceTransformerTrainer(
    model = embedding_model,
    args = args,
    train_dataset = train_dataset,
    loss = train_loss,
    evaluator = evaluator
)
trainer.train()

In [None]:
# Evaluate our trained model
evaluator(embedding_model)

## Loss Functions


### Cosine Similarity Loss

In [9]:
from datasets import Dataset, load_dataset

# Load MNLI dataset from GLUE
# 0 - Entailment ; 1 -> Neutral ; 2 -> Contradiction
train_dataset = load_dataset(
    "glue", "mnli", split = "train"
).select(range(50_000))

train_dataset = train_dataset.remove_columns("idx")


In [11]:
# (Neutral/Contradiction) -> 0 ; (Entailment) -> 1;
mapping = {2:0, 1:0, 0:1}
train_dataset = Dataset.from_dict({
    "sentence1": train_dataset["premise"],
    "sentence2" : train_dataset["hypothesis"],
    "label" : [float(mapping[label]) for label in train_dataset["label"]]
})

In [13]:
# Creating our evaluator

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset("glue", "stsb", split = "validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = val_sts["sentence1"],
    sentences2 = val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity = "cosine"
)

In [14]:
# Using a Cosine Similarity Loss Function

from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define model
embedding_model = SentenceTransformer("bert-base-uncased")

# Loss Function
train_loss = losses.CosineSimilarityLoss(model = embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = "cosineloss_embedding_model",
    num_train_epochs = 1,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    warmup_steps = 100,
    fp16 = False,
    eval_steps = 100,
    logging_steps = 100
)

# Train model
trainer =  SentenceTransformerTrainer(
    model = embedding_model,
    args = args,
    train_dataset = train_dataset,
    loss = train_loss,
    evaluator = evaluator
)

trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.2306, 'grad_norm': 1.4475200176239014, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 0.1707, 'grad_norm': 1.3311426639556885, 'learning_rate': 4.6582365003417636e-05, 'epoch': 0.13}
{'loss': 0.1703, 'grad_norm': 1.24318265914917, 'learning_rate': 4.316473000683528e-05, 'epoch': 0.19}
{'loss': 0.1583, 'grad_norm': 1.0497990846633911, 'learning_rate': 3.9747095010252904e-05, 'epoch': 0.26}
{'loss': 0.1518, 'grad_norm': 1.4255374670028687, 'learning_rate': 3.632946001367054e-05, 'epoch': 0.32}
{'loss': 0.1571, 'grad_norm': 1.2196274995803833, 'learning_rate': 3.291182501708818e-05, 'epoch': 0.38}
{'loss': 0.1511, 'grad_norm': 1.109902262687683, 'learning_rate': 2.9494190020505813e-05, 'epoch': 0.45}
{'loss': 0.1557, 'grad_norm': 1.4513516426086426, 'learning_rate': 2.6076555023923443e-05, 'epoch': 0.51}
{'loss': 0.1478, 'grad_norm': 1.866289496421814, 'learning_rate': 2.2658920027341084e-05, 'epoch': 0.58}
{'loss': 0.1461, 'grad_norm': 1.0433378219604492, 'learning_rate': 1.9

TrainOutput(global_step=1563, training_loss=0.1568457133024073, metrics={'train_runtime': 799.1686, 'train_samples_per_second': 62.565, 'train_steps_per_second': 1.956, 'total_flos': 0.0, 'train_loss': 0.1568457133024073, 'epoch': 1.0})

In [15]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.7228620455159176, 'spearman_cosine': 0.7249475950906865}

## Multiple Negative Ranking Loss


In [1]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

In [None]:
# Load MNLI dataset from GLUE
mnli = load_dataset("glue", "mnli", split = "train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli = mnli.filter(lambda x: True if x["label"] == 0 else False)
# mnli["hypothesis"]

['You lose the things to the following level if the people recall.',
 'A member of my team will execute your orders with immense precision.',
 'This information belongs to them.',
 "I'm upset that my walkman broke and now I have to turn the stereo up really loud.",
 "Slate had an opinion on Jackson's findings.",
 'I burst through the doors and fell down.',
 'Problems in data synthesis.',
 'You can see that on television, as well.',
 'We want to identify practices commonly used by agencies in the last 5 years',
 'The other men were shuffled around.',
 'It has been very intriguing.',
 'He returned slowly to the bunkhouse.',
 'The postal service could deliver less frequently.',
 'Another shift in the economy was found to be more nutritious.',
 'The rule has data collection requirements which aid the EPA to realize their emission control goals.',
 'The remains of Voltaire and Rousseau, Hugo and Zola, assassinated Socialist leader Jean Jaurys, and Louis Braille are all interred in the crypt

In [6]:
# Prepare data and add a soft negative
train_dataset = {
    "anchor": [],
    "positive": [],
    "negative": []
}
soft_negatives = mnli["hypothesis"]

random.shuffle(soft_negatives)

for row, soft_negative in tqdm(zip(mnli, soft_negatives)):
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negative)

train_dataset = Dataset.from_dict(train_dataset)



16875it [00:00, 59228.15it/s]


In [7]:
# Let's define the evaluator

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset("glue", "stsb", split = "validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = val_sts["sentence1"],
    sentences2 = val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity = "cosine"
)

In [8]:
# We train the model as before but with MNR loss instead

from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

In [10]:
# Define the model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss Function
train_loss = losses.MultipleNegativesRankingLoss(model = embedding_model)

# Define the training Arguments
args = SentenceTransformerTrainingArguments(
    output_dir = "mnrloss_embedding_model",
    num_train_epochs = 1,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    warmup_steps = 100,
    fp16 = False,
    eval_steps = 100,
    logging_steps = 100
)


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [11]:

# Train the model
trainer = SentenceTransformerTrainer(
    model = embedding_model,
    args = args,
    train_dataset = train_dataset,
    loss = train_loss,
    evaluator = evaluator
)

trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  0%|          | 0/528 [00:00<?, ?it/s]

{'loss': 0.3065, 'grad_norm': 5.35787296295166, 'learning_rate': 5e-05, 'epoch': 0.19}
{'loss': 0.1064, 'grad_norm': 4.354597568511963, 'learning_rate': 3.831775700934579e-05, 'epoch': 0.38}
{'loss': 0.0789, 'grad_norm': 3.512991189956665, 'learning_rate': 2.663551401869159e-05, 'epoch': 0.57}
{'loss': 0.0639, 'grad_norm': 1.0885411500930786, 'learning_rate': 1.4953271028037382e-05, 'epoch': 0.76}
{'loss': 0.0645, 'grad_norm': 5.52611780166626, 'learning_rate': 3.2710280373831774e-06, 'epoch': 0.95}
{'train_runtime': 321.8728, 'train_samples_per_second': 52.428, 'train_steps_per_second': 1.64, 'train_loss': 0.12088996804121768, 'epoch': 1.0}


TrainOutput(global_step=528, training_loss=0.12088996804121768, metrics={'train_runtime': 321.8728, 'train_samples_per_second': 52.428, 'train_steps_per_second': 1.64, 'total_flos': 0.0, 'train_loss': 0.12088996804121768, 'epoch': 1.0})

In [12]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.8023413256058616, 'spearman_cosine': 0.8056238363922879}

## Fine-Tuning an Embedding Model

### Supervised

In [8]:
import gc
import torch

gc.collect()
torch.mps.empty_cache()

In [9]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

In [10]:
# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction

train_dataset = load_dataset(
    "glue", "mnli", split = "train"
).select(range(50_000))

train_dataset

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 50000
})

In [11]:
train_dataset = train_dataset.remove_columns("idx")

In [12]:
# Create an embedding similarity evaluator for stsb
val_sts = load_dataset("glue", "stsb", split = "validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = val_sts["sentence1"],
    sentences2 = val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity = "cosine"
)

In [13]:
# Training the model

from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments


In [15]:
# Define the model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Loss Function
train_loss = losses.MultipleNegativesRankingLoss(model = embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = "finetuned_embedding_model",
    num_train_epochs = 1,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    warmup_steps = 100,
    fp16 = False,
    eval_steps = 100,
    logging_steps = 100
)

In [16]:
# Train model
trainer = SentenceTransformerTrainer(
    model = embedding_model,
    args = args,
    train_dataset = train_dataset,
    loss = train_loss,
    evaluator = evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


{'loss': 0.1618, 'grad_norm': 3.7395999431610107, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 0.113, 'grad_norm': 3.6550345420837402, 'learning_rate': 4.6582365003417636e-05, 'epoch': 0.13}
{'loss': 0.1237, 'grad_norm': 2.660911798477173, 'learning_rate': 4.316473000683528e-05, 'epoch': 0.19}
{'loss': 0.1195, 'grad_norm': 2.34393572807312, 'learning_rate': 3.9747095010252904e-05, 'epoch': 0.26}
{'loss': 0.1069, 'grad_norm': 5.350363731384277, 'learning_rate': 3.632946001367054e-05, 'epoch': 0.32}
{'loss': 0.1027, 'grad_norm': 1.264941930770874, 'learning_rate': 3.291182501708818e-05, 'epoch': 0.38}
{'loss': 0.1119, 'grad_norm': 2.589830160140991, 'learning_rate': 2.9494190020505813e-05, 'epoch': 0.45}
{'loss': 0.1039, 'grad_norm': 3.024691581726074, 'learning_rate': 2.6076555023923443e-05, 'epoch': 0.51}
{'loss': 0.1043, 'grad_norm': 2.005444288253784, 'learning_rate': 2.2658920027341084e-05, 'epoch': 0.58}
{'loss': 0.1046, 'grad_norm': 4.572895050048828, 'learning_rate': 1.9241285

TrainOutput(global_step=1563, training_loss=0.11080329965797664, metrics={'train_runtime': 187.8416, 'train_samples_per_second': 266.182, 'train_steps_per_second': 8.321, 'total_flos': 0.0, 'train_loss': 0.11080329965797664, 'epoch': 1.0})

In [17]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.8463357002991696, 'spearman_cosine': 0.8468585071608757}

### Augmented SBERT

In [1]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader


In [2]:
# Prepare a small set of 10_000 documents for the cross-encoder
dataset = load_dataset("glue", "mnli", split = "train").select(range(10_000))
mapping = {2:0, 1:0, 0:1}

In [3]:
# Data Loader
gold_examples = [
    InputExample(texts = [row["premise"], row["hypothesis"]], label = mapping[row["label"]])
    for row in tqdm(dataset)
]

gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size = 32)

# Pandas.DataFrame for easier data handling
gold = pd.DataFrame(
    {
        "sentence1": dataset["premise"],
        "sentence2": dataset["hypothesis"],
        "label" : [mapping[label] for label in dataset["label"]]
    }
)

100%|██████████| 10000/10000 [00:00<00:00, 73585.06it/s]


**# STEP-1 : Using this gold_dataset we train our cross-encoder**

In [6]:
from sentence_transformers.cross_encoder import CrossEncoder

In [10]:
# Train a cross_encoder on the gold dataset
cross_encoder = CrossEncoder("bert-base-uncased", num_labels = 2)
cross_encoder.fit(
    train_dataloader = gold_dataloader,
    epochs = 1,
    show_progress_bar = True,
    warmup_steps = 100,
    use_amp = False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/312 [00:00<?, ?it/s]

**STEP-2**
After training our cross-encoder, we use the remaining 400,000 sentence pairs (from our original dataset of 50,000 sentence pairs) as our silver dataset.

In [11]:
# Prepare the silver dataset by predicting labels with the cross-encoder
silver = load_dataset(
    "glue", "mnli", split = "train"
).select(range(10_000, 50_000))
pairs = list(zip(silver["premise"], silver["hypothesis"]))

Using the latest cached version of the dataset since glue couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'mnli' at /Users/qbit-glitch/.cache/huggingface/datasets/glue/mnli/0.0.0/bcdcba79d07bc864c1c254ccfcedcce55bcc9a8c (last modified on Sat Mar 15 18:27:34 2025).


**Step-3**: We will use our fine-tuned cross-encoder to label the sentence pairs


In [12]:
import numpy as np

# Label the sentence pairs using our fine-tuned corss-encoder
output = cross_encoder.predict(
    pairs, apply_softmax = True,
    show_progress_bar = True
)

silver = pd.DataFrame(
    {
        "sentence1" : silver["premise"],
        "sentence2": silver["hypothesis"],
        "label" : np.argmax(output, axis = 1)
    }
)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

Now that we have a silver and gold dataset, we simply combine them and train our embedding model.


In [13]:
# Combine gold + silver
data = pd.concat([gold, silver], ignore_index = True, axis = 0)
data = data.drop_duplicates(subset = ["sentence1", "sentence2"], keep = "first")
train_dataset = Dataset.from_pandas(data, preserve_index = False)

In [14]:
# Defining our evaluator
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

In [16]:
# Create an embedding similarity evaluator for stsb
val_sts = load_dataset("glue", "stsb", split = "validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = val_sts["sentence1"],
    sentences2 = val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity = "cosine"
)

We train the model as before except now we use the augmented dataset.

In [17]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments


In [18]:
# Define the model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss Function
train_loss = losses.CosineSimilarityLoss(model = embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = "augmented_embedding_model",
    num_train_epochs = 1,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    warmup_steps = 100,
    fp16 = False,
    eval_steps = 100,
    logging_steps = 100
)

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [19]:
# Train the model
trainer = SentenceTransformerTrainer(
    model = embedding_model,
    args = args,
    train_dataset = train_dataset,
    loss = train_loss,
    evaluator = evaluator
)

trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  0%|          | 0/1563 [00:00<?, ?it/s]

{'loss': 0.2177, 'grad_norm': 1.6327018737792969, 'learning_rate': 5e-05, 'epoch': 0.06}
{'loss': 0.159, 'grad_norm': 1.3607633113861084, 'learning_rate': 4.6582365003417636e-05, 'epoch': 0.13}
{'loss': 0.1417, 'grad_norm': 1.288422703742981, 'learning_rate': 4.316473000683528e-05, 'epoch': 0.19}
{'loss': 0.1435, 'grad_norm': 1.1119270324707031, 'learning_rate': 3.9747095010252904e-05, 'epoch': 0.26}
{'loss': 0.1409, 'grad_norm': 1.0912528038024902, 'learning_rate': 3.632946001367054e-05, 'epoch': 0.32}
{'loss': 0.1376, 'grad_norm': 1.5323593616485596, 'learning_rate': 3.291182501708818e-05, 'epoch': 0.38}
{'loss': 0.1351, 'grad_norm': 1.212894320487976, 'learning_rate': 2.9494190020505813e-05, 'epoch': 0.45}
{'loss': 0.1323, 'grad_norm': 1.1308387517929077, 'learning_rate': 2.6076555023923443e-05, 'epoch': 0.51}
{'loss': 0.1366, 'grad_norm': 1.2236522436141968, 'learning_rate': 2.2658920027341084e-05, 'epoch': 0.58}
{'loss': 0.1337, 'grad_norm': 1.0599873065948486, 'learning_rate': 1.

TrainOutput(global_step=1563, training_loss=0.14130667761511628, metrics={'train_runtime': 738.9236, 'train_samples_per_second': 67.663, 'train_steps_per_second': 2.115, 'total_flos': 0.0, 'train_loss': 0.14130667761511628, 'epoch': 1.0})

In [20]:
evaluator(embedding_model)

{'pearson_cosine': 0.6940822174538952, 'spearman_cosine': 0.7114926647534681}

## Unsupervised Learning: Transformer Based Sequential Denoising Auto-Encoder

In [2]:
# Download additional Tokenizer
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/qbit-
[nltk_data]     glitch/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/qbit-
[nltk_data]     glitch/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Create flat sentences from our data and remove any labels that we have to mimic an unsupervised setting

from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset


In [4]:
# Create a flat list of Sentences
mnli = load_dataset("glue", "mnli", split = "train").select(range(25_000))
flat_sentences = mnli["premise"] + mnli["hypothesis"]

# Add noise to out input data
damaged_data = DenoisingAutoEncoderDataset(list(set(flat_sentences)))

# Create dataset
train_dataset = {"damaged_sentence" : [], "original_sentence":[]}

for data in tqdm(damaged_data):
    train_dataset["damaged_sentence"].append(data.texts[0])
    train_dataset["original_sentence"].append(data.texts[1])

train_dataset = Dataset.from_dict(train_dataset)

100%|██████████| 48353/48353 [00:03<00:00, 15319.19it/s]


In [5]:
train_dataset[0]

{'damaged_sentence': 'I at to salaries',
 'original_sentence': "I'm looking at ways to raise salaries."}

In [8]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset("glue", "stsb", split = "validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1 = val_sts["sentence1"],
    sentences2 = val_sts["sentence2"],
    scores = [score/5 for score in val_sts["label"]],
    main_similarity = "cosine"
)

Next, we run the training as before but with the [CLS] token as the pooling strategy instead of the mean pooling of the token embeddings. In the TSDAE paper, this was shown to be more effective since mean pooling loses the position information, which is not the case when using the [CLS] token:

In [9]:
from sentence_transformers import models, SentenceTransformer

# Create your embedding model
word_embedding_model = models.Transformer("bert-base-uncased")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
embedding_model = SentenceTransformer(modules = [word_embedding_model, pooling_model])

Using our sentence pairs, we will need a loss function that attempts to reconstruct the original sentence using the noise sentence, namely DenoisingAutoEncoderLoss. By doing so, it will learn how to accurately represent the data. It is similar to masking but without knowing where the actual masks are.

In [10]:
from sentence_transformers import losses

# Use the denoising auto-encoder loss
train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model, tie_encoder_decoder = True
)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

Finally, training our model works the same as we have seen several times before but we lower the batch size as memory increases with this loss function.

In [11]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

In [12]:
# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = "tsdae_embedding_model",
    num_train_epochs = 1,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    warmup_steps = 100,
    fp16 = False,
    eval_steps = 100,
    logging_steps = 100
)

In [13]:
# Train model
trainer = SentenceTransformerTrainer(
    model = embedding_model,
    args = args,
    train_dataset = train_dataset,
    loss = train_loss,
    evaluator = evaluator
)

trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

  0%|          | 0/3023 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


{'loss': 7.1927, 'grad_norm': 9.87906265258789, 'learning_rate': 5e-05, 'epoch': 0.03}
{'loss': 5.0671, 'grad_norm': 7.317001819610596, 'learning_rate': 4.8289428669175505e-05, 'epoch': 0.07}
{'loss': 4.6236, 'grad_norm': 8.491228103637695, 'learning_rate': 4.657885733835101e-05, 'epoch': 0.1}
{'loss': 4.5012, 'grad_norm': 9.565860748291016, 'learning_rate': 4.486828600752652e-05, 'epoch': 0.13}
{'loss': 4.4797, 'grad_norm': 7.688411235809326, 'learning_rate': 4.315771467670202e-05, 'epoch': 0.17}
{'loss': 4.4926, 'grad_norm': 8.716361045837402, 'learning_rate': 4.1447143345877524e-05, 'epoch': 0.2}
{'loss': 4.4358, 'grad_norm': 16.072032928466797, 'learning_rate': 3.973657201505303e-05, 'epoch': 0.23}
{'loss': 4.4552, 'grad_norm': 82.69937133789062, 'learning_rate': 3.802600068422854e-05, 'epoch': 0.26}
{'loss': 4.4781, 'grad_norm': 14.16832160949707, 'learning_rate': 3.631542935340404e-05, 'epoch': 0.3}
{'loss': 4.5669, 'grad_norm': 10.393641471862793, 'learning_rate': 3.460485802257

TrainOutput(global_step=3023, training_loss=4.472961895204797, metrics={'train_runtime': 959.9004, 'train_samples_per_second': 50.373, 'train_steps_per_second': 3.149, 'total_flos': 0.0, 'train_loss': 4.472961895204797, 'epoch': 1.0})

In [14]:
# Evaluate our trained model
evaluator(embedding_model)

{'pearson_cosine': 0.7436534295971304, 'spearman_cosine': 0.7550700404676484}