In [1]:
from datasets import load_dataset

train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

In [2]:
train_dataset[3]

{'premise': 'How do you know? All this is their information again.',
 'hypothesis': 'This information belongs to them.',
 'label': 0}

In [3]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('bert-base-uncased')

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


In [4]:
from sentence_transformers import losses

train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

In [5]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
)

In [6]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1, #1 for quickness
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=25,
    fp16=False, # can't use this percision for mps have to use default 32
    eval_steps=25,
    logging_steps=25,
)


In [7]:
from sentence_transformers.trainer import SentenceTransformerTrainer
import torch

#from accelerate import Accelerator

#accelerator = Accelerator(mixed_precision="no")  # Explicitly disable mixed precision
#args.local_rank = accelerator.local_rank  # Ensure local_rank is passed correctly

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
25,1.1151
50,1.1039
75,1.1093
100,1.0192
125,1.0209
150,1.0048
175,1.0411
200,0.9262
225,0.9197
250,0.986


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

RuntimeError: MPS backend out of memory (MPS allocated: 6.75 GB, other allocations: 11.31 GB, max allowed: 18.13 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [9]:
!pip install mteb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting mteb
  Downloading mteb-1.29.3-py3-none-any.whl.metadata (33 kB)
Collecting datasets<3.0.0,>=2.19.0 (from mteb)
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rich>=0.0.0 (from mteb)
  Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)
Collecting pytrec-eval-terrier>=0.5.6 (from mteb)
  Downloading pytrec_eval_terrier-0.5.6-cp311-cp311-macosx_10_9_universal2.whl.metadata (777 bytes)
Collecting eval_type_backport>=0.0.0 (from mteb)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting polars>=0.20.22 (from mteb)
  Downloading polars-1.19.0-cp39-abi3-macosx_11_0_arm64.whl.metadata (14 kB)
Collecting fsspec<=2024.6.1,>=2023.1.0 (from fsspec[http]<=2024.6.1,>=2023.1.0->datasets<3.0.0,>=2.19.0->mteb)
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=0.0.0->mteb)
  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (fro

In [6]:
from mteb import MTEB

evaluation = MTEB(tasks=["Banking77Classification"])

Passing task names as strings is deprecated and will be removed in the next release. Please use `tasks = mteb.get_tasks(tasks=[...])` method to get tasks instead.


In [7]:
results = evaluation.run(embedding_model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

### Cosine Loss (metrics)

In [6]:
from datasets import Dataset, load_dataset

train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

# neutral/contradiction = 0, entailment = 1
mapping = {2: 0, 1: 0, 0: 1}
train_dataset = Dataset.from_dict({
    "sentence1": train_dataset["premise"],
    "sentence2": train_dataset["hypothesis"],
    "label": [float(mapping[label]) for label in train_dataset["label"]]
})

In [7]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [8]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding = SentenceTransformer("bert-base-uncased")

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="cosineloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=50,
    fp16=False,
    eval_steps=50,
    logging_steps=50,
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Step,Training Loss
50,0.2606
100,0.1926
150,0.1831
200,0.1727
250,0.1765
300,0.1582
350,0.1731
400,0.1628
450,0.1667


RuntimeError: MPS backend out of memory (MPS allocated: 8.64 GB, other allocations: 9.44 GB, max allowed: 18.13 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

### Multiple negative ranking loss (MNR metrics) often refered as InfoNCE or NTXentLoss

In [1]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns('idx')
mnli = mnli.filter(lambda x: True if x["label"] == 0 else False)

train_dataset = {"anchor": [], "positive": [], "negative": []}
soft_negatives = mnli["hypothesis"]
random.shuffle(soft_negatives)
for row, soft_negative in tqdm(zip(mnli, soft_negatives)):
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negative)
train_dataset = Dataset.from_dict(train_dataset)

16875it [00:00, 47163.60it/s]


In [2]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [3]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer("bert-base-uncased")

train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=150,
    fp16=False,
    eval_steps=150,
    logging_steps=150,
)
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Step,Training Loss


RuntimeError: MPS backend out of memory (MPS allocated: 15.81 GB, other allocations: 2.30 GB, max allowed: 18.13 GB). Tried to allocate 44.06 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [1]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [3]:
# using a pretrained embedding model
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=False,
    eval_steps=100,
    logging_steps=100,
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Column 'hypothesis' is at index 1, whereas a column with this name is usually expected at index 0. Note that the column order can be important for some losses, e.g. MultipleNegativesRankingLoss will always consider the first column as the anchor and the second as the positive, regardless of the dataset column names. Consider renaming the columns to match the expected order, e.g.:
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,0.1618
200,0.113
300,0.1237
400,0.1195
500,0.1069
600,0.1027
700,0.1119
800,0.1039
900,0.1043
1000,0.1046


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.11080328623453777, metrics={'train_runtime': 416.4137, 'train_samples_per_second': 120.073, 'train_steps_per_second': 3.753, 'total_flos': 0.0, 'train_loss': 0.11080328623453777, 'epoch': 1.0})

In [4]:
evaluator(embedding_model) # 0.84 pearson_cosine is more accurate than others.

{'pearson_cosine': 0.8463356658567595, 'spearman_cosine': 0.8468585064081122}

In [1]:
# Gold Dataset represents Ground Truth
#  allows us to increase the size of datasets that you already have available
without the need to manually label hundreds of thousands of sentence pairs. 
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

dataset = load_dataset("glue", "mnli", split="train").select(range(2500)) # originally 10_000, I am on mps :(
mapping = {2: 0, 1: 0, 0: 1}

gold_examples = [
    InputExample(texts=[row["premise"], row["hypothesis"]], label=mapping[row["label"]]) for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)
# panda dataframe for easier data handling
gold = pd.DataFrame(
    {
        "sentence1": dataset["premise"],
        "sentence2": dataset["hypothesis"],
        "label": [mapping[label] for label in dataset["label"]]
    }
)

100%|████████████████████████████████████████████████████| 2500/2500 [00:00<00:00, 67202.63it/s]


In [2]:
from sentence_transformers.cross_encoder import CrossEncoder

cross_encoder = CrossEncoder("bert-base-uncased", num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/78 [00:00<?, ?it/s]

In [5]:
silver = load_dataset("glue", "mnli", split="train").select(range(2500, 3500)) # originally 10_000-50_000 on mps :(
pairs = list(zip(silver["premise"], silver["hypothesis"]))

In [6]:
import numpy as np

output = cross_encoder.predict(pairs, apply_softmax=True, show_progress_bar=True)
silver = pd.DataFrame(
    {
        "sentence1": silver["premise"],
        "sentence2": silver["hypothesis"],
        "label": np.argmax(output, axis=1)
    }
)

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

If you do not have any additional unlabeled sentence pairs, you can randomly sample them from your original gold dataset.
This strategy, however, likely generates significantly more dissimilar than similar pairs. Instead, we can use a pretrained embedding model to embed all candidate sentence pairs and retrieve the top-k sentences for each input sentence using semantic search.

In [7]:
# combine our gold and silver data
data = pd.concat([gold, silver], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=["sentence1", "sentence2"], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)

In [8]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [10]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

embedding_model = SentenceTransformer('bert-base-uncased')
train_loss =losses.CosineSimilarityLoss(model=embedding_model)
args = SentenceTransformerTrainingArguments(
    output_dir="augmented_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=False,
    eval_steps=100,
    logging_steps=100,
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


Step,Training Loss


RuntimeError: MPS backend out of memory (MPS allocated: 12.08 GB, other allocations: 5.99 GB, max allowed: 18.13 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [12]:
# evaluator(embedding_model) on average we get pearson_cosine score of .71 with 20% of data

# Unsupervised Embedding

### Transformer-Based Sequential Denoising Auto-Encoder
(training with unlabeled data) used for very little or no labeled data

In [None]:
!pip install nltk

In [3]:
import nltk
nltk.download()
nltk.download("punkt")
nltk.download("punkt_tab")

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


Downloader>  d



Download which package (l=list; x=cancel)?


  Identifier>  punkt_tab


    Downloading package punkt_tab to /Users/mathias/nltk_data...
      Unzipping tokenizers/punkt_tab.zip.



---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------


Downloader>  q


[nltk_data] Downloading package punkt to /Users/mathias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mathias/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [1]:
# create "noisy data"
from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

mnli = load_dataset("glue", "mnli", split="train").select(range(2000)) # we want 25_000
flat_sentences = mnli["premise"] + mnli["hypothesis"]

damage_data = DenoisingAutoEncoderDataset(list(set(flat_sentences)))
train_dataset = {"damaged_sentence": [], "original_sentence": []}
for data in tqdm(damage_data):
    train_dataset["damaged_sentence"].append(data.texts[0])
    train_dataset["original_sentence"].append(data.texts[1])
train_dataset = Dataset.from_dict(train_dataset)
train_dataset[5]

100%|█████████████████████████████████████████████████████| 3989/3989 [00:00<00:00, 6921.10it/s]


{'damaged_sentence': "uh work on eighty Am've uh since was new",
 'original_sentence': "uh i work on my own i've got an nineteen eighty Trans Am that i've had uh since it was new"}

In [2]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

val_sts = load_dataset("glue", "stsb", split="validation")
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine"
)

In [3]:
from sentence_transformers import models, SentenceTransformer

word_embedding_model = models.Transformer("bert-base-uncased")
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [4]:
# encoding embedding and decoding output share the same weights
# from the noisy data we attempt to reconstruct it, and measure how accurate we are
from sentence_transformers import losses
train_loss = losses.DenoisingAutoEncoderLoss(embedding_model, tie_encoder_decoder=True)
train_loss.decoder = train_loss.decoder.to("mps")

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [5]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="tsdae_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=False,
    eval_steps=100,
    logging_steps=100,
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,6.9448
200,4.9797


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=250, training_loss=5.68931201171875, metrics={'train_runtime': 175.9784, 'train_samples_per_second': 22.668, 'train_steps_per_second': 1.421, 'total_flos': 0.0, 'train_loss': 5.68931201171875, 'epoch': 1.0})

In [6]:
evaluator(embedding_model)

{'pearson_cosine': 0.6537921311994572, 'spearman_cosine': 0.6666186273004159}

### For unsupervised learning to work in a specific domain you have to use adaptive pretraining and adaptive fine-tuning (domain adaptation)

First, you can start with TSDAE to train an embedding model on
your target domain and then fine-tune it using either general supervised training or
Augmented SBERT.