Fine-Tuning Sentence Transformers for Embedding Search

In [1]:
# Install dependencies
!pip install -U "sentence-transformers[train]" accelerate datasets

Collecting sentence-transformers[train]
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m227.1/227.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m309.4/309.4 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m547.8/547.8 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 

In [2]:
# Import dependencies
import os
import json
import torch
import datasets
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import (
    SentenceTransformer, models,
    losses, util,
    InputExample, evaluation,
    SentenceTransformerTrainingArguments, SentenceTransformerTrainer
)
from accelerate import Accelerator
from datasets import load_dataset

In [3]:
# Load the dataset
data = load_dataset('sentence-transformers/stsb')
train_data = data['train'].select(range(100))
val_data = data['validation'].select(range(100, 140))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

In [4]:
# Example data from 5th record (taking randomly to just display)
print("Sentence 1: ", train_data['sentence1'][5], "\nSentence 2: ", train_data['sentence2'][5], "\nScore: ", train_data['score'][5])

Sentence 1:  Some men are fighting. 
Sentence 2:  Two men are fighting. 
Score:  0.85


In [5]:
# Get number of GPUs working
accelerator = Accelerator()
print(f"Using GPUs: {accelerator.num_processes}")

# Sentence Transformer BERT Model
word_embedding_model = models.Transformer('sentence-transformers/all-MiniLM-L6-v2')

# Applying pooling on final layer
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define loss
loss = losses.CoSENTLoss(model)

# Define evaluator for evaluation
evaluator = evaluation.EmbeddingSimilarityEvaluator(
        sentences1=val_data['sentence1'],
        sentences2=val_data['sentence2'],
        scores=val_data['score'],
        main_similarity=evaluation.SimilarityFunction.COSINE,
        name="sts-dev"
    )


# Training arguments
training_args = SentenceTransformerTrainingArguments(
        output_dir='./sbert-checkpoint', # Save checkpoints
        num_train_epochs=10,
        seed=33,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        fp16=True, # Loading model in mixed-precision
        warmup_ratio=0.1,
        evaluation_strategy="steps",
        eval_steps=2,
        save_total_limit=2,
        load_best_model_at_end=True,
        save_only_model=True,
        greater_is_better=True
    )


# Train model
trainer = SentenceTransformerTrainer(
        model=model,
        evaluator=evaluator,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        loss=loss
    )
trainer.train()



Using GPUs: 1




config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
2,No log,4.46436,0.91796,0.93662,0.908787,0.936715,0.91136,0.939554,0.875694,0.903585,0.91796,0.939554
4,No log,4.46436,0.91796,0.93662,0.908787,0.936715,0.91136,0.939554,0.875694,0.903585,0.91796,0.939554
6,No log,4.471012,0.918347,0.93662,0.909084,0.936715,0.911646,0.939554,0.875491,0.903585,0.918347,0.939554
8,No log,4.497251,0.918922,0.93662,0.909124,0.936715,0.911655,0.939554,0.875265,0.902828,0.918922,0.939554
10,No log,4.537925,0.919516,0.936242,0.908953,0.936242,0.911444,0.939554,0.875122,0.905573,0.919516,0.939554
12,No log,4.596074,0.919946,0.936242,0.908969,0.935863,0.911413,0.936999,0.874573,0.909833,0.919946,0.936999
14,No log,4.633837,0.919891,0.93662,0.908522,0.933686,0.910954,0.935863,0.874789,0.909454,0.919891,0.93662
16,No log,4.676612,0.920313,0.935958,0.90869,0.932834,0.91101,0.935768,0.875426,0.907277,0.920313,0.935958
18,No log,4.727691,0.920754,0.935011,0.909423,0.930278,0.911603,0.935768,0.875841,0.905478,0.920754,0.935768
20,No log,4.774985,0.920835,0.929805,0.910401,0.931698,0.912435,0.934632,0.876371,0.90652,0.920835,0.934632


TrainOutput(global_step=130, training_loss=2.112596951998197, metrics={'train_runtime': 22.7252, 'train_samples_per_second': 44.004, 'train_steps_per_second': 5.721, 'total_flos': 0.0, 'train_loss': 2.112596951998197, 'epoch': 10.0})

In [6]:
# save the model
model.save_pretrained("./sbert-model/")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [7]:
# List of products
products = [
    "Apple iPhone 15 (256GB) | Silver",
    "Nike Air Max 2024 | Blue/White",
    "Samsung Galaxy S24 Ultra (512GB) | Phantom Black",
    "Sony PlayStation 5 Console | Digital Edition",
    "Dell XPS 13 Laptop | Intel i7, 16GB RAM, 512GB SSD",
    "Fitbit Charge 6 | Midnight Blue",
    "Bose QuietComfort 45 Headphones | Triple Black",
    "Canon EOS R6 Camera | 20.1 MP Mirrorless",
    "Microsoft Surface Pro 9 | Intel i5, 8GB RAM, 256GB SSD",
    "Adidas Ultraboost 21 Running Shoes | Core Black",
    "Amazon Kindle Paperwhite | 32GB, Waterproof",
    "LG OLED65C1PUB 65\" 4K Smart TV",
    "Garmin Forerunner 955 Smartwatch | Slate Grey",
    "Google Nest Thermostat | Charcoal",
    "KitchenAid Stand Mixer | 5-Quart, Empire Red",
    "Dyson V11 Torque Drive Cordless Vacuum",
    "JBL Charge 5 Portable Bluetooth Speaker | Squad",
    "Panasonic Lumix GH5 Camera | 20.3 MP, 4K Video",
    "Apple MacBook Pro 14\" | M1 Pro, 16GB RAM, 1TB SSD",
    "Under Armour HeatGear Compression Shirt | Black/Red"
]

In [8]:
# Load fine-tuned model
model = SentenceTransformer('./sbert-model')

In [9]:
# Convert product names into embeddings
product_data = model.encode(products, convert_to_tensor=True)

In [10]:
# Function to get the user input and returns the top products along with their Cosine-Similarity scores
def search():
    query = input("Enter Query:\n")
    query_embeddings = model.encode([query], convert_to_tensor=True)
    hits = util.semantic_search(query_embeddings, product_data,
                                score_function=util.cos_sim)

    for i in range(5):
        best_search_term_id, best_search_term_core = hits[0][i]['corpus_id'], hits[0][i]['score']
        print("\nTop result: ", products[best_search_term_id])
        print("Score: ", best_search_term_core)

In [11]:
# Run the function
search()

Enter Query:
Samsung Galaxy S24

Top result:  Samsung Galaxy S24 Ultra (512GB) | Phantom Black
Score:  0.628572940826416

Top result:  Apple iPhone 15 (256GB) | Silver
Score:  0.4340556263923645

Top result:  Apple MacBook Pro 14" | M1 Pro, 16GB RAM, 1TB SSD
Score:  0.28146082162857056

Top result:  Dell XPS 13 Laptop | Intel i7, 16GB RAM, 512GB SSD
Score:  0.2760595381259918

Top result:  Microsoft Surface Pro 9 | Intel i5, 8GB RAM, 256GB SSD
Score:  0.2645168900489807


In [12]:
# Run the function
search()

Enter Query:
JBL Charge 5

Top result:  JBL Charge 5 Portable Bluetooth Speaker | Squad
Score:  0.7362818717956543

Top result:  Fitbit Charge 6 | Midnight Blue
Score:  0.4355762302875519

Top result:  KitchenAid Stand Mixer | 5-Quart, Empire Red
Score:  0.2378184199333191

Top result:  Bose QuietComfort 45 Headphones | Triple Black
Score:  0.23136910796165466

Top result:  Amazon Kindle Paperwhite | 32GB, Waterproof
Score:  0.20794913172721863


In [13]:
# Run the function
search()

Enter Query:
lg

Top result:  LG OLED65C1PUB 65" 4K Smart TV
Score:  0.4153483808040619

Top result:  Panasonic Lumix GH5 Camera | 20.3 MP, 4K Video
Score:  0.30891314148902893

Top result:  Fitbit Charge 6 | Midnight Blue
Score:  0.2598719000816345

Top result:  Amazon Kindle Paperwhite | 32GB, Waterproof
Score:  0.23383978009223938

Top result:  Apple iPhone 15 (256GB) | Silver
Score:  0.21892668306827545
