In [1]:
from sentence_transformers import SentenceTransformer, util, InputExample, losses
from torch.utils.data import DataLoader
import torch
import os, datetime

  from .autonotebook import tqdm as notebook_tqdm


Define Sentences & Pairs

In [2]:
SENTENCES = [
    "Book a flight from San Francisco to New York next Monday.",
    "Reserve a plane ticket from SFO to JFK for Monday.",
    "I love making pancakes on Sunday mornings.",
    "Schedule a meeting with the marketing team at 3 PM.",
    "Set up a 3pm marketing team meeting.",
    "The quick brown fox jumps over the lazy dog."
]

In [3]:
PAIRS_TO_CHECK = [
    (0, 1, "paraphrase"),
    (3, 4, "paraphrase"),
    (0, 3, "different tasks"),
    (1, 2, "unrelated"),
    (2, 5, "unrelated")
]


Baseline Embeddings

In [4]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(SENTENCES, convert_to_tensor=True, normalize_embeddings=True)

sims = util.cos_sim(embeddings, embeddings).cpu().numpy()

print("---- MiniLM Cosine Similarities ----")
for i, j, desc in PAIRS_TO_CHECK:
    print(f"{desc:14s}: {sims[i,j]:.3f}")


---- MiniLM Cosine Similarities ----
paraphrase    : 0.663
paraphrase    : 0.894
different tasks: 0.236
unrelated     : 0.131
unrelated     : 0.048


Matryoshka Embeddings (all sizes)

In [None]:
mat_model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code= True)
mat_embeddings = mat_model.encode(SENTENCES, convert_to_tensor=True, normalize_embeddings=True)

dims_to_test = [128, 256, 512, 768]

for d in dims_to_test:
    emb_d = mat_embeddings[:, :d]
    emb_d = emb_d / emb_d.norm(dim=1, keepdim=True)  # re-normalize
    sims_d = util.cos_sim(emb_d, emb_d).cpu().numpy()
    print(f"\n---- Matryoshka similarities at {d}d ----")
    for i, j, desc in PAIRS_TO_CHECK:
        print(f"{desc:14s}: {sims_d[i,j]:.3f}")


Fine-tuning with Matryoshka Losses

In [11]:
positive_pairs = [
    ("Book a flight from San Francisco to New York next Monday.",
     "Reserve a plane ticket from SFO to JFK for Monday."),
    ("Schedule a meeting with the marketing team at 3 PM.",
     "Set up a 3pm marketing team meeting."),
    ("Buy tickets to NYC next Monday from SFO.",
     "Reserve a plane ticket from SFO to JFK for Monday.")
]


In [13]:
train_samples = [InputExample(texts=[a, b]) for a, b in positive_pairs]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)

base_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Matryoshka-aware fine-tuning
m_dims = [128, 256, 384]
train_loss = losses.MatryoshkaLoss(
    model=base_model,
    loss=losses.MultipleNegativesRankingLoss(base_model),
    matryoshka_dims=m_dims
)

In [14]:
output_dir = os.path.join("output", "finetuned-" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

base_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=0,
    optimizer_params={"lr": 2e-5},
    output_path=output_dir,
    show_progress_bar=True,
)

                                                                     

Step,Training Loss


Compare before and after fine-tuning

In [15]:
tuned = SentenceTransformer(output_dir)

emb_before = model.encode(SENTENCES, convert_to_tensor=True, normalize_embeddings=True)
sims_before = util.cos_sim(emb_before, emb_before).cpu().numpy()

emb_after = tuned.encode(SENTENCES, convert_to_tensor=True, normalize_embeddings=True)
sims_after = util.cos_sim(emb_after, emb_after).cpu().numpy()

print("=== Before fine-tuning ===")
for i, j, desc in PAIRS_TO_CHECK:
    print(f"{desc:14s}: {sims_before[i,j]:.3f}")

print("\n=== After fine-tuning (full dim) ===")
for i, j, desc in PAIRS_TO_CHECK:
    print(f"{desc:14s}: {sims_after[i,j]:.3f}")


=== Before fine-tuning ===
paraphrase    : 0.663
paraphrase    : 0.894
different tasks: 0.236
unrelated     : 0.131
unrelated     : 0.048

=== After fine-tuning (full dim) ===
paraphrase    : 0.663
paraphrase    : 0.894
different tasks: 0.236
unrelated     : 0.131
unrelated     : 0.048


Verify Matryoshka behavior after fine-tuning

In [17]:
emb_after_full = tuned.encode(SENTENCES, convert_to_tensor=True, normalize_embeddings=True)

for d in [32, 64, 128, 256, 384]:
    e = emb_after_full[:, :d]
    e = e / e.norm(dim=1, keepdim=True)
    sims_d = util.cos_sim(e, e).cpu().numpy()
    print(f"\n=== After fine-tuning (Matryoshka {d}d) ===")
    for i, j, desc in PAIRS_TO_CHECK:
        print(f"{desc:14s}: {sims_d[i,j]:.3f}")



=== After fine-tuning (Matryoshka 32d) ===
paraphrase    : 0.633
paraphrase    : 0.912
different tasks: 0.005
unrelated     : 0.230
unrelated     : 0.128

=== After fine-tuning (Matryoshka 64d) ===
paraphrase    : 0.690
paraphrase    : 0.916
different tasks: 0.214
unrelated     : 0.102
unrelated     : 0.117

=== After fine-tuning (Matryoshka 128d) ===
paraphrase    : 0.727
paraphrase    : 0.916
different tasks: 0.200
unrelated     : 0.152
unrelated     : 0.074

=== After fine-tuning (Matryoshka 256d) ===
paraphrase    : 0.692
paraphrase    : 0.902
different tasks: 0.237
unrelated     : 0.121
unrelated     : 0.029

=== After fine-tuning (Matryoshka 384d) ===
paraphrase    : 0.663
paraphrase    : 0.894
different tasks: 0.236
unrelated     : 0.131
unrelated     : 0.048
