In [1]:
! pip install -Uq torch tensorboard sentence-transformers datasets transformers

In [2]:
import wandb
# Initialise Weights & Biases in "disabled" mode before any trainer is created.
# NOTE(review): presumably this keeps the trainer's wandb integration from
# prompting for a login or uploading a run — confirm.
wandb.init(mode="disabled")

#### **Create and Prepare embedding dataset**

In [3]:
from datasets import load_dataset

# Fetch the "train" split of the financial RAG question/context pairs.
dataset = load_dataset(
    "philschmid/finanical-rag-embedding-dataset",
    split="train",
)
dataset

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


Dataset({
    features: ['question', 'context'],
    num_rows: 7000
})

In [4]:
# Map the raw column names onto the roles used by the rest of the notebook:
# the question becomes the "anchor" and its context the "positive".
dataset = (
    dataset
    .rename_column("question", "anchor")
    .rename_column("context", "positive")
)

In [5]:
# Attach a sequential integer "id" (0 .. N-1) to every row. The same id later
# keys both the query and its matching corpus document.
row_ids = range(len(dataset))
dataset = dataset.add_column("id", row_ids)
dataset

Dataset({
    features: ['anchor', 'positive', 'id'],
    num_rows: 7000
})

In [6]:
# Hold out 10% of the rows as a test set.
# A fixed seed makes the shuffle deterministic, so "Restart & Run All" yields the
# same train/test partition (and therefore comparable metrics) on every run.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive', 'id'],
        num_rows: 6300
    })
    test: Dataset({
        features: ['anchor', 'positive', 'id'],
        num_rows: 700
    })
})

In [7]:
# Persist both splits as JSON records so later cells can reload them from disk.
train_split, test_split = dataset["train"], dataset["test"]
train_split.to_json("train_dataset.json", orient="records")
test_split.to_json("test_dataset.json", orient="records")

Creating json from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

249580

#### **Create baseline and evaluate pretrained model**

In [8]:
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator, SequentialEvaluator
from sentence_transformers.util import cos_sim
from datasets import load_dataset, concatenate_datasets

model_id = "BAAI/bge-small-en-v1.5"
# Embedding sizes to evaluate, largest first.
matryoshka_dimensions = [384, 256, 128, 64]

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(model_id, device=device)

# Reload both splits from the JSON files written earlier. Each file is read
# through the generic "json" loader and requested via split="train".
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

# The retrieval corpus covers every document: train rows followed by test rows.
corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['anchor', 'positive', 'id'],
    num_rows: 7000
})

In [9]:
# Corpus lookup (cid => document): passages from the train and test rows.
corpus = {
    cid: passage
    for cid, passage in zip(corpus_dataset["id"], corpus_dataset["positive"])
}
# Query lookup (qid => question): questions from the test split only.
queries = {
    qid: question
    for qid, question in zip(test_dataset["id"], test_dataset["anchor"])
}

In [10]:
# Each query has exactly one relevant document: the passage carrying the same id.
relevant_docs = {q_id: [q_id] for q_id in queries}

In [11]:
# One retrieval evaluator per Matryoshka dimension; each one truncates the
# embeddings to `dim` and reports its metrics under the "dim_<dim>" name.
matryoshka_evaluators = [
    InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        name=f"dim_{dim}",
        truncate_dim=dim,
        score_functions={"cosine": cos_sim},
    )
    for dim in matryoshka_dimensions
]

# Wrap them so a single call runs every dimension in turn.
evaluator = SequentialEvaluator(matryoshka_evaluators)

In [12]:
# Score the pretrained (not yet fine-tuned) model to establish the baseline.
results = evaluator(model)

# Report NDCG@10 for every truncation dimension.
for dim in matryoshka_dimensions:
    metric = f"dim_{dim}_cosine_ndcg@10"
    score = results[metric]
    print(f"{metric}: {score}")

dim_384_cosine_ndcg@10: 0.7313810593445129
dim_256_cosine_ndcg@10: 0.7206317143378926
dim_128_cosine_ndcg@10: 0.6851950767822297
dim_64_cosine_ndcg@10: 0.6216422018474896


In [13]:
# Report recall@3 for every truncation dimension from the same `results`.
for dim in matryoshka_dimensions:
    metric = f"dim_{dim}_cosine_recall@3"
    score = results[metric]
    print(f"{metric}: {score}")

dim_384_cosine_recall@3: 0.75
dim_256_cosine_recall@3: 0.7357142857142858
dim_128_cosine_recall@3: 0.7057142857142857
dim_64_cosine_recall@3: 0.6485714285714286


#### **Define loss function with Matryoshka Representation**

In [14]:
from sentence_transformers import SentenceTransformer, SentenceTransformerModelCardData

model_id = "BAAI/bge-small-en-v1.5"

# Fresh instance of the base model for fine-tuning; it replaces the `model`
# variable used for the baseline evaluation.
model = SentenceTransformer(
    model_id,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        # Name recorded in the model card metadata. It has to describe the
        # checkpoint actually trained here, which is bge-small (see model_id),
        # not bge-base.
        model_name="BGE small Financial Matryoshka",
    ),
)

In [15]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Dimensions at which the Matryoshka loss is applied, largest first.
matryoshka_dimensions = [384, 256, 128, 64]

# Base ranking loss over (anchor, positive) pairs ...
inner_train_loss = MultipleNegativesRankingLoss(model=model)
# ... wrapped so that it is applied at each of the dimensions listed above.
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

#### **Fine-tune embedding model with** `SentenceTransformerTrainer`

In [16]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# Reload the training split that was written to disk earlier in the notebook.
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

args = SentenceTransformerTrainingArguments(
    output_dir="bge-small-financial-matryoshka",  # checkpoints and the saved model go here
    num_train_epochs=4,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,  # 32 * 4 = 128 samples per optimizer step, per device
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    # NOTE(review): fp16 and the fused AdamW optimizer presumably require a CUDA
    # device — confirm before running this cell on a CPU-only machine.
    optim="adamw_torch_fused",
    fp16=True,
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # no duplicate samples inside a batch
    eval_strategy="epoch",  # run the evaluator after every epoch
    save_strategy="epoch",  # write a checkpoint after every epoch
    # The string "none" is what switches every logging integration off.
    # The Python value None is treated as "not set" and falls back to the
    # library default, which in many transformers releases is "all".
    report_to="none",
    logging_steps=10,
    save_total_limit=3,
    load_best_model_at_end=True,
    # Checkpoint selection is driven by NDCG@10 at 128 dimensions.
    metric_for_best_model="eval_dim_128_cosine_ndcg@10",
)

In [17]:
from sentence_transformers import SentenceTransformerTrainer

# Only the two text columns are handed to the trainer; the integer "id" column
# is left out.
train_pairs = train_dataset.select_columns(["anchor", "positive"])

trainer = SentenceTransformerTrainer(
    model=model,                # the bge-small-en-v1.5 instance being fine-tuned
    args=args,                  # training arguments
    train_dataset=train_pairs,  # (anchor, positive) pairs
    loss=train_loss,            # Matryoshka-wrapped ranking loss
    evaluator=evaluator,        # same retrieval evaluator used for the baseline
)

In [18]:
# Run the fine-tuning loop. The returned TrainOutput (global step, training loss,
# runtime metrics) is shown as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss,Dim 384 Cosine Accuracy@1,Dim 384 Cosine Accuracy@3,Dim 384 Cosine Accuracy@5,Dim 384 Cosine Accuracy@10,Dim 384 Cosine Precision@1,Dim 384 Cosine Precision@3,Dim 384 Cosine Precision@5,Dim 384 Cosine Precision@10,Dim 384 Cosine Recall@1,Dim 384 Cosine Recall@3,Dim 384 Cosine Recall@5,Dim 384 Cosine Recall@10,Dim 384 Cosine Ndcg@10,Dim 384 Cosine Mrr@10,Dim 384 Cosine Map@100,Dim 256 Cosine Accuracy@1,Dim 256 Cosine Accuracy@3,Dim 256 Cosine Accuracy@5,Dim 256 Cosine Accuracy@10,Dim 256 Cosine Precision@1,Dim 256 Cosine Precision@3,Dim 256 Cosine Precision@5,Dim 256 Cosine Precision@10,Dim 256 Cosine Recall@1,Dim 256 Cosine Recall@3,Dim 256 Cosine Recall@5,Dim 256 Cosine Recall@10,Dim 256 Cosine Ndcg@10,Dim 256 Cosine Mrr@10,Dim 256 Cosine Map@100,Dim 128 Cosine Accuracy@1,Dim 128 Cosine Accuracy@3,Dim 128 Cosine Accuracy@5,Dim 128 Cosine Accuracy@10,Dim 128 Cosine Precision@1,Dim 128 Cosine Precision@3,Dim 128 Cosine Precision@5,Dim 128 Cosine Precision@10,Dim 128 Cosine Recall@1,Dim 128 Cosine Recall@3,Dim 128 Cosine Recall@5,Dim 128 Cosine Recall@10,Dim 128 Cosine Ndcg@10,Dim 128 Cosine Mrr@10,Dim 128 Cosine Map@100,Dim 64 Cosine Accuracy@1,Dim 64 Cosine Accuracy@3,Dim 64 Cosine Accuracy@5,Dim 64 Cosine Accuracy@10,Dim 64 Cosine Precision@1,Dim 64 Cosine Precision@3,Dim 64 Cosine Precision@5,Dim 64 Cosine Precision@10,Dim 64 Cosine Recall@1,Dim 64 Cosine Recall@3,Dim 64 Cosine Recall@5,Dim 64 Cosine Recall@10,Dim 64 Cosine Ndcg@10,Dim 64 Cosine Mrr@10,Dim 64 Cosine Map@100,Sequential Score
1,0.5732,No log,0.7,0.83,0.87,0.912857,0.7,0.276667,0.174,0.091286,0.7,0.83,0.87,0.912857,0.807403,0.773543,0.777283,0.695714,0.834286,0.865714,0.914286,0.695714,0.278095,0.173143,0.091429,0.695714,0.834286,0.865714,0.914286,0.805622,0.770772,0.77421,0.668571,0.817143,0.865714,0.91,0.668571,0.272381,0.173143,0.091,0.668571,0.817143,0.865714,0.91,0.791904,0.753863,0.756971,0.65,0.79,0.835714,0.882857,0.65,0.263333,0.167143,0.088286,0.65,0.79,0.835714,0.882857,0.767615,0.730626,0.735213,0.767615
2,0.3479,No log,0.708571,0.834286,0.878571,0.928571,0.708571,0.278095,0.175714,0.092857,0.708571,0.834286,0.878571,0.928571,0.81849,0.783389,0.786131,0.707143,0.84,0.877143,0.924286,0.707143,0.28,0.175429,0.092429,0.707143,0.84,0.877143,0.924286,0.816512,0.781956,0.784855,0.685714,0.824286,0.874286,0.914286,0.685714,0.274762,0.174857,0.091429,0.685714,0.824286,0.874286,0.914286,0.802437,0.766405,0.769516,0.667143,0.795714,0.837143,0.892857,0.667143,0.265238,0.167429,0.089286,0.667143,0.795714,0.837143,0.892857,0.780259,0.744257,0.748522,0.780259
3,0.223,No log,0.714286,0.84,0.88,0.924286,0.714286,0.28,0.176,0.092429,0.714286,0.84,0.88,0.924286,0.81992,0.786435,0.789663,0.708571,0.847143,0.88,0.928571,0.708571,0.282381,0.176,0.092857,0.708571,0.847143,0.88,0.928571,0.818789,0.783645,0.786313,0.695714,0.83,0.875714,0.915714,0.695714,0.276667,0.175143,0.091571,0.695714,0.83,0.875714,0.915714,0.80657,0.771575,0.774683,0.674286,0.798571,0.85,0.902857,0.674286,0.26619,0.17,0.090286,0.674286,0.798571,0.85,0.902857,0.786572,0.749632,0.75326,0.786572


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=196, training_loss=0.5800437744782896, metrics={'train_runtime': 190.4803, 'train_samples_per_second': 132.297, 'train_steps_per_second': 1.029, 'total_flos': 0.0, 'train_loss': 0.5800437744782896, 'epoch': 3.934010152284264})

In [19]:
# save the best model
# (the trainer was configured with load_best_model_at_end=True; the next section
# reloads the saved model from args.output_dir)
trainer.save_model()

In [20]:
# # push model to hub (optional; repo name kept in line with args.output_dir)
# trainer.model.push_to_hub("bge-small-financial-matryoshka")

#### **Evaluate fine-tuned model against baseline**

In [21]:
from sentence_transformers import SentenceTransformer

# Load the saved model back from the training output directory, on the GPU when
# one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
fine_tuned_model = SentenceTransformer(args.output_dir, device=device)

# Score it with the same evaluator as the baseline; `results` is overwritten.
results = evaluator(fine_tuned_model)

# Report NDCG@10 for every truncation dimension.
for dim in matryoshka_dimensions:
    metric = f"dim_{dim}_cosine_ndcg@10"
    score = results[metric]
    print(f"{metric}: {score}")

dim_384_cosine_ndcg@10: 0.8201867514286008
dim_256_cosine_ndcg@10: 0.8186903238351805
dim_128_cosine_ndcg@10: 0.8067524207279965
dim_64_cosine_ndcg@10: 0.7863462473274174


In [22]:
# Report recall@3 for every truncation dimension of the fine-tuned model.
for dim in matryoshka_dimensions:
    metric = f"dim_{dim}_cosine_recall@3"
    score = results[metric]
    print(f"{metric}: {score}")

dim_384_cosine_recall@3: 0.8371428571428572
dim_256_cosine_recall@3: 0.8457142857142858
dim_128_cosine_recall@3: 0.8314285714285714
dim_64_cosine_recall@3: 0.7985714285714286


In [23]:
# Display the module stack of the in-memory `model` (the instance that was passed
# to the trainer), not the `fine_tuned_model` reloaded from disk.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)