# Matryoshka finetuning

**Model: `microsoft/mpnet-base`**

**Purpose: Text (cosine) similarity with Matryoshka embeddings**

In [1]:
# Run the shared SupportVectors setup notebook that lives one directory up.
# NOTE(review): what it defines is not visible from this notebook — confirm before relying on it.
%run ../supportvectors-common.ipynb


<div style="color:#aaa;font-size:8pt">
<hr/>
&copy; SupportVectors. All rights reserved. <blockquote>This notebook is the intellectual property of SupportVectors, and part of its training material. 
Only the participants in SupportVectors workshops are allowed to study the notebooks for educational purposes currently, but is prohibited from copying or using it for any other purposes without written permission.

<b> These notebooks are chapters and sections from Asif Qamar's textbook that he is writing on Data Science. So we request you to not circulate the material to others.</b>
 </blockquote>
 <hr/>
</div>



In [2]:
import torch
from sentence_transformers import SentenceTransformer
from svlearn.config.configuration import ConfigurationMixin
from svlearn.encoder_models.sbert_subjects_full_ft import convert_to_pair_dataset, sampled_dataset, get_evaluator
from svlearn.util.hf_text_util import get_train_test_lists, tuples_list_to_dataset

2024-11-03 08:26:30.766641: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-03 08:26:30.787493: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-03 08:26:30.794367: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-03 08:26:30.809812: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the project configuration as a CommentedMap. It holds the paths of the
# data and results directories; later cells read config["paths"]["results"]
# and pass the whole map to get_train_test_lists.
config = ConfigurationMixin().load_config()



/home/chandar/fine-tuning/docs/notebooks/encoder_models


## Load the Model and Dataset from HuggingFace

In [4]:

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Base (not yet fine-tuned) sentence-transformer checkpoint from HuggingFace,
# moved onto the selected device.
model_name = "microsoft/mpnet-base"
model = SentenceTransformer(model_name).to(device)

Some weights of MPNetModel were not initialized from the model checkpoint at microsoft/mpnet-base and are newly initialized: ['mpnet.pooler.dense.bias', 'mpnet.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

# Six probe sentences: two about physics, two about biology, two about history.

# Physics
sentence1 = "The rate of change of displacement is velocity"
sentence4 = "Force is proportional to mass"

# Biology
sentence2 = "Kidney plays an important role in purifying blood"
sentence5 = "Vaccines train our immune system to create antibodies"

# History
sentence3 = "Many countries obtained their freedom by 1950"
sentence6 = "World war 2 was a global conflict between two coalitions - the allies and the axis powers"

# Same-topic sentences sit next to each other, so related pairs appear as
# adjacent rows/columns in the similarity matrices printed later.
sentences = [
    sentence1, sentence4,
    sentence2, sentence5,
    sentence3, sentence6,
]


In [6]:
# Baseline: embed the sample sentences with the model as loaded from the hub
# (before any fine-tuning) and print their pairwise similarity matrix,
# one row/column per sentence.
# NOTE(review): model.similarity is presumably cosine similarity here — confirm.
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities)


tensor([[1.0000, 0.9100, 0.7976, 0.8212, 0.8668, 0.7800],
        [0.9100, 1.0000, 0.8350, 0.8526, 0.8465, 0.7850],
        [0.7976, 0.8350, 1.0000, 0.9248, 0.8343, 0.8791],
        [0.8212, 0.8526, 0.9248, 1.0000, 0.8663, 0.8532],
        [0.8668, 0.8465, 0.8343, 0.8663, 1.0000, 0.8532],
        [0.7800, 0.7850, 0.8791, 0.8532, 0.8532, 1.0000]])


Create the evaluator to evaluate the model before and after training

In [7]:
# Only the test split of the subject-labelled chunks is needed here
# (biology, physics and history carry labels 0, 1 and 2 respectively).
_, test = get_train_test_lists(cfg=config)

# Build the evaluation pairs in one pass, innermost call first:
#   1. tuples_list_to_dataset  - wrap the (text, label) tuples in a Dataset
#   2. sampled_dataset         - keep at most 500 rows per label, which caps
#                                the number of pairs at 1500*1499/2
#   3. convert_to_pair_dataset - emit (sentence1, sentence2, score) rows
test_dataset = convert_to_pair_dataset(
    sampled_dataset(
        tuples_list_to_dataset(test)
    )
)

# Evaluator that the later cells call on each model variant.
binary_acc_evaluator = get_evaluator(test_dataset=test_dataset)

Filter:   0%|          | 0/9333 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9333 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9333 [00:00<?, ? examples/s]

Evaluate the model before fine-tuning

In [8]:

# Score the base model with the evaluator built above; the bare name on the
# last line makes the notebook display the returned metrics.
results = binary_acc_evaluator(model)
results

{'cosine_accuracy': 0.7608,
 'cosine_accuracy_threshold': 0.9584275484085083,
 'cosine_f1': 0.5781433607520565,
 'cosine_f1_threshold': 0.9403972029685974,
 'cosine_precision': 0.5594087549744173,
 'cosine_recall': 0.5981762917933131,
 'cosine_ap': 0.651767715812509,
 'dot_accuracy': 0.6998,
 'dot_accuracy_threshold': 20.22665023803711,
 'dot_f1': 0.4952495852812547,
 'dot_f1_threshold': 10.787991523742676,
 'dot_precision': 0.3293221018852788,
 'dot_recall': 0.998176291793313,
 'dot_ap': 0.4495384496125088,
 'manhattan_accuracy': 0.7828,
 'manhattan_accuracy_threshold': 24.910701751708984,
 'manhattan_f1': 0.6304147465437788,
 'manhattan_f1_threshold': 27.439680099487305,
 'manhattan_precision': 0.6372670807453417,
 'manhattan_recall': 0.6237082066869301,
 'manhattan_ap': 0.7029136962512184,
 'euclidean_accuracy': 0.7614,
 'euclidean_accuracy_threshold': 1.2952733039855957,
 'euclidean_f1': 0.5777409555371444,
 'euclidean_f1_threshold': 1.596674919128418,
 'euclidean_precision': 0.529

## Load the fine-tuned model and repeat the checks, starting with its full-size embeddings

In [9]:
# `config` was already loaded near the top of the notebook, so it is reused
# here instead of being read a second time.
results_dir = config["paths"]["results"]

# Update the checkpoint folder to point at the checkpoint you want to inspect
finetuned_model_dir = f'{results_dir}/subject-based-encoder-matryoshka/checkpoint-2000'

# Load the fine-tuned model from that checkpoint; this rebinds `model`, so the
# base model is no longer reachable under that name from here on.
model = SentenceTransformer(finetuned_model_dir).to(device)

# Same sample sentences as for the base model, now embedded by the fine-tuned one.
embeddings = model.encode(sentences)
similarities = model.similarity(embeddings, embeddings)
print(similarities)



/home/chandar/fine-tuning/docs/notebooks/encoder_models
tensor([[ 1.0000,  0.9197, -0.1257, -0.1462, -0.1447, -0.1692],
        [ 0.9197,  1.0000, -0.0565, -0.0853, -0.1198, -0.1441],
        [-0.1257, -0.0565,  1.0000,  0.9570, -0.0074, -0.1261],
        [-0.1462, -0.0853,  0.9570,  1.0000, -0.0310, -0.1589],
        [-0.1447, -0.1198, -0.0074, -0.0310,  1.0000,  0.9057],
        [-0.1692, -0.1441, -0.1261, -0.1589,  0.9057,  1.0000]])


Evaluate the model after fine-tuning

In [10]:

# Same evaluator as before, now applied to the fine-tuned checkpoint that
# `model` currently refers to; the bare name displays the metrics.
results = binary_acc_evaluator(model)
results

{'cosine_accuracy': 0.9856,
 'cosine_accuracy_threshold': 0.33601704239845276,
 'cosine_f1': 0.9781553398058254,
 'cosine_f1_threshold': 0.33601704239845276,
 'cosine_precision': 0.9763779527559056,
 'cosine_recall': 0.9799392097264438,
 'cosine_ap': 0.9944484131851321,
 'dot_accuracy': 0.9856,
 'dot_accuracy_threshold': 4.369484901428223,
 'dot_f1': 0.9781553398058254,
 'dot_f1_threshold': 4.369484901428223,
 'dot_precision': 0.9763779527559056,
 'dot_recall': 0.9799392097264438,
 'dot_ap': 0.9950381531554066,
 'manhattan_accuracy': 0.9852,
 'manhattan_accuracy_threshold': 93.51741790771484,
 'manhattan_f1': 0.9775893397940643,
 'manhattan_f1_threshold': 95.82697296142578,
 'manhattan_precision': 0.9740494870247435,
 'manhattan_recall': 0.9811550151975684,
 'manhattan_ap': 0.9963936457799082,
 'euclidean_accuracy': 0.9854,
 'euclidean_accuracy_threshold': 4.452507495880127,
 'euclidean_f1': 0.9778720824492271,
 'euclidean_f1_threshold': 4.452507495880127,
 'euclidean_precision': 0.975

Repeat with the embeddings truncated to 64 dimensions

In [11]:
# Reload the same fine-tuned checkpoint, this time asking for 64-dimensional
# (truncated) embeddings, and place it on the selected device.
model = SentenceTransformer(finetuned_model_dir, truncate_dim=64)
model = model.to(device)

# Embed the sample sentences and confirm the reduced embedding width.
embeddings = model.encode(sentences)
print(embeddings.shape)

# Pairwise similarity matrix computed from the truncated embeddings.
similarities = model.similarity(embeddings, embeddings)
print(similarities)

(6, 64)
tensor([[ 1.0000,  0.9617, -0.1700, -0.2219, -0.0831, -0.1428],
        [ 0.9617,  1.0000, -0.0264, -0.0670, -0.0513, -0.1217],
        [-0.1700, -0.0264,  1.0000,  0.9829, -0.2261, -0.2781],
        [-0.2219, -0.0670,  0.9829,  1.0000, -0.2078, -0.2560],
        [-0.0831, -0.0513, -0.2261, -0.2078,  1.0000,  0.9562],
        [-0.1428, -0.1217, -0.2781, -0.2560,  0.9562,  1.0000]])


In [12]:
# Evaluate the model loaded with truncate_dim=64 using the same evaluator,
# then display the metrics for comparison with the earlier runs.
results = binary_acc_evaluator(model)
results

{'cosine_accuracy': 0.9858,
 'cosine_accuracy_threshold': 0.45322659611701965,
 'cosine_f1': 0.978399756616976,
 'cosine_f1_threshold': 0.45322659611701965,
 'cosine_precision': 0.9792935444579781,
 'cosine_recall': 0.9775075987841946,
 'cosine_ap': 0.9934115326124136,
 'dot_accuracy': 0.9858,
 'dot_accuracy_threshold': 0.8417322635650635,
 'dot_f1': 0.9784260103312064,
 'dot_f1_threshold': 0.7999638915061951,
 'dot_precision': 0.9781287970838396,
 'dot_recall': 0.9787234042553191,
 'dot_ap': 0.9915131499725633,
 'manhattan_accuracy': 0.9856,
 'manhattan_accuracy_threshold': 10.817245483398438,
 'manhattan_f1': 0.9782082324455206,
 'manhattan_f1_threshold': 10.817245483398438,
 'manhattan_precision': 0.974080771549126,
 'manhattan_recall': 0.982370820668693,
 'manhattan_ap': 0.993687431696979,
 'euclidean_accuracy': 0.9858,
 'euclidean_accuracy_threshold': 1.3337856531143188,
 'euclidean_f1': 0.9783470570295821,
 'euclidean_f1_threshold': 1.3702194690704346,
 'euclidean_precision': 0.9