In [1]:
! pip install   datasets  sentence_transformers openpyxl --q
! pip install   transformers[torch] --q
! pip install   accelerate==0.27.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import uuid

from huggingface_hub import login
from google.colab import userdata

from datasets import Dataset, load_dataset, DatasetDict, concatenate_datasets
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import (
    InformationRetrievalEvaluator,
    SequentialEvaluator,
)
from sentence_transformers.util import cos_sim


login(token=userdata.get('hf_token'))
os.environ["OPENAI_API_KEY"] =  userdata.get('openai_key')
os.environ["WANDB_API_KEY"]  =  userdata.get('wand_key')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Read Dataset

In [6]:
df_train = pd.read_csv('/content/drive/MyDrive/dataset/train_triplet_dataset.csv')
df_test  = pd.read_csv('/content/drive/MyDrive/dataset/test_triplet_dataset.csv')

dataset_train = Dataset.from_pandas(df_train)
dataset_test  = Dataset.from_pandas(df_test)

dataset_train = dataset_train.add_column("id", [str(uuid.uuid4()) for _ in range(len(dataset_train))])
dataset_test  = dataset_test.add_column("id", [str(uuid.uuid4()) for _ in range(len(dataset_test))])

# save datasets to disk
dataset_train.to_json("train_dataset.json", orient="records")
dataset_test.to_json("test_dataset.json", orient="records")

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1574822

# ***Create baseline and evaluate pretrained model***

In [7]:

# model_id = "BAAI/bge-base-en-v1.5"  # Hugging Face model ID
# model = SentenceTransformer(
#     model_id, device="cuda" if torch.cuda.is_available() else "cpu"
# )
test_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

corpus_dataset = concatenate_datasets([train_dataset, test_dataset])

corpus = dict(
    zip(corpus_dataset["id"], corpus_dataset["positive"])
)
queries = dict(
    zip(test_dataset["id"], test_dataset["anchor"])
)

relevant_docs = {}
for q_id in queries:
    relevant_docs[q_id] = [q_id]


ir_evaluator = InformationRetrievalEvaluator(
        queries=queries,
        corpus=corpus,
        relevant_docs=relevant_docs,
        show_progress_bar=True,
        accuracy_at_k=[1, 3, 5, 7, 10],
        precision_recall_at_k=[1, 3, 5,7,10],
        map_at_k=[100],
        mrr_at_k=[10],
        ndcg_at_k=[10],
        name=f"ir_evaluator",
        score_functions={"cosine": cos_sim}
    )
df = pd.DataFrame(corpus_dataset)
df.to_excel('/content/drive/MyDrive/dataset/corpus.xlsx', index=False)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:

import json
def evaluate_model(model_id_or_path, is_pretrained=True):

    try:
        model = SentenceTransformer(model_id_or_path, device="cuda" if torch.cuda.is_available() else "cpu")
    except Exception as e:
        print(f"Error loading the model: {e}")
        return

    try:
      results = ir_evaluator(model)
    except Exception as e:
      print(f"Error during evaluation: {e}")
      return

    results_filepath = f"/content/drive/MyDrive/dataset/evaluation_results_{'pretrained' if is_pretrained else 'finetuned'}.txt"
    with open(results_filepath, "w") as f:
      json.dump(results, f)

    print(f"Evaluation results saved to: {results_filepath}")

    for metric, value in results.items():
      print(f"{metric}: {value:.4f}")



In [16]:
evaluate_model("BAAI/bge-base-en-v1.5",True)

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:09<00:00,  9.59s/it]

Evaluation results saved to: /content/drive/MyDrive/dataset/evaluation_results_pretrained.txt
ir_evaluator_cosine_accuracy@1: 0.2605
ir_evaluator_cosine_accuracy@3: 0.3787
ir_evaluator_cosine_accuracy@5: 0.4456
ir_evaluator_cosine_accuracy@7: 0.4854
ir_evaluator_cosine_accuracy@10: 0.5262
ir_evaluator_cosine_precision@1: 0.2605
ir_evaluator_cosine_precision@3: 0.1262
ir_evaluator_cosine_precision@5: 0.0891
ir_evaluator_cosine_precision@7: 0.0693
ir_evaluator_cosine_precision@10: 0.0526
ir_evaluator_cosine_recall@1: 0.2605
ir_evaluator_cosine_recall@3: 0.3787
ir_evaluator_cosine_recall@5: 0.4456
ir_evaluator_cosine_recall@7: 0.4854
ir_evaluator_cosine_recall@10: 0.5262
ir_evaluator_cosine_ndcg@10: 0.3829
ir_evaluator_cosine_mrr@10: 0.3383
ir_evaluator_cosine_map@100: 0.3481





# **Intialize model and loss function**

In [15]:
from sentence_transformers import SentenceTransformerModelCardData, SentenceTransformer

# Hugging Face model ID: https://huggingface.co/BAAI/bge-base-en-v1.5
model_id = "BAAI/bge-base-en-v1.5"
device = "cuda" if torch.cuda.is_available() else "cpu"
# load model with SDPA for using Flash Attention 2
model = SentenceTransformer(
    model_id,
    device=device,
    model_kwargs={"attn_implementation": "sdpa"},
    model_card_data=SentenceTransformerModelCardData(
        language="en",
        license="apache-2.0",
        model_name="BGE base model",
    ),
)
print(f"Using device: {device}")

Using device: cuda


#*Initialise Loss Function*

In [13]:
from sentence_transformers.losses import  MultipleNegativesRankingLoss,TripletLoss,ContrastiveLoss

train_loss = TripletLoss(model)


# *Fine Tune embedding model*

In [29]:
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.training_args import BatchSamplers

# load train dataset again
train_dataset = load_dataset("json", data_files="train_dataset.json", split="train")

# define training arguments
args = SentenceTransformerTrainingArguments(
    output_dir="models/bge-large-triplet-v1.5", # output directory and hugging face model ID
    num_train_epochs=30,                         # number of epochs
    per_device_train_batch_size=32,             # train batch size
    gradient_accumulation_steps=16,             # for a global batch size of 512
    per_device_eval_batch_size=16,              # evaluation batch size
    warmup_ratio=0.1,                           # warmup ratio
    learning_rate=2e-5,                         # learning rate, 2e-5 is a good value
    lr_scheduler_type="cosine",               # use constant learning rate scheduler
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    tf32=True,                                  # use tf32 precision
    bf16=True,                                  # use bf16 precision
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    eval_strategy="epoch",                      # evaluate after each epoch
    save_strategy="epoch",                      # save after each epoch
    logging_steps=10,                           # log every 10 steps
    save_total_limit=3,                         # save only the last 3 models
    load_best_model_at_end=True,                # load the best model when training ends
    run_name="bge-base-en-v1.5-finetuned_v2.1",
    metric_for_best_model="eval_ir_evaluator_cosine_ndcg@10"
)

In [30]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=model, # bg-base-en-v1
    args=args,  # training arguments
    train_dataset=train_dataset.select_columns(
        ["positive", "anchor", "negative"]
    ),
    loss=train_loss,
    evaluator=ir_evaluator,
)

In [31]:
trainer.train()

dataset = dataset.select_columns(['anchor', 'positive', 'negative'])


Epoch,Training Loss,Validation Loss,Ir Evaluator Cosine Accuracy@1,Ir Evaluator Cosine Accuracy@3,Ir Evaluator Cosine Accuracy@5,Ir Evaluator Cosine Accuracy@7,Ir Evaluator Cosine Accuracy@10,Ir Evaluator Cosine Precision@1,Ir Evaluator Cosine Precision@3,Ir Evaluator Cosine Precision@5,Ir Evaluator Cosine Precision@7,Ir Evaluator Cosine Precision@10,Ir Evaluator Cosine Recall@1,Ir Evaluator Cosine Recall@3,Ir Evaluator Cosine Recall@5,Ir Evaluator Cosine Recall@7,Ir Evaluator Cosine Recall@10,Ir Evaluator Cosine Ndcg@10,Ir Evaluator Cosine Mrr@10,Ir Evaluator Cosine Map@100
0,No log,No log,0.3159,0.433054,0.493724,0.529289,0.57113,0.3159,0.144351,0.098745,0.075613,0.057113,0.3159,0.433054,0.493724,0.529289,0.57113,0.434168,0.391477,0.40118
1,4.828000,No log,0.294979,0.393305,0.465481,0.51046,0.547071,0.294979,0.131102,0.093096,0.072923,0.054707,0.294979,0.393305,0.465481,0.51046,0.547071,0.408963,0.366166,0.376252
2,4.746300,No log,0.222803,0.330544,0.373431,0.42364,0.462343,0.222803,0.110181,0.074686,0.06052,0.046234,0.222803,0.330544,0.373431,0.42364,0.462343,0.331161,0.29059,0.301477
3,4.746300,No log,0.218619,0.311715,0.373431,0.408996,0.451883,0.218619,0.103905,0.074686,0.058428,0.045188,0.218619,0.311715,0.373431,0.408996,0.451883,0.324341,0.284916,0.296105
4,4.680800,No log,0.226987,0.330544,0.384937,0.4341,0.472803,0.226987,0.110181,0.076987,0.062014,0.04728,0.226987,0.330544,0.384937,0.4341,0.472803,0.337769,0.295991,0.307759
5,4.575800,No log,0.224895,0.33682,0.396444,0.435146,0.471757,0.224895,0.112273,0.079289,0.062164,0.047176,0.224895,0.33682,0.396444,0.435146,0.471757,0.338301,0.296722,0.30889
6,4.575800,No log,0.216527,0.324268,0.378661,0.426778,0.467573,0.216527,0.108089,0.075732,0.060968,0.046757,0.216527,0.324268,0.378661,0.426778,0.467573,0.330971,0.288605,0.299427
7,4.529900,No log,0.208159,0.312762,0.364017,0.403766,0.442469,0.208159,0.104254,0.072803,0.057681,0.044247,0.208159,0.312762,0.364017,0.403766,0.442469,0.314984,0.275436,0.285944
8,4.444000,No log,0.189331,0.286611,0.333682,0.383891,0.42364,0.189331,0.095537,0.066736,0.054842,0.042364,0.189331,0.286611,0.333682,0.383891,0.42364,0.294582,0.25478,0.264688
9,4.377200,No log,0.187238,0.275105,0.312762,0.354603,0.395397,0.187238,0.091702,0.062552,0.050658,0.03954,0.187238,0.275105,0.312762,0.354603,0.395397,0.280318,0.244934,0.25529


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.60s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.57s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.57s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.65s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.71s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.78s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.57s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.68s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.55s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.58s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.76s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.67s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.82s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.75s/it]


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]


TrainOutput(global_step=210, training_loss=4.322369075956798, metrics={'train_runtime': 1041.3407, 'train_samples_per_second': 110.194, 'train_steps_per_second': 0.202, 'total_flos': 0.0, 'train_loss': 4.322369075956798, 'epoch': 29.933333333333334})

In [32]:
trainer.save_model()
trainer.model.push_to_hub("bge-base-en-v1.5-finetuned_v2.1")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/sandeep-aggarwal/bge-base-en-v1.5-finetuned_v2.1/commit/891852bf29554007868bad85b954192bd993280e'

#*Run Post Fine Tuning Evluation*

In [17]:

# fine_tuned_model = SentenceTransformer(
#     args.output_dir, device="cuda" if torch.cuda.is_available() else "cpu"
# )
# Evaluate the model

evaluate_model("sandeep-aggarwal/bge-base-en-v1.5-finetuned_v2.1",False)


Batches:   0%|          | 0/30 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:09<00:00,  9.59s/it]

Evaluation results saved to: /content/drive/MyDrive/dataset/evaluation_results_finetuned.txt
ir_evaluator_cosine_accuracy@1: 0.2824
ir_evaluator_cosine_accuracy@3: 0.4100
ir_evaluator_cosine_accuracy@5: 0.4770
ir_evaluator_cosine_accuracy@7: 0.5115
ir_evaluator_cosine_accuracy@10: 0.5481
ir_evaluator_cosine_precision@1: 0.2824
ir_evaluator_cosine_precision@3: 0.1367
ir_evaluator_cosine_precision@5: 0.0954
ir_evaluator_cosine_precision@7: 0.0731
ir_evaluator_cosine_precision@10: 0.0548
ir_evaluator_cosine_recall@1: 0.2824
ir_evaluator_cosine_recall@3: 0.4100
ir_evaluator_cosine_recall@5: 0.4770
ir_evaluator_cosine_recall@7: 0.5115
ir_evaluator_cosine_recall@10: 0.5481
ir_evaluator_cosine_ndcg@10: 0.4084
ir_evaluator_cosine_mrr@10: 0.3644
ir_evaluator_cosine_map@100: 0.3743





In [8]:
from IPython.display import display
import json
import pandas as pd

with open('/content/drive/MyDrive/dataset/evaluation_results_pretrained.txt', 'r') as file:
    pretrained_metrics = json.load(file)

with open('/content/drive/MyDrive/dataset/evaluation_results_finetuned.txt', 'r') as file:
    finetuned_metrics = json.load(file)

df_pretrained = pd.DataFrame(list(pretrained_metrics.items()), columns=["Metric", "Pre-trained"])
df_finetuned = pd.DataFrame(list(finetuned_metrics.items()), columns=["Metric", "Fine-tuned"])
df_comparison = pd.merge(df_pretrained, df_finetuned, on="Metric")
df_comparison["%Improvement"] = ((df_comparison["Fine-tuned"] - df_comparison["Pre-trained"]) / df_comparison["Pre-trained"]) * 100
# Display the comparison table
display(df_comparison)

Unnamed: 0,Metric,Pre-trained,Fine-tuned,%Improvement
0,ir_evaluator_cosine_accuracy@1,0.26046,0.282427,8.433735
1,ir_evaluator_cosine_accuracy@3,0.378661,0.410042,8.287293
2,ir_evaluator_cosine_accuracy@5,0.445607,0.476987,7.042254
3,ir_evaluator_cosine_accuracy@7,0.485356,0.511506,5.387931
4,ir_evaluator_cosine_accuracy@10,0.526151,0.548117,4.17495
5,ir_evaluator_cosine_precision@1,0.26046,0.282427,8.433735
6,ir_evaluator_cosine_precision@3,0.12622,0.136681,8.287293
7,ir_evaluator_cosine_precision@5,0.089121,0.095397,7.042254
8,ir_evaluator_cosine_precision@7,0.069337,0.073072,5.387931
9,ir_evaluator_cosine_precision@10,0.052615,0.054812,4.17495
