In [1]:
import pandas as pd
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, util
import math

from models import models, ModelName

# Fine-tune a SentenceTransformer on (question, context) pairs read from a CSV
# and save the result under the local directory registered in `models`.

# Load the dataset
# Modify the path to point to your csv file

# NOTE(review): model_path is assigned here but not used by the visible cells;
# the checkpoint id below is hard-coded instead. Confirm which is intended.
model_path = ModelName.MULTILINGUAL_MINILM.value
csv_file = 'fine_tuning_dataset/all_merged_dataset.csv'
df = pd.read_csv(csv_file)

# Create InputExamples: one (question, context) text pair per CSV row.
# Zipping the two columns yields the same values as df.iterrows() without
# building a Series object for every row.
train_examples = [
    InputExample(texts=[question, context])
    for question, context in zip(df['question'], df['context'])
]

# Create a DataLoader
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

# Load the pre-trained model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Use MultipleNegativesRankingLoss for training
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune the model
num_epochs = 6
# Warm up over the first 10% of all optimisation steps (batches per epoch * epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

# The registry key is chosen by epoch count: ModelName.MULTILINGUAL_E5_SMALL_FINETUNING_<n>.
# NOTE(review): the base checkpoint above is a MiniLM model while the output
# key is named E5-small - confirm that this pairing is intentional.
finetuned_member_name = f"MULTILINGUAL_E5_SMALL_FINETUNING_{num_epochs}"
finetuned_member = getattr(ModelName, finetuned_member_name, None)
if finetuned_member is None:
    # The original used a bare `raise` here, which fails with
    # "RuntimeError: No active exception to reraise" and hides the real cause.
    raise ValueError(
        f"Unsupported num_epochs={num_epochs!r}: "
        f"ModelName has no member named {finetuned_member_name}."
    )
model_name = finetuned_member.value

output_path = models[model_name]['local_dir']
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=output_path)

print(f"Model fine-tuning complete. Model saved to {output_path}.")

  from tqdm.autonotebook import tqdm, trange
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
100%|██████████| 264/264 [01:21<00:00,  3.26it/s]


{'train_runtime': 81.0604, 'train_samples_per_second': 52.109, 'train_steps_per_second': 3.257, 'train_loss': 0.16358395778771603, 'epoch': 6.0}


                                                                     

Model fine-tuning complete. Model saved to ./app/model/modules/multilingual-e5-small-finetuning-6.




In [2]:
# Quick sanity check of the fine-tuned model: print the cosine similarity of a
# few hand-written sentence pairs. Adapt the pairs to your own dataset.
evaluation_examples = [
    ("How are you?", "How do you do?"),
    ("What is your name?", "What's your name?"),
    ("Where do you live?", "Where is your home located?"),
]

# Reload the model from the directory the training cell saved it to.
model = SentenceTransformer(output_path)

for first_text, second_text in evaluation_examples:
    # Encode both sentences of the pair together, then compare the two vectors.
    first_vec, second_vec = model.encode([first_text, second_text])
    score = util.pytorch_cos_sim(first_vec, second_vec).item()
    print(f"Similarity between: '{first_text}' and '{second_text}' is {score:.4f}")

Similarity between: 'How are you?' and 'How do you do?' is 0.4741
Similarity between: 'What is your name?' and 'What's your name?' is 0.9350
Similarity between: 'Where do you live?' and 'Where is your home located?' is 0.7845
