In [None]:
# This is the code for training an embeddings model with custom data
# This code was executed on an AWS Sagemaker notebook with GPUs (not my local laptop)
# once the model was trained, I compressed it and copied it to my local laptop
# (I do not have GPUs for training on my local machine)

In [1]:
# %pip install accelerate==1.3.0 #0.26.0
# %pip install sentence-transformers==3.4.1
# %pip install datasets==3.3.1


import datetime as dt
import pandas as pd
import re
import numpy as np
import json
import random
import ast
import copy

from sentence_transformers import SentenceTransformer, \
                                    SentenceTransformerTrainer, \
                                    SentenceTransformerTrainingArguments, \
                                    losses, \
                                    InputExample
from datasets import Dataset

In [None]:
# Read the question/answer CSV into a DataFrame.
train_data = pd.read_csv(
    "/home/ec2-user/SageMaker/QnA_model_train_test/QnA_train.csv"
)

# Quick sanity check of what was loaded: column labels and row count.
print("Columns:", train_data.columns)
print("Number of recs:", len(train_data))


In [None]:
# split into train and test sets (80% / 20%)
#
# The split is done with pandas alone, so this cell needs nothing beyond the
# notebook's import cell: draw a reproducible 20% random sample as the test
# set, then keep every remaining row for training.
X_test = train_data.sample(frac=0.2, random_state=42)
X_train = train_data.drop(index=X_test.index)

# The two pieces must partition the data exactly. This also catches a
# non-unique index, where drop() would remove more rows than were sampled.
assert len(X_train) + len(X_test) == len(train_data), "train/test split lost or duplicated rows"

# save off test set for inference/model metrics
# (the DataFrame index is written as the first, unnamed CSV column)
X_test.to_csv("/home/ec2-user/SageMaker/QnA_model_train_test/test_set.csv")


In [None]:
# Build one InputExample per training row; texts are ordered [answer, question].
# NOTE(review): the trainer cell in this notebook feeds the trainer a `Dataset`
# it builds itself rather than this list - confirm whether `train_set` is still needed.
train_set = [
    InputExample(texts=[str(row['answer']), str(row['question'])])
    for _, row in X_train.iterrows()
]

print(len(train_set))

In [None]:

# Fine-tune the embeddings model on the question/answer pairs of the training split

# load the model
#model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5') # start from the huggingface downloaded pre-trained model
# NOTE(review): this checkpoint is the same path the cell saves to at the end.
# If it was previously trained on rows that are now in the test split, it has
# already seen them - restart from the base model above for a clean evaluation.
model = SentenceTransformer("/home/ec2-user/SageMaker/QnA_model_train_test/models/final_model") # continue training epochs on the same model

# define the training set
# Built from X_train only: the rows in X_test were saved for inference/model
# metrics and must not be trained on, otherwise those metrics are contaminated.
train_dataset = Dataset.from_dict({
    "anchor": X_train['question'].astype(str).to_list(),
    "positive": X_train['answer'].astype(str).to_list(),
})

# define the loss function
loss = losses.MultipleNegativesRankingLoss(model)

# set the training arguments
model_args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/trained_model_3",  # relative path: resolved against the kernel's working directory
    # Optional training parameters:
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if your GPU can't handle FP16
    bf16=False,  # Set to True if your GPU supports BF16
    # Optional tracking/debugging parameters:
    #eval_strategy="steps",
    #eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=500,
    run_name="fine_tuned_embedding_model"
)

# instantiate the trainer
trainer = SentenceTransformerTrainer(
    model=model,
    args=model_args,
    train_dataset=train_dataset,
    loss=loss,
)

# Train the model
# NOTE: despite the variable name, trainer.train() returns the training run's
# output/metrics object, not a model; the fine-tuned weights live in `model`.
trained_model = trainer.train()

# save the trained model
trainer.save_model("/home/ec2-user/SageMaker/QnA_model_train_test/models/final_model")

#Starting Model Loss -->.106100
#Ending Model Loss -->.005300

In [None]:
# model test inputs: a handful of arbitrary strings to embed
sentences = [
    "This is an example sentence",
    "run fast through the woods my child",
    "Anchor 1",
    "Positive 1",
]

# reload the trained model from the directory it was saved to
loaded_model = SentenceTransformer(
    "/home/ec2-user/SageMaker/QnA_model_train_test/models/final_model"
)

# compute the embeddings and show them
embeddings = loaded_model.encode(sentences)
print(embeddings)


In [None]:
# Tar the model so it can be sent to the inference compute

### tar -czf /home/ec2-user/SageMaker/QnA_model_train_test/models/custome_model.tar.gz /home/ec2-user/SageMaker/QnA_model_train_test/models/final_model