In [61]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, losses,SentenceTransformerTrainingArguments,SentenceTransformerTrainer
from transformers import EarlyStoppingCallback
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import matplotlib.pyplot as plt
import numpy as np
from sentence_transformers.training_args import BatchSamplers

In [62]:
df = pd.read_excel('data/FineTsent.xlsx')
df.head()

Unnamed: 0,sent1,sent2,score,normalize
0,ผู้ป่วยชื่ออะไร,ให้คนไข้,2,0.4
1,ไอบูโปรเพน,ไอบูโปรเพน,5,1.0
2,ความแรง400มิลลิกรัม,ความแรง400มิลลิกรัม,5,1.0
3,จำนวน10เม็ด,จำนวน10เม็ด,5,1.0
4,ใช้เพื่อบรรเทาอาการปวด,ใช้เพื่อบรรเทาอาการปวด,5,1.0


In [63]:
df.drop('score', axis=1, inplace=True)
df.rename(columns={'normalize': 'label'}, inplace=True)
df.head()

Unnamed: 0,sent1,sent2,label
0,ผู้ป่วยชื่ออะไร,ให้คนไข้,0.4
1,ไอบูโปรเพน,ไอบูโปรเพน,1.0
2,ความแรง400มิลลิกรัม,ความแรง400มิลลิกรัม,1.0
3,จำนวน10เม็ด,จำนวน10เม็ด,1.0
4,ใช้เพื่อบรรเทาอาการปวด,ใช้เพื่อบรรเทาอาการปวด,1.0


In [64]:
# Ensure all values are strings and drop NaNs
df.dropna(subset=['sent1', 'sent2', 'label'], inplace=True)
df['sent1'] = df['sent1'].astype(str)
df['sent2'] = df['sent2'].astype(str)
print(df.dtypes)

sent1     object
sent2     object
label    float64
dtype: object


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sent1   569 non-null    object 
 1   sent2   569 non-null    object 
 2   label   569 non-null    float64
dtypes: float64(1), object(2)
memory usage: 13.5+ KB


In [66]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [67]:
print(type(dataset['label'][0]))

<class 'float'>


In [68]:
all_dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [69]:
all_dataset

DatasetDict({
    train: Dataset({
        features: ['sent1', 'sent2', 'label'],
        num_rows: 455
    })
    test: Dataset({
        features: ['sent1', 'sent2', 'label'],
        num_rows: 114
    })
})

In [70]:
dataset_val = all_dataset['test']

In [71]:
dataset_val

Dataset({
    features: ['sent1', 'sent2', 'label'],
    num_rows: 114
})

In [72]:
dataset_train = all_dataset['train'].train_test_split(test_size=0.2,seed=42)

In [73]:
dataset_train

DatasetDict({
    train: Dataset({
        features: ['sent1', 'sent2', 'label'],
        num_rows: 364
    })
    test: Dataset({
        features: ['sent1', 'sent2', 'label'],
        num_rows: 91
    })
})

In [74]:
for example in dataset_train['train']:
    if not isinstance(example['sent1'], str) or not isinstance(example['sent2'], str):
        print("Found bad sample:", example)

In [75]:
# --- 3. Model Loading and Setup ---
# Load the pre-trained SentenceTransformer model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Choose a loss function: CosineSimilarityLoss is suitable for similarity tasks
train_loss = losses.CosineSimilarityLoss(model)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) # Common learning rate for SBERT fine-tuning

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"\nUsing device: {device}")


Using device: cuda


In [80]:
# 5. (Optional) Specify training arguments
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="evaluation_results_trained_model",
    # Optional training parameters:
    num_train_epochs=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    # Optional tracking/debugging parameters:
    metric_for_best_model="eval_loss",
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
    run_name="paraphrase-multilingual-MiniLM-L12-v2-find-turning",  # Will be used in W&B if `wandb` is installed
)

In [81]:
#6. (Optional) Create an evaluator & evaluate the base model
ese_acc_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=dataset_val["sent1"],
    sentences2=dataset_val["sent2"],
    scores=dataset_val["label"],
    name="val",
)
ese_acc_evaluator(model)

{'val_pearson_cosine': 0.7967373681726905,
 'val_spearman_cosine': 0.7909745277289721}

In [82]:
early_stopper = EarlyStoppingCallback(
    early_stopping_patience=2, # you can change this value if needed
    early_stopping_threshold=0.05 # you can change this value if needed
)

In [83]:
# 7. Create a trainer & train
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=dataset_train['train'],
    eval_dataset=dataset_train['test'],
    loss=train_loss,
    evaluator=ese_acc_evaluator,
    callbacks=[early_stopper],
)
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss,Val Pearson Cosine,Val Spearman Cosine
100,0.0232,0.031355,0.800857,0.797188
200,0.0107,0.029861,0.798798,0.790765
300,0.0052,0.028922,0.757562,0.75327


TrainOutput(global_step=300, training_loss=0.01305209219455719, metrics={'train_runtime': 107.9047, 'train_samples_per_second': 337.335, 'train_steps_per_second': 21.315, 'total_flos': 0.0, 'train_loss': 0.01305209219455719, 'epoch': 13.043478260869565})

In [86]:
# 8. Save the final model
final_output_dir = f"evaluation_results_trained_model_full_log"
model.save_pretrained(final_output_dir)