In [22]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

In [26]:
df = pd.read_excel('data/FineTsent.xlsx')
df.head()

Unnamed: 0,sent1,sent2,score,normalize
0,ผู้ป่วยชื่ออะไร,ให้คนไข้,2,0.4
1,ไอบูโปรเพน,ไอบูโปรเพน,5,1.0
2,ความแรง400มิลลิกรัม,ความแรง400มิลลิกรัม,5,1.0
3,จำนวน10เม็ด,จำนวน10เม็ด,5,1.0
4,ใช้เพื่อบรรเทาอาการปวด,ใช้เพื่อบรรเทาอาการปวด,5,1.0


In [27]:
df.drop('score', axis=1, inplace=True)
df.head()

Unnamed: 0,sent1,sent2,normalize
0,ผู้ป่วยชื่ออะไร,ให้คนไข้,0.4
1,ไอบูโปรเพน,ไอบูโปรเพน,1.0
2,ความแรง400มิลลิกรัม,ความแรง400มิลลิกรัม,1.0
3,จำนวน10เม็ด,จำนวน10เม็ด,1.0
4,ใช้เพื่อบรรเทาอาการปวด,ใช้เพื่อบรรเทาอาการปวด,1.0


In [28]:
train_examples_raw = [InputExample(texts=[row.sent1, row.sent2], label=float(row.normalize))
                      for row in df.itertuples(index=False)]
# for row in df.itertuples(index=False):
#     print(row)

In [29]:
train_examples, temp_examples = train_test_split(train_examples_raw, test_size=0.3, random_state=42)
val_examples, test_examples = train_test_split(temp_examples, test_size=0.5, random_state=42) # 50% of temp for val, 50% for test

print(f"\nTrain examples: {len(train_examples)}")
print(f"Validation examples: {len(val_examples)}")
print(f"Test examples: {len(test_examples)}")


Train examples: 398
Validation examples: 85
Test examples: 86


In [30]:
# Create DataLoaders for each split
batch_size = 16
# When using model.fit(), it internally handles the DataLoader creation and collation for InputExample objects.
# So, we pass the list of InputExample objects directly to train_objectives for model.fit.
# However, for evaluators, we still need DataLoaders.
# For validation and test sets, we will pass them to the evaluator directly.
train_dataloader_for_loss_calc = DataLoader(train_examples, batch_size=batch_size, shuffle=True) # Used for logging train loss in a custom way if needed, but not directly by model.fit for its training steps

In [31]:
# --- 3. Model Loading and Setup ---
# Load the pre-trained SentenceTransformer model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Choose a loss function: CosineSimilarityLoss is suitable for similarity tasks
train_loss = losses.CosineSimilarityLoss(model)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) # Common learning rate for SBERT fine-tuning

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"\nUsing device: {device}")


Using device: cuda


In [34]:
import os

# --- 4. Training using model.fit() with Evaluator for Metrics ---
# This resolves the TypeError and provides validation metrics.
# model.fit() handles the internal DataLoader creation and batching of InputExample objects.

epochs = 100
warmup_steps = 100

# Define evaluation output path
evaluation_output_path = "evaluation_results_trained_model" # Changed output path to avoid conflicts
os.makedirs(evaluation_output_path, exist_ok=True)

# Create an evaluator for the validation set
# IMPORTANT: EmbeddingSimilarityEvaluator does NOT take `main_score_function` as an argument.
# It automatically calculates various metrics including MSE, Pearson, Spearman.
# We will read MSE from the generated CSV.
val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    val_examples, # This is a list of InputExample objects
    name='val', # Name for the evaluation set
)

print("\nStarting training using model.fit()...")
print("Validation metrics will be saved to CSV after each epoch.")

# Train the model using model.fit()
# It will automatically handle the DataLoader and collate_fn for InputExample objects
model.fit(
    train_objectives=[(train_dataloader_for_loss_calc, train_loss)], # train_dataloader_for_loss_calc is used by model.fit internally
    evaluator=val_evaluator, # Pass the validation evaluator
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path=evaluation_output_path, # Save model and evaluation results
    evaluation_steps=500, # Evaluate after each epoch
    show_progress_bar=True # Show progress bar during training
)
print("Training complete! Model and evaluation results saved.")


Starting training using model.fit()...
Validation metrics will be saved to CSV after each epoch.


Step,Training Loss,Validation Loss,Val Pearson Cosine,Val Spearman Cosine
25,No log,No log,0.761612,0.757321
50,No log,No log,0.758956,0.748845
75,No log,No log,0.757598,0.747781
100,No log,No log,0.776346,0.773721
125,No log,No log,0.76399,0.760706
150,No log,No log,0.75937,0.748343
175,No log,No log,0.770147,0.770497
200,No log,No log,0.763349,0.756518
225,No log,No log,0.758995,0.749176
250,No log,No log,0.768813,0.756006


Training complete! Model and evaluation results saved.


In [None]:
from sentence_transformers import InputExample

# Convert data into InputExample objects with a label
train_examples = [InputExample(texts=[row.sent1, row.sent2], label=row.normalize) for row in df.itertuples(index=False)]

print(train_examples)  # Debugging: Ensure examples are correctly created

[<sentence_transformers.readers.InputExample.InputExample object at 0x0000020171DC56A0>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171D37080>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171C61E50>, <sentence_transformers.readers.InputExample.InputExample object at 0x00000201487E1100>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171A7A540>, <sentence_transformers.readers.InputExample.InputExample object at 0x00000201715D9FA0>, <sentence_transformers.readers.InputExample.InputExample object at 0x000002012FE911F0>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171DF6240>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171DF6150>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171DF52B0>, <sentence_transformers.readers.InputExample.InputExample object at 0x0000020171DF4A70>, <sentence_transformers.readers.

In [5]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, losses

# Load model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Create DataLoader
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

# Choose a loss function
train_loss = losses.CosineSimilarityLoss(model)

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=100,
    warmup_steps=100
)
# Save the trained model
model.save("trained_model")
print("Model saved successfully!")

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.0228
1000,0.0022
1500,0.0015
2000,0.0012
2500,0.001
3000,0.0009
3500,0.0007


Model saved successfully!


In [6]:
from sentence_transformers import SentenceTransformer

# Load the trained model
model = SentenceTransformer("trained_model")
print("Model loaded successfully!")
model.eval()

Model loaded successfully!


SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [7]:
from sentence_transformers.util import cos_sim

# Sample sentences for evaluation
sentences = [
    "ใช้สำหรับหยอดหูเพื่อรักษาอาการติดเชื้อที่หู",
    "ใช้รักษาหูชั้นกลางอักเสบ",
    "ใช้รักษาอาการติดเชื้อที่หู"
]

# Generate embeddings
embeddings = model.encode(sentences, convert_to_tensor=True)

# Compute cosine similarity
similarity_matrix = cos_sim(embeddings, embeddings)

print("Cosine Similarity Matrix:\n", similarity_matrix)

Cosine Similarity Matrix:
 tensor([[1.0000, 0.8284, 0.6061],
        [0.8284, 1.0000, 0.6281],
        [0.6061, 0.6281, 1.0000]], device='cuda:0')


In [8]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sentence_transformers.util import cos_sim

# Extract sentences and scores
sentences1 = df['sent1']
sentences2 = df['sent2']
human_scores = df['normalize']

# Compute embeddings
sent1_emb = model.encode(sentences1, convert_to_tensor=True)
sent2_emb = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities (full matrix)
cosine_similarities = cos_sim(sent1_emb, sent2_emb).cpu().numpy()

# Extract diagonal values (correct pairwise similarity)
cosine_similarities = np.diag(cosine_similarities)

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(human_scores, cosine_similarities)

print(f"✅ Mean Squared Error (MSE): {mse:.4f}")

✅ Mean Squared Error (MSE): 0.0005


In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sentence_transformers.util import cos_sim

# Extract sentences and scores
sentences1 = df['sent1']
sentences2 = df['sent2']
human_scores = df['normalize']

# Compute embeddings
sent1_emb = model.encode(sentences1, convert_to_tensor=True)
sent2_emb = model.encode(sentences2, convert_to_tensor=True)

# Compute cosine similarities (full matrix)
cosine_similarities = cos_sim(sent1_emb, sent2_emb).cpu().numpy()

# Extract diagonal values (correct pairwise similarity)
cosine_similarities = np.diag(cosine_similarities)

# Compute Mean Squared Error (MSE)
mse = mean_squared_error(human_scores, cosine_similarities)

print(f"✅ Mean Squared Error (MSE): {mse}")

✅ Mean Squared Error (MSE): 0.0005117414885813106
