In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, BinaryClassificationEvaluator
import math
import torch

from models import models, ModelName

# Load and inspect the dataset
csv_file = 'fine_tuning_dataset/all_merged_dataset.csv'
df = pd.read_csv(csv_file)

# Check for missing values
print("Missing values in dataset:\n", df.isnull().sum())

# Drop rows with any missing values
df.dropna(inplace=True)

# Ensure there are no empty strings
df['question'] = df['question'].apply(lambda x: x.strip() if isinstance(x, str) else x)
df['context'] = df['context'].apply(lambda x: x.strip() if isinstance(x, str) else x)

# Verify dataset after cleaning
print("Dataset after cleaning:\n", df.head())

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Ensure the validation dataset has pairs and valid data
print(f"Number of validation examples: {len(val_df)}")
print(f"Sample validation data: {val_df.head()}")

# Manually inspect some examples for a sanity check
for i in range(5):
    print(f"Example {i+1}: Question - {val_df.iloc[i]['question']}, Context - {val_df.iloc[i]['context']}")

# Create InputExamples
train_examples = [InputExample(texts=[row['question'], row['context']]) for _, row in train_df.iterrows()]
val_examples = [InputExample(texts=[row['question'], row['context']]) for _, row in val_df.iterrows()]

print(f"First few training examples: {train_examples[:5]}")
print(f"First few validation examples: {val_examples[:5]}")

# Create DataLoaders
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=8)

# Load the pre-trained model
# model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
model = SentenceTransformer('intfloat/multilingual-e5-small')

# Test embedding generation
sample_texts = [train_examples[0].texts[0], train_examples[0].texts[1]]
embeddings = model.encode(sample_texts)
print("Sample embeddings:", embeddings)

# Use MultipleNegativesRankingLoss for training
train_loss = losses.MultipleNegativesRankingLoss(model)

# Use EmbeddingSimilarityEvaluator for validation
embedding_similarity_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_examples, batch_size=8)

# Use BinaryClassificationEvaluator for a simpler evaluation test
binary_classification_evaluator = BinaryClassificationEvaluator.from_input_examples(val_examples, batch_size=8, name='validation')

# Initial validation to check for proper evaluator functioning
embedding_initial_results = embedding_similarity_evaluator(model)
binary_initial_results = binary_classification_evaluator(model)
print(f"Initial embedding evaluation results: {embedding_initial_results}")
print(f"Initial binary evaluation results: {binary_initial_results}")

# Set device
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
model.to(device)

# Fine-tune the model with early stopping
num_epochs = 50
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model_name = ModelName.MULTILINGUAL_MINILM_FINETUNING_EARLY_STOP.value
output_path = models[model_name]['local_dir']

# Early stopping parameters
patience = 5
best_score = float('-inf')
epochs_no_improve = 0

for epoch in range(num_epochs):
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=1,
        steps_per_epoch=len(train_dataloader),
        warmup_steps=warmup_steps,
        output_path=output_path,
        optimizer_params={'lr': 1e-4}  # Increase learning rate if necessary
    )

    embedding_results = embedding_similarity_evaluator(model, output_path)
    binary_results = binary_classification_evaluator(model, output_path)
    print(f"Epoch {epoch+1} embedding evaluation results: {embedding_results}")
    print(f"Epoch {epoch+1} binary evaluation results: {binary_results}")

    # Use validation_manhattan_accuracy as the metric for early stopping
    score = binary_results.get('validation_manhattan_accuracy', float('-inf'))
    
    print(f"=====> embedding score {embedding_results}, binary score {score}, best score {best_score}")

    if score > best_score:
        best_score = score
        epochs_no_improve = 0
        model.save(output_path)  # Save the best model
        print(f"New best score: {best_score}. Model saved.")
    else:
        epochs_no_improve += 1
        print(f"No improvement. {epochs_no_improve}/{patience} patience periods passed.")
    
    if epochs_no_improve >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs.")
        break

print(f"Model fine-tuning complete. Model saved to {output_path}.")

  from tqdm.autonotebook import tqdm, trange


Missing values in dataset:
 question    0
context     0
dtype: int64
Dataset after cleaning:
                                             question  \
0          MyITS saya bermasalah, bisa lapor kemana?   
1  Saya ingin mengurus '.....' siapa tendik yang ...   
2                         Cara mendapatkan transkrip   
3  Cara mendapatkan surat keterangan aktif mahasiswa   
4  Bagaimana mendapatkan translasi ke dalam Bahas...   

                                             context  
0  Silakan ajukan tiket ke DPTSI di https://servi...  
1  Temui tendik sesuai bidangnya:\n\n- Persuratan...  
2  Buka SIM Akademik, masuk ke menu LAPORAN -> TR...  
3  Buka SIM Akademik, masuk ke menu SURAT MAHASIS...  
4  Translasi dilayani oleh BURB.\nKeluhan/permint...  
Number of validation examples: 71
Sample validation data:                                               question  \
296  Apa kriteria pekerjaan yang bisa dikerjakan ol...   
81                  Jelaskan tentang program Exchange?   
77   Ap



Initial embedding evaluation results: {'pearson_cosine': nan, 'spearman_cosine': nan, 'pearson_manhattan': nan, 'spearman_manhattan': nan, 'pearson_euclidean': nan, 'spearman_euclidean': nan, 'pearson_dot': nan, 'spearman_dot': nan, 'pearson_max': nan, 'spearman_max': nan}
Initial binary evaluation results: {'validation_cosine_accuracy': 0.9859154929577465, 'validation_cosine_accuracy_threshold': 0.9445478916168213, 'validation_cosine_f1': 0, 'validation_cosine_f1_threshold': 0, 'validation_cosine_precision': 0, 'validation_cosine_recall': 0, 'validation_cosine_ap': -0.0, 'validation_dot_accuracy': 0.9859154929577465, 'validation_dot_accuracy_threshold': 0.9445480108261108, 'validation_dot_f1': 0, 'validation_dot_f1_threshold': 0, 'validation_dot_precision': 0, 'validation_dot_recall': 0, 'validation_dot_ap': -0.0, 'validation_manhattan_accuracy': 0.9859154929577465, 'validation_manhattan_accuracy_threshold': 5.143091678619385, 'validation_manhattan_f1': 0, 'validation_manhattan_f1_thr

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 48%|████▊     | 38/80 [00:10<00:10,  3.93it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB. GPU 

In [None]:
# Example evaluation - you may need to adapt according to your dataset
evaluation_examples = [
    ("How are you?", "How do you do?"),
    ("What is your name?", "What's your name?"),
    ("Where do you live?", "Where is your home located?")
]

model = SentenceTransformer(output_path)

for pair in evaluation_examples:
    embeddings = model.encode(pair)
    similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    print(f"Similarity between: '{pair[0]}' and '{pair[1]}' is {similarity.item():.4f}")