### Fine-tuning Sentence Transformers Models
> Reference : [Training&Fine-Tuning Sentence-Transformer Model](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/95_Training_Sentence_Transformers.ipynb) / [Sentence-Transformers Trainer, TrainingArguments Document](https://sbert.net/docs/package_reference/sentence_transformer/trainer.html#sentencetransformertrainer) <br>
> Model : [google/embeddinggemma-300m](https://huggingface.co/google/embeddinggemma-300m)<br>
> Dataset : [kakao KorSTS](https://github.com/kakaobrain/kor-nlu-datasets)<br>

In [None]:
# sentence-transformers >= 4.56.0
# !pip install -U sentence-transformers

In [None]:
# Download Dataset
# !git clone https://github.com/kakaobrain/kor-nlu-datasets.git ../../data/

In [2]:
import os
os.environ['TOKENIZERS_PARALLELISM']='false' # set parallelism fasle in jupyter env or colab env

In [3]:
from dotenv import load_dotenv
import os
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
WANDB_API_KEY = os.getenv("WANDB_API_KEY")

In [4]:
from huggingface_hub import login
login(token=HF_TOKEN)
import wandb
wandb.login(key=WANDB_API_KEY)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/song/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mskiersong9[0m ([33mskiersong[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
from sentence_transformers import SentenceTransformer
import torch

# Download from the 🤗 Hub
base_model = SentenceTransformer("google/embeddinggemma-300m")

# Run inference with queries and documents
query = "Which planet is known as the Red Planet?"
documents = [
    "Venus is often called Earth's twin because of its similar size and proximity.",
    "Mars, known for its reddish appearance, is often referred to as the Red Planet.",
    "Jupiter, the largest planet in our solar system, has a prominent red spot.",
    "Saturn, famous for its rings, is sometimes mistaken for the Red Planet."
]
query_embeddings = base_model.encode(query)
document_embeddings = base_model.encode(documents)
print(query_embeddings.shape, document_embeddings.shape)
# (768,) (4, 768)

# Compute similarities to determine a ranking
similarities = base_model.similarity(query_embeddings, document_embeddings)
print(similarities)
# tensor([[0.3011, 0.6359, 0.4930, 0.4889]])


(768,) (4, 768)
tensor([[0.6963, 0.8456, 0.7681, 0.7857]])


In [5]:
import os
import pandas as pd
data_dir = '../../data/kor-nlu-datasets/KorSTS/'
train_df = pd.read_csv(os.path.join(data_dir,"sts-train.tsv"), sep='\t',on_bad_lines="skip")
dev_df = pd.read_csv(os.path.join(data_dir,"sts-dev.tsv"), sep='\t',on_bad_lines="skip")
test_df = pd.read_csv(os.path.join(data_dir,"sts-test.tsv"), sep='\t',on_bad_lines="skip")

# check for NaN
print(train_df.isnull().sum())
print(dev_df.isnull().sum())
print(test_df.isnull().sum())

# fillna with empty string
train_df = train_df.fillna("")
print(train_df.isnull().sum().sum())
dev_df = dev_df.fillna("")
print(dev_df.isnull().sum().sum())
test_df = test_df.fillna("")
print(test_df.isnull().sum().sum())

genre        0
filename     0
year         0
id           0
score        0
sentence1    0
sentence2    5
dtype: int64
genre        0
filename     0
year         0
id           0
score        0
sentence1    0
sentence2    1
dtype: int64
genre        0
filename     0
year         0
id           0
score        0
sentence1    0
sentence2    3
dtype: int64
0
0
0


### Check baseline Score for EmbeddingGemma-300M

In [None]:
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
from sentence_transformers import SentenceTransformer

In [9]:
from datasets import Dataset

def create_hf_dataset_from_df(df: pd.DataFrame) -> Dataset:
    df['label'] = df['score'].apply(lambda x: x/5.0) # regularize score from [0,5] to [0,1]

    # 1. Pandas DataFrame을 기본 Dataset으로 변환
    hf_dataset = Dataset.from_pandas(df)
    
    # 'sentence1'과 'sentence2' 컬럼 이름은 그대로 유지됩니다.
    
    # 필요한 최종 컬럼만 선택
    # Trainer가 요구하는 기본 컬럼 구조: ['sentence1', 'sentence2', 'label']
    hf_dataset = hf_dataset.select_columns(['sentence1', 'sentence2', 'label'])
    
    return hf_dataset
train_hf_dataset = create_hf_dataset_from_df(train_df)
dev_hf_dataset = create_hf_dataset_from_df(dev_df)
test_hf_dataset = create_hf_dataset_from_df(test_df)

In [12]:
test_sentences1 = test_df['sentence1'].tolist()
test_sentences2 = test_df['sentence2'].tolist()
test_scores = test_df['score'].tolist()
test_evaluator = EmbeddingSimilarityEvaluator(
    test_sentences1, 
    test_sentences2, 
    test_scores,
    main_similarity='cosine',
    name='korsts-dev',
)
test_results = test_evaluator(base_model)
test_results

{'korsts-dev_pearson_cosine': 0.35650364903426973,
 'korsts-dev_spearman_cosine': 0.3698784068522297}

### SFT

In [None]:
config = {
    'num_epochs': 3,
    'train_batch_size': 8,
    'gradient_accumulation_steps': 2,
    'model_save_path': './outputs/embedding_gemma_300m_KorSTS',
    'max_steps':800,
}
from argparse import Namespace
config = Namespace(**config)

In [None]:
dev_sentences1 = dev_df['sentence1'].tolist()
dev_sentences2 = dev_df['sentence2'].tolist()
dev_scores = dev_df['score'].tolist()
evaluator = EmbeddingSimilarityEvaluator( # validation evaluator
    dev_sentences1, 
    dev_sentences2, 
    dev_scores, 
    name='korsts-dev',
)

#### Sentence-Transformers Losses
- CosineSimilarityLoss
    - 정의 : 회귀 기반 loss
    - dataset composition : (sentence1, sentence2, similarity score)
    - loss : $MSE(GT,\text{sim}(\mathbf{s1}, \mathbf{s2}))$
    - 특징 : similarity score 를 바탕으로 하는 regression loss.
    - 단점 : s1-s2 문장 쌍만을 활용하기 때문에 배치 내의 다른 문장들과의 negatives를 활용하지 못한다.
- TripletLoss
    - 정의 : 거리 기반 loss
    - dataset composition : (query_sentence, positive_sentence, negative_sentence)
    - loss : $\max(0, ||\mathbf{q} - \mathbf{p}||^2 - ||\mathbf{q}-\mathbf{n}||^2+\alpha) $ >> query-positive 임베딩 벡터 간 거리와 query-negative 임베딩 벡터 간 거리의 차이가 alpha(=margin)보다 클 때, 즉 query가 negative와 더 가까울 때 loss가 발생하여 positive 쪽으로 임베딩하도록 유도한다.
    - 특징 : 두 문장의 상대적인 관게(Q가 N보다 P에 가까워야 함)를 명확히 학습할 수 있다.
    - 단점 : 데이터 구축이 어렵다. 마찬가지로 배치 내의 다른 negative 문장을 활용하지 않으므로 효율이 떨어진다.
    - 발전된 loss : MultipleNegativeRankingLoss
- MultipleNegativesRankingLoss (=InfoNCE)
    - 정의 : batch 기반 cross-entropy loss
    - dataset composition : (sentence1, sentence2) . 두 문장은 항상 의미적으로 '유사'한 문장이다.
    - loss : $-\log\frac{\exp(s(q,d^+)/\tau)}{\sum_{i=1}^N \exp(s(q,d^i)/\tau)}$
    - 

In [None]:
train_loss = losses.CosineSimilarityLoss(model=base_model)

In [None]:
training_args = SentenceTransformerTrainingArguments(
    output_dir=config.model_save_path,
    num_train_epochs=config.num_epochs,
    max_steps=config.max_steps,
    per_device_train_batch_size=config.train_batch_size,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=2e-6,
    lr_scheduler_type='cosine',
    # lr_scheduler_kwargs=None,
    warmup_steps=0,
    #Logging & Saving
    eval_strategy='steps',
    save_strategy='steps',
    load_best_model_at_end=True,
    logging_steps=100,
    eval_steps=100,
    save_steps=100,
    save_total_limit=3,
    # OOM
    auto_find_batch_size=True,
    torch_empty_cache_steps=20,
    bf16=True,
    bf16_full_eval=True,
    # report
    report_to="wandb",
    run_name=f"steps{config.max_steps}_B{config.train_batch_size}_GA{config.gradient_accumulation_steps}",    
    # HF Hub 
    push_to_hub=True,
    hub_model_id='song9/embeddinggemma-300m-KorSTS',
    hub_strategy='end',
    hub_token=HF_TOKEN,
)

In [None]:
trainer = SentenceTransformerTrainer(
    model=base_model,
    args=training_args,
    train_dataset=train_hf_dataset,     # Dataset 
    eval_dataset=dev_hf_dataset,       # Dataset 
    loss=train_loss,            # 손실 함수 (CosineSimilarityLoss)
    evaluator=evaluator               # 평가자 (EmbeddingSimilarityEvaluator)
)

In [None]:
trainer.train()

In [None]:
test_sentences1 = test_df['sentence1'].tolist()
test_sentences2 = test_df['sentence2'].tolist()
test_scores = test_df['score'].tolist()
test_evaluator = EmbeddingSimilarityEvaluator(
    test_sentences1, 
    test_sentences2, 
    test_scores,
    main_similarity='cosine',
    name='korsts-test',
)

In [None]:
test_results = test_evaluator(trainer.model)
test_results

In [None]:
import huggingface_hub
repo_id = "song9/embeddinggemma-300m-KorSTS"
huggingface_hub.create_repo(
    repo_id, 
    exist_ok=True,
    token=HF_TOKEN,
)

In [None]:
trainer.push_to_hub(
    commit_message="Initial Commit",
    token=HF_TOKEN, # hf token
    repo_id=repo_id, # set HF repo id : {username}/{repo_name}
    language=["multilingual","ko"], # must be lowercase and ISO 639-1 format
    license="cc-by-sa-4.0", # must be lowercase
    tags=["STS"],
    model_name=repo_id, # {hf_username}/{repo_name} > used for sample code
    finetuned_from="google/embeddinggemma-300m",
    tasks="Sentence Similarity",
    dataset="kakao/KorSTS",
    exist_ok=True,
)