In [1]:
# Pinned to the versions recorded in this cell's install log so a fresh run
# resolves the same packages; %pip installs into the running kernel's environment.
%pip install sentence_transformers==3.0.1 datasets==2.21.0

Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0

In [2]:
import csv
import random

from sentence_transformers.readers import InputExample

  from tqdm.autonotebook import tqdm, trange


In [3]:

def load_train_kor_sts(filename):
    """Read a tab-separated STS file into a list of sentence-pair examples.

    The file must start with a header row naming at least the ``sentence1``,
    ``sentence2`` and ``score`` columns.

    Args:
        filename: Path to the UTF-8 encoded TSV file.

    Returns:
        A list with one ``InputExample`` per data row, in file order. Each
        example holds the two sentences as ``texts`` and ``score / 5.0`` as
        its ``label``.
    """
    with open(filename, "rt", encoding="utf-8") as tsv_file:
        # QUOTE_NONE: quote characters inside a sentence are kept as plain text.
        records = csv.DictReader(tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
        return [
            InputExample(
                label=float(record["score"]) / 5.0,
                texts=[record["sentence1"], record["sentence2"]],
            )
            for record in records
        ]


def load_train_kor_nli(filename):
    """Build (anchor, positive, negative) triplets from a tab-separated NLI file.

    The file must start with a header row naming the ``sentence1``,
    ``sentence2`` and ``gold_label`` columns. Every sentence is indexed, in
    both directions, by the sentences it is paired with under each label.
    For each sentence that has at least one ``entailment`` partner and at
    least one ``contradiction`` partner, two examples are produced:
    ``[sentence, entailment, contradiction]`` and
    ``[entailment, sentence, contradiction]``, with the partners drawn via
    ``random.choice``.

    Candidates are sorted before sampling so that seeding ``random`` yields
    the same triplets on every run. Rows with a missing field or a label
    other than the three known ones are skipped and reported in one warning.

    Args:
        filename: Path to the UTF-8 encoded TSV file.

    Returns:
        A list of ``InputExample`` objects with three ``texts`` each.
    """
    import logging  # local import so this definition does not rely on a later cell

    valid_labels = ("contradiction", "entailment", "neutral")
    data = {}

    def add_sampling(samplingA, samplingB, label):
        # Lazily create the per-label buckets the first time a sentence is seen.
        if samplingA not in data:
            data[samplingA] = {name: set() for name in valid_labels}
        data[samplingA][label].add(samplingB)

    skipped_rows = 0
    with open(filename, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            first = row["sentence1"]
            second = row["sentence2"]
            label = row["gold_label"]
            # DictReader fills absent trailing fields with None; such rows and
            # rows with an unrecognised label cannot be bucketed.
            if first is None or second is None or label not in valid_labels:
                skipped_rows += 1
                continue
            first = first.strip()
            second = second.strip()
            add_sampling(first, second, label)
            add_sampling(second, first, label)

    if skipped_rows:
        logging.warning("Skipped %d malformed or unlabeled rows in %s", skipped_rows, filename)

    samples = []
    for sampling, etc in data.items():
        # Iteration order of a set of strings changes between interpreter runs
        # (hash randomization), so sort to make seeded random.choice repeatable.
        entailments = sorted(etc["entailment"])
        contradictions = sorted(etc["contradiction"])
        if not entailments or not contradictions:
            continue
        samples.append(
            InputExample(texts=[sampling, random.choice(entailments), random.choice(contradictions)])
        )
        samples.append(
            InputExample(texts=[random.choice(entailments), sampling, random.choice(contradictions)])
        )
    return samples

In [4]:
import glob
import logging
import math
import os
import random
from datetime import datetime

import numpy as np
import torch
from sentence_transformers import LoggingHandler, SentenceTransformer, datasets, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader

# Emit INFO-level records through the sentence-transformers LoggingHandler.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

# Training configuration -- modify as needed.
model_path = "Alibaba-NLP/gte-multilingual-base"  # checkpoint handed to SentenceTransformer
max_seq_length = 256
nli_batch_size = 64  # batch size of the NLI data loader
sts_batch_size = 8  # batch size of the STS data loader and the dev evaluator
num_epochs = 10
eval_steps = 1000  # passed to model.fit as evaluation_steps
learning_rate = 2e-5
seed = 500  # passed to set_seeds

def set_seeds(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value handed unchanged to every generator's seeding function.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    # Only reached when PyTorch reports CUDA as available.
    torch.cuda.manual_seed(seed)

In [5]:
set_seeds(seed)

# One output directory per run, named after the start time of this cell.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = os.path.join("output", "gte-kor-turbo-" + run_stamp)
os.makedirs(model_save_path, exist_ok=True)

# Use trust_remote_code=True when loading the SentenceTransformer model
# (the checkpoint downloads its own configuration.py / modeling.py).
model = SentenceTransformer(model_path, trust_remote_code=True)

# Reuse the loaded first module and pair it with a new mean-pooling layer;
# this replaces every other module of the loaded pipeline.
base_model = model._first_module()
pooling_model = models.Pooling(base_model.get_word_embedding_dimension(), pooling_mode="mean")
model = SentenceTransformer(modules=[base_model, pooling_model])

# Apply the configured input length limit. Without this assignment the
# max_seq_length setting was defined but never reached the model.
model.max_seq_length = max_seq_length

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/123k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
nli_load_path = "dataset/KorNLI"
sts_load_path = "dataset/KorSTS"

logging.info("Read KorNLI, STS dataset")

# glob returns matches in arbitrary order; sort so the training files (and
# therefore the samples built from them) are processed in the same order on every run.
nli_train_datasets = sorted(glob.glob(os.path.join(nli_load_path, "*train.ko.tsv")))
if not nli_train_datasets:
    # Surface a missing or misplaced dataset here rather than as an empty data loader later.
    logging.warning("No '*train.ko.tsv' files found under %s", nli_load_path)
dev_sts_path = os.path.join(sts_load_path, "sts-dev.tsv")

In [10]:
# Collect the triplets of every NLI training file, keeping the file order.
nli_train_samples = [
    triplet
    for nli_file in nli_train_datasets
    for triplet in load_train_kor_nli(nli_file)
]

nli_train_dataloader = datasets.NoDuplicatesDataLoader(
    nli_train_samples,
    batch_size=nli_batch_size,
)
nli_train_loss = losses.MultipleNegativesRankingLoss(model)

# STS training pairs, optimised with a cosine-similarity loss.
sts_dataset_path = "dataset/KorSTS"
sts_train_file = os.path.join(sts_dataset_path, "sts-train.tsv")

sts_train_samples = load_train_kor_sts(sts_train_file)
sts_train_dataloader = DataLoader(
    sts_train_samples,
    shuffle=True,
    batch_size=sts_batch_size,
)
sts_train_loss = losses.CosineSimilarityLoss(model=model)

# The dev split is read with the same STS loader and scored by the evaluator.
dev_samples = load_train_kor_sts(dev_sts_path)
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    batch_size=sts_batch_size,
    name="sts-dev",
)

nli_num_batches = len(nli_train_dataloader)
sts_num_batches = len(sts_train_dataloader)
print("Length of NLI data loader:", nli_num_batches)
print("Length of STS data loader:", sts_num_batches)
steps_per_epoch = min(nli_num_batches, sts_num_batches)

# Despite its name, epoch_steps is the warm-up step count: 10% of all
# training steps. The training cell passes it to model.fit as warmup_steps.
epoch_steps = math.ceil(steps_per_epoch * num_epochs * 0.1)
logging.info("epoch-steps: {}".format(epoch_steps))


Length of NLI data loader: 8885
Length of STS data loader: 719


In [11]:
# Each objective pairs a data loader with the loss applied to its batches.
train_objectives = [
    (nli_train_dataloader, nli_train_loss),
    (sts_train_dataloader, sts_train_loss),
]
model.fit(
    train_objectives=train_objectives,
    evaluator=dev_evaluator,
    epochs=num_epochs,
    optimizer_params={"lr": learning_rate},
    evaluation_steps=eval_steps,
    warmup_steps=epoch_steps,
    output_path=model_save_path,
)

# Reload the model written to model_save_path (trust_remote_code=True, as for
# the base checkpoint) and score it on the held-out STS test split.
model = SentenceTransformer(model_save_path, trust_remote_code=True)
logging.info("Start benchmark test dataset")

test_file = os.path.join(sts_dataset_path, "sts-test.tsv")
test_samples = load_train_kor_sts(test_file)

test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name="sts-test",
)
# Kept as the cell's last expression so the returned metrics are displayed.
test_evaluator(model, output_path=model_save_path)

Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine,Sts-dev Pearson Manhattan,Sts-dev Spearman Manhattan,Sts-dev Pearson Euclidean,Sts-dev Spearman Euclidean,Sts-dev Pearson Dot,Sts-dev Spearman Dot,Sts-dev Pearson Max,Sts-dev Spearman Max
1000,0.229,No log,0.858991,0.858775,0.824528,0.828948,0.826413,0.830631,0.768883,0.791405,0.858991,0.858775
1439,0.229,No log,0.861028,0.862134,0.805702,0.80882,0.807666,0.810938,0.775075,0.793663,0.861028,0.862134
2000,0.1393,No log,0.858028,0.858038,0.807258,0.813699,0.809013,0.815446,0.74978,0.778043,0.858028,0.858038
2878,0.0821,No log,0.854626,0.855881,0.80305,0.808008,0.804047,0.808568,0.753913,0.77891,0.854626,0.855881
3000,0.0843,No log,0.853576,0.853807,0.803296,0.807991,0.804571,0.809131,0.754921,0.779587,0.853576,0.853807
4000,0.0349,No log,0.855968,0.855753,0.796958,0.803958,0.79789,0.804879,0.74273,0.773791,0.855968,0.855753
4317,0.0349,No log,0.855507,0.856369,0.795194,0.802185,0.795821,0.802352,0.74328,0.777752,0.855507,0.856369
5000,0.0195,No log,0.852333,0.852684,0.789693,0.797202,0.790131,0.797484,0.745418,0.778541,0.852333,0.852684
5756,0.0181,No log,0.851779,0.85234,0.783175,0.789252,0.783898,0.790026,0.727303,0.765721,0.851779,0.85234
6000,0.0174,No log,0.854313,0.85452,0.789086,0.795694,0.789951,0.796385,0.74292,0.777523,0.854313,0.85452


Computing widget examples:   0%|          | 0/2 [00:00<?, ?example/s]

{'sts-test_pearson_cosine': 0.8069414831172336,
 'sts-test_spearman_cosine': 0.8093097331207868,
 'sts-test_pearson_manhattan': 0.7249630747888534,
 'sts-test_spearman_manhattan': 0.7326850346990167,
 'sts-test_pearson_euclidean': 0.727283972664176,
 'sts-test_spearman_euclidean': 0.734226546536339,
 'sts-test_pearson_dot': 0.5790210756163109,
 'sts-test_spearman_dot': 0.6552036518432575,
 'sts-test_pearson_max': 0.8069414831172336,
 'sts-test_spearman_max': 0.8093097331207868}