In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
import os

# Set the WANDB_DISABLED switch to "true" ahead of the training cell.
# NOTE(review): the recorded training output reports WANDB_DISABLED as
# deprecated (removal announced for v5) and points to `report_to` instead.
os.environ.update({"WANDB_DISABLED": "true"})


In [6]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models
from torch.utils.data import DataLoader
import pandas as pd
import random
import logging
from sentence_transformers import LoggingHandler
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sklearn.model_selection import train_test_split

# Configure the root logger: INFO level and above, emitted through the
# sentence-transformers LoggingHandler with a "timestamp - message" layout.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)


# Load the question/answer table. The columns read below are "question",
# "correct_answer", and every column whose name starts with "incorrect".
df = pd.read_csv("/content/sample_data/question_answer_constrast.csv")

# Resolve the incorrect-answer columns once instead of re-scanning the header
# for every row.
incorrect_columns = [col for col in df.columns if col.startswith("incorrect")]

# Build labelled (question, answer) pairs: 1.0 for the correct answer and 0.0
# for each incorrect answer that is present in the row.
train_examples = []

for _, row in df.iterrows():
    question = row["question"]
    # Without a question no valid pair can be formed from this row.
    if pd.isna(question):
        continue

    correct = row["correct_answer"]
    # Guard the positive pair the same way the negative pairs are guarded:
    # a missing cell is a NaN float, which is not usable text for the encoder.
    if pd.notna(correct):
        train_examples.append(InputExample(texts=[question, correct], label=1.0))

    for col in incorrect_columns:
        if pd.notna(row[col]):
            incorrect = row[col]
            train_examples.append(InputExample(texts=[question, incorrect], label=0.0))


# Hold out 10% of the labelled pairs for validation, with a fixed seed so the
# split is repeatable.
# NOTE(review): the split is made over individual pairs, so the same question
# can end up on both sides — confirm that this is intended.
train_data, val_data = train_test_split(
    train_examples,
    random_state=42,
    test_size=0.1,
)

# Unpack the held-out pairs into the parallel lists handed to the evaluator.
sentences1 = [example.texts[0] for example in val_data]
sentences2 = [example.texts[1] for example in val_data]
scores = [example.label for example in val_data]

evaluator = BinaryClassificationEvaluator(
    sentences1=sentences1,
    sentences2=sentences2,
    labels=scores,
)


# Assemble the encoder from explicit modules: the pretrained transformer named
# below, followed by mean pooling over its token embeddings.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
)
model = SentenceTransformer(
    modules=[
        word_embedding_model,
        pooling_model,
    ]
)


# Single source of truth for where the best checkpoint is written.
output_dir = "fine-tuned-bi-encoder"

# Batches of 16 shuffled pairs, trained with a cosine-similarity objective
# against the 1.0 / 0.0 pair labels.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model)

num_epochs = 50
# Warm up over the first 10% of all training steps (batches per epoch * epochs).
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

# Train, evaluating on the held-out pairs every 100 steps; with
# save_best_model=True the best-scoring checkpoint is written to output_dir.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=100,
    warmup_steps=warmup_steps,
    show_progress_bar=True,
    save_best_model=True,
    output_path=output_dir,
)

# Keep the final-epoch weights in their own directory. Saving them into
# output_dir would overwrite the best checkpoint that fit() stored there.
model.save(f"{output_dir}-last")
print(f"✅ Mô hình đã được huấn luyện và lưu tại: {output_dir}")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Cosine Accuracy,Cosine Accuracy Threshold,Cosine F1,Cosine F1 Threshold,Cosine Precision,Cosine Recall,Cosine Ap,Cosine Mcc
28,No log,No log,0.918367,0.6020486,0.5,0.6020486,1.0,0.333333,0.512946,0.552236
56,No log,No log,0.918367,0.40346307,0.5,0.40346307,1.0,0.333333,0.521131,0.552236
84,No log,No log,0.918367,0.54548955,0.555556,0.3580036,0.416667,0.833333,0.585179,0.511146
100,No log,No log,0.918367,0.44923937,0.571429,0.32903025,0.5,0.666667,0.574901,0.508762
112,No log,No log,0.918367,0.5252792,0.5,0.5252792,1.0,0.333333,0.524316,0.552236
140,No log,No log,0.918367,0.6026856,0.5,0.6026856,1.0,0.333333,0.541239,0.552236
168,No log,No log,0.918367,0.6564204,0.526316,0.34020406,0.384615,0.833333,0.540028,0.4806
196,No log,No log,0.918367,0.7139117,0.571429,0.37007442,0.5,0.666667,0.610673,0.508762
200,No log,No log,0.918367,0.73137796,0.571429,0.37522954,0.5,0.666667,0.609259,0.508762
224,No log,No log,0.918367,0.779562,0.571429,0.4380617,0.5,0.666667,0.586566,0.508762


Trainer is attempting to log a value of "0.4492393732070923" of type <class 'numpy.float32'> for key "eval/cosine_accuracy_threshold" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "0.32903024554252625" of type <class 'numpy.float32'> for key "eval/cosine_f1_threshold" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "0.7313779592514038" of type <class 'numpy.float32'> for key "eval/cosine_accuracy_threshold" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "0.37522953748703003" of type <class 'numpy.float32'> for key "eval/cosine_f1_threshold" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value o

✅ Mô hình đã được huấn luyện và lưu tại: fine-tuned-bi-encoder
