# Install Necessary Packages

In [None]:
!pip install transformers[torch]

Collecting transformers[torch]
  Using cached transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers[torch])
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers[torch])
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting torch>=2.0 (from transformers[torch])
  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting accelerate>=0.26.0 (from transformers[torch])
  Using cached accelerate-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting networkx (from torch>=2.0->transformers[torch])
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting sympy==1.13.1 (from torch>=2.0->transformers[torch])
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch>=2.0->transformers[torch])
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6

In [None]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-3.4.1


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.13-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.5.0-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Downloading aiosign

In [None]:
from sentence_transformers import SentenceTransformer
import torch
from datasets import load_dataset, Dataset


#   Load Dataset

More information about the expected dataset format is available here: https://sbert.net/docs/sentence_transformer/training_overview.html#dataset


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the full labelled data set from the CSV file.
df = pd.read_csv("df_entrenamiento3.csv")

# First cut: hold out 20% of all rows as the test split.
train_eval_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Second cut: hold out 20% of the remaining rows as the evaluation split;
# what is left is the training split.
train_data, eval_data = train_test_split(
    train_eval_data,
    test_size=0.2,
    random_state=42,
)

# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, eval_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_data, eval_data, test_data)
)

# Report how many rows ended up in each split.
print(f"Tamaño del conjunto de entrenamiento: {len(train_dataset)}")
print(f"Tamaño del conjunto de evaluación: {len(eval_dataset)}")
print(f"Tamaño del conjunto de prueba: {len(test_dataset)}")

In [None]:
# Drop the '__index_level_0__' column from every split.
# The column is only removed when it is actually present, so this cell can be
# re-run safely; the unconditional version raises on a second execution
# because the column no longer exists.
train_dataset, test_dataset, eval_dataset = (
    split.remove_columns(['__index_level_0__'])
    if '__index_level_0__' in split.column_names
    else split
    for split in (train_dataset, test_dataset, eval_dataset)
)

# Load Model

In [None]:
# 1. Load the pretrained model that will be fine-tuned.
#    (Step 2, model card data, is optional and is not provided here.)
#
# Other popular embedding models:
#   https://huggingface.co/nomic-ai/nomic-embed-text-v1
#   https://huggingface.co/BAAI/bge-large-en

model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-distilroberta-v1",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


# Setting up Training Arguments

In [None]:
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import CosineSimilarityLoss, CoSENTLoss
from sentence_transformers.training_args import BatchSamplers



In [None]:
# 3. Define the loss function used for fine-tuning.
# CoSENTLoss is the active choice; CosineSimilarityLoss (imported above) is
# the alternative that was tried:
#   loss = CosineSimilarityLoss(model)
loss = CoSENTLoss(model=model)


In [None]:
# Training configuration handed to SentenceTransformerTrainer.
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/fine-tune-1",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=160,
    per_device_eval_batch_size=160,
    warmup_ratio=0.1,
    # FP16 mixed precision is only requested when a CUDA device is visible, so
    # the notebook still runs on a CPU-only machine instead of failing here.
    # Set to False manually if your GPU can't run on FP16.
    fp16=torch.cuda.is_available(),
    bf16=False,  # Set to True if you have a GPU that supports BF16
    # NOTE(review): this sampler setting was carried over from a
    # MultipleNegativesRankingLoss example; the loss used in this notebook is
    # CoSENTLoss, so it is presumably optional here - TODO confirm.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    # Optional tracking/debugging parameters:
    eval_strategy="steps",
    eval_steps=10,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # Log the training loss at the same cadence as evaluation. With the
    # previous value of 100 the saved run shows "No log" for the training loss
    # at every recorded step (10 through 90).
    logging_steps=10,
    # NOTE(review): the run name does not match the model being fine-tuned
    # (all-distilroberta-v1); kept unchanged so existing W&B runs still line up.
    run_name="mpnet-base-all-nli-triplet",  # Will be used in W&B if `wandb` is installed
)

# Train

In [None]:
# Assemble the trainer from the model, the loss, the training arguments and
# the train/evaluation splits prepared earlier in the notebook.
trainer = SentenceTransformerTrainer(
    model=model,
    loss=loss,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)


In [None]:
# Run the fine-tuning loop. The table rendered below reports the validation
# loss on eval_dataset every `eval_steps` steps.
# NOTE(review): in the saved output the validation loss is lowest at step 20
# and mostly rises afterwards - worth checking for overfitting.
trainer.train()

Step,Training Loss,Validation Loss
10,No log,3.631134
20,No log,3.142216
30,No log,3.196457
40,No log,3.332477
50,No log,3.454187
60,No log,3.599512
70,No log,3.780658
80,No log,3.947933
90,No log,3.614004


# Test - Is the Model Any Good?

In [None]:

from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction

# Score the fine-tuned model on the evaluation (dev) split: the evaluator is
# given the sentence pairs and their gold scores from eval_dataset.
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    scores=eval_dataset["score"],
    name="sts_dev",
)
results = dev_evaluator(model)
# In the saved run the primary metric printed here is "sts_dev_spearman_cosine".
print(dev_evaluator.primary_metric)
print(results[dev_evaluator.primary_metric])

# Also score the held-out test split. It comes from the same DataFrame as the
# evaluation split (so it has the same columns) but is not passed to the
# trainer, which makes it the fairer answer to "is the model any good?".
test_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=test_dataset["sentence1"],
    sentences2=test_dataset["sentence2"],
    scores=test_dataset["score"],
    name="sts_test",
)
test_results = test_evaluator(model)
print(test_evaluator.primary_metric)
print(test_results[test_evaluator.primary_metric])
# => 0.881019449484294

sts_dev_spearman_cosine
0.7853352738169416


In [None]:

# 8. Persist the fine-tuned model to a local directory.
model.save_pretrained(path="models/all-distilroberta-v1/version3")

# 9. (Optional) Upload the model to the Hugging Face Hub:
#model.push_to_hub("mpnet-base-all-nli-triplet")