In [11]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer,  models, util
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from pathlib import Path
from typing import Iterable, Dict
from torch import nn, Tensor
import torch
from datetime import datetime


In [4]:
# Local cache directory for the downloaded dataset; the leading "~" is
# expanded to the current user's home directory.
dataset = Path("~/Datasets/SRBendding")
dataset = dataset.expanduser()
# Create the directory (and any missing parents); no error if it already exists.
dataset.mkdir(exist_ok=True, parents=True)

# Echo the expanded path as the cell output.
dataset

PosixPath('/home/selena/Datasets/SRBendding')

In [5]:
# Load the French ("fr") configuration of stsb_multi_mt once per split, in the
# order train / dev / test, caching the files under `dataset`.
df_train, df_valid, df_test = (
    load_dataset("stsb_multi_mt", name="fr", split=split_name, cache_dir=dataset)
    for split_name in ("train", "dev", "test")
)

In [9]:
# for df in df_train:
#     print(df)  # {'sentence1': 'Un avion est en train de décoller.', 'sentence2': 'Un avion est en train de décoller.', 'similarity_score': 5.0}


In [7]:
def convert_dataset(dataset, max_score: float = 5.0):
    """Convert similarity-scored sentence pairs into ``InputExample`` objects.

    :param dataset: iterable of records, each indexable by the keys
        ``'sentence1'``, ``'sentence2'`` and ``'similarity_score'``.
    :param max_score: value every raw similarity score is divided by. The
        default of 5.0 reproduces the original hard-coded normalisation.
    :return: list with one ``InputExample`` per record; its ``texts`` are the
        two sentences and its ``label`` is the rescaled score.
    """
    # TODO: what would this be for our own data? We have no similarity score --
    # should we construct one, e.g. by crossing sentence pairs?
    return [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            # Dividing by the scale maximum maps scores on a 0 ... max_score
            # scale to the range 0 ... 1.
            label=float(record['similarity_score']) / max_score,
        )
        for record in dataset
    ]

# Turn each split into a list of InputExample objects.
train_samples, dev_samples, test_samples = map(
    convert_dataset, (df_train, df_valid, df_test)
)

# Wrap the training examples in a shuffling DataLoader ready for training.
batch_size = 16
train_dataloader = DataLoader(train_samples, batch_size=batch_size, shuffle=True)


cos_score_transformation: This is a transformation function applied to the cosine similarity score before calculating the loss. By default, it is nn.Identity(), which means no transformation is applied.  

sentence_features: This is an iterable containing two dictionaries, each representing the features of a sentence (typically a tokenized version of the sentence). The model processes these features to generate sentence embeddings.  

labels: This is a tensor containing the target similarity scores (labels) for each pair of sentences.  


For the last line of code:  
The transformed cosine similarity (`output`) is compared to the target labels using the loss function (`self.loss_fct`). The `.view(-1)` call flattens the label tensor to one dimension if it is not already.  

In [10]:
class CosineSimilarityLoss(nn.Module):
    """
    Regression loss on the cosine similarity of two sentence embeddings.

    CosineSimilarityLoss expects that the InputExamples consist of two texts and a float label.
    It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures
    the cosine similarity between the two.
    Minimizes the following loss:
                   ||input_label - cos_score_transformation(cosine_sim(u,v))||_2.

    :param model: SentenceTransformer model
    :param loss_fct: loss function used to compare cosine_similarity(u,v) with the input_label.
                  Default: MSE = ||input_label - cosine_sim(u,v)||_2
    :param cos_score_transformation: function applied on top of the cosine similarity before
                  the loss is computed. The default, nn.Identity(), leaves the score unchanged.
    """
    def __init__(self, model: "SentenceTransformer",
                 loss_fct = nn.MSELoss(),  # TODO: is a different loss function needed here?
                 cos_score_transformation=nn.Identity()):
        super().__init__()
        self.model = model
        self.loss_fct = loss_fct
        self.cos_score_transformation = cos_score_transformation

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        """Compute the loss for one batch of sentence pairs.

        :param sentence_features: iterable of feature dicts, one per side of the pair;
            only the first two resulting embeddings are compared.
        :param labels: target similarity scores; flattened and cast to the dtype of the
            predicted scores before being passed to ``loss_fct``.
        :return: whatever ``loss_fct`` returns for (predicted scores, labels).
        """
        embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
        output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1]))
        # Labels may arrive as float64 or integer tensors (e.g. built from Python or
        # numpy numbers); a dtype mismatch with `output` makes regression losses such
        # as MSELoss fail, so align the dtype here. This is a no-op when they match.
        targets = labels.view(-1).to(output.dtype)
        return self.loss_fct(output, targets)

In [None]:
# Base checkpoint and a timestamped output directory for this training run.
model_name = "camembert/camembert-large"
model_save_path = f"output/training_stsbenchmark_{model_name.replace('/', '-')}-{datetime.now():%Y-%m-%d_%H-%M-%S}"
max_seq_length = 128
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed-size sentence vector.
#
# Notes on models.Pooling:
#   Performs pooling (max or mean) on the token embeddings.
#   It generates a fixed-size sentence embedding from a variable-size sentence and
#   allows using the CLS token if it is returned by the underlying word embedding model.
#   Multiple poolings can be concatenated together.
#   - word_embedding_dimension: dimensions for the word embeddings
#   - pooling_mode_cls_token: use the first token (CLS token) as text representation
#   - pooling_mode_max_tokens: use max in each dimension over all tokens
#   - pooling_mode_mean_tokens: perform mean pooling
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

# Chain the transformer and the pooling layer into one sentence-embedding model.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])