In [11]:
import logging
import math
import random
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import List, Tuple

import pandas
import pyarrow.parquet as pq
import sentence_transformers.losses as losses
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    models
)
from sentence_transformers.evaluation import (
    EmbeddingSimilarityEvaluator,
    SimilarityFunction,
)
from sentence_transformers.readers import InputExample
from transformers import AutoTokenizer, TrainerCallback, TrainerControl, TrainerState

# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO)


class QueryType(Enum):
    SHORT = "short_query"
    MEDIUM = "medium_query"
    LONG = "long_query"


def load_df(file: Path) -> pandas.DataFrame:
    loaded_table = pq.read_table(file)
    return loaded_table.to_pandas()


def convert_dataset(
    dataframe: pandas.DataFrame, question_type: str
) -> List[InputExample]:
    dataset_samples = []
    for _, row in dataframe.iterrows():        
        score = float(row["scores"][question_type]) / 5.0
        sample = InputExample(texts=[row["context"], row[question_type]], label=score)
        dataset_samples.append(sample)
    return dataset_samples


def convert_to_hf_dataset(input_examples: List[InputExample]) -> Dataset:
    # Convert each InputExample into a dictionary
    data_dict = {
        "sentence1": [ex.texts[0] for ex in input_examples],
        "sentence2": [ex.texts[1] for ex in input_examples],
        "score": [ex.label for ex in input_examples],
    }

    # Create a Hugging Face Dataset
    return Dataset.from_dict(data_dict)


def get_train_and_eval_datasets(
    dataset_name: Path,
) -> Tuple[Dataset, Dataset, Dataset, List]:
    # NOTE francuzi su 70:15:15 ovde je 80:10:10
    df = load_df(file=dataset_name)
    training_samples = convert_dataset(df, QueryType.LONG.value)

    random.shuffle(training_samples)

    # Manually split the dataset while retaining the original structure
    dataset_size = len(training_samples)
    train_size = int(0.8 * dataset_size)
    dev_size = int(0.1 * dataset_size)

    train_samples = training_samples[:train_size]
    dev_samples = training_samples[train_size : train_size + dev_size]
    eval_samples = training_samples[train_size + dev_size :]

    # Convert lists to Hugging Face Datasets
    train_dataset = convert_to_hf_dataset(train_samples)
    dev_dataset = convert_to_hf_dataset(dev_samples)
    eval_dataset = convert_to_hf_dataset(eval_samples)

    return train_dataset, dev_dataset, eval_dataset, eval_samples


def make_sentence_transformer(
    model_name: str, max_seq_length: int = 512
) -> SentenceTransformer:
    # word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    # # Apply mean pooling to get one fixed sized sentence vector
    # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
    #                             pooling_mode_cls_token=False,
    #                             pooling_mode_max_tokens=False,
    #                             pooling_mode_mean_tokens=True)
    # return SentenceTransformer(modules=[word_embedding_model, pooling_model])
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.model_max_length = max_seq_length  # Set the max length for the model
    tokenizer.padding_side = (
        "right"  # You can set "left" if you want to pad on the left side
    )
    # tokenizer.pad_token = tokenizer.eos_token  # Ensure the pad token is set
    model = SentenceTransformer(model_name)
    # Add the padding and truncation to the encode method
    model.tokenizer = tokenizer
    model.tokenizer_kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_seq_length,
        "return_tensors": "pt",  # Assuming you want PyTorch tensors as output
    }
    return model

class EvalLoggingCallback(TrainerCallback):
    def on_evaluate(self, args, state: TrainerState, control: TrainerControl, **kwargs):
        print("AAAAAAAAA")
        eval_output = state.log_history[-1]  # Last logged evaluation metrics
        log_file = f"evaluation_logs/eval_epoch_{state.epoch}.txt"
        with open(log_file, "a") as f:
            f.write(f"Evaluation results at epoch {state.epoch}:\n")
            f.write(str(eval_output) + "\n")
            f.write("-" * 50 + "\n")


def train_a_model(
    sentence_transformer: SentenceTransformer,
    args: SentenceTransformerTrainingArguments,
    train_dataset,
    eval_dataset,
):
    train_loss = losses.CosineSimilarityLoss(model=sentence_transformer)
    # train_loss = losses.MatryoshkaLoss(
    #     sentence_transformer, train_loss, [768, 512, 256, 128, 64]
    # )
    eval_path = Path(args.output_dir)
    # # 6. (Optional) Create an evaluator & evaluate the base model
    dev_evaluator = make_evaluator(eval_dataset, sentence_transformer, eval_path.parent)

    # 7. Create a trainer & train
    trainer = SentenceTransformerTrainer(
        model=sentence_transformer,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=train_loss,
        evaluator=dev_evaluator,
        callbacks=[EvalLoggingCallback()],
    )
    trainer.train()

    # # (Optional) Evaluate the trained model on the test set
    make_evaluator(eval_dataset, sentence_transformer, eval_path.parent)

    # 8. Save the trained model
    # TODO da li ovako cuvati
    sentence_transformer.save_pretrained(f"{eval_path.parent}/final_model")

    # 9. (Optional) Push it to the Hugging Face Hub
    # model.push_to_hub("mpnet-base-all-nli-triplet")


def make_evaluator(dataset, sentence_transformer, savePath: Path):
    dev_evaluator = EmbeddingSimilarityEvaluator(
        sentences1=dataset["sentence1"],
        sentences2=dataset["sentence2"],
        scores=dataset["score"],
        main_similarity=SimilarityFunction.COSINE,
        name="sts-dev",
        write_csv=True,
    )
    result_path = Path(f"output/{savePath.name}/eval/")
    result_path.mkdir(exist_ok=True)
    dev_evaluator(model=sentence_transformer, output_path=result_path)
    return dev_evaluator




In [None]:

def train_bi_encoder(
    num_epochs, batch_size, model_name, train_dataset, eval_dataset, model_save_path
):
    warmup_steps = math.ceil(len(train_dataset) * num_epochs * 0.1)

    args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir=model_save_path,
        # Optional training parameters:
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=2e-5,
        warmup_ratio=0.1,
        fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
        bf16=False,  # Set to True if you have a GPU that supports BF16
        # batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
        # Optional tracking/debugging parameters:
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        logging_steps=100,
        run_name="proba",  # Will be used in W&B if `wandb` is installed
        warmup_steps=warmup_steps,
        load_best_model_at_end=True,  # Automatically load the best model at the end of training
        metric_for_best_model="eval_loss",  # Assuming you're using loss as the evaluation metric
        greater_is_better=False,
        disable_tqdm=False,
        # KeyError: "The `metric_for_best_model` training argument is set to 'eval_cosine_loss', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss', 'eval_sts-dev_pearson_cosine', 'eval_sts-dev_spearman_cosine', 'eval_sts-dev_pearson_manhattan', 'eval_sts-dev_spearman_manhattan', 'eval_sts-dev_pearson_euclidean', 'eval_sts-dev_spearman_euclidean', 'eval_sts-dev_pearson_dot', 'eval_sts-dev_spearman_dot', 'eval_sts-dev_pearson_max', 'eval_sts-dev_spearman_max', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch']. Consider changing the `metric_for_best_model` via the TrainingArguments."
    )
    train_a_model(
        sentence_transformer=make_sentence_transformer(model_name),
        args=args,
        eval_dataset=eval_dataset,
        train_dataset=train_dataset,
    )

In [None]:
def main_pipeline(
    num_epochs: int, batch_size: int, model_name: str, dataset_name: Path
):
    train_dataset, dev_dataset, eval_dataset, _ = get_train_and_eval_datasets(
        dataset_name
    )
    model_save_path = Path(
        f'output/bi_encoder_{datetime.now().strftime("%d-%m-%Y_%H-%M-%S")}/model'
    )
    model_save_path.parent.mkdir(exist_ok=True, parents=True)
    train_bi_encoder(
        num_epochs, batch_size, model_name, train_dataset, eval_dataset, model_save_path
    )




## Testing

In [4]:
data = load_df("datasets/train.parquet")
data

Unnamed: 0,context,short_query,medium_query,long_query,keywords,scores
0,Kao da je nečastivi bio u meni tako se pomami...,Ko je bio pred vratima?,Zašto su vrata od košare bila otvorena i zaklj...,Kako se osećao autor kada je video čoveka pred...,"[nečastivi, košara, otvorena, brava, večerati]","{'long_query': 3, 'medium_query': 5, 'short_qu..."
1,„Jesam ga s obadve ruke obuhvatio i rastrgao ...,Kako je pasji rod umoriti?,Kako je autor opisao svoj odnos prema psu?,Kako je autor reagovao nakon što je rastrgao p...,"[pasji rod, jastreb, vaška, rastrgati, umoriti]","{'long_query': 2, 'medium_query': 3, 'short_qu..."
2,"Zatrčim se k njemu, no, čini mi se, on se nad...",Da li je dočekao kolač?,Kako je reagovao kada sam ga udarila drenovačo...,Kako je izgledao njegov izraz lica kada sam ga...,"[kolač, drenovača, udarac, glava, sreća]","{'long_query': 5, 'medium_query': 4, 'short_qu..."
3,„Al’ kako to da si sad tako siromah? Ta da ste...,Zašto si sada siromašan?,Kako si potrošio sve što si imao ako si samo s...,Kako je došlo do toga da si potrošio sve što s...,"[siromah, sedeli, jeli, pili, nesreća]","{'long_query': 3, 'medium_query': 5, 'short_qu..."
4,"“ „Ta zato — reče opet Sremac — toliko imanje,...",Kakvo je bogatstvo u zemlji?,Koje su karakteristike imanja i stoke u našoj ...,Kako se opisuje bogatstvo imanja i stoke u naš...,"[imanje, bogatstvo, zemlja, njive, stoka]","{'long_query': 5, 'medium_query': 5, 'short_qu..."
5,ŠTAMPANO U DRŽ. ŠTAMPARIJI KRALjEVINE SRBIJE A...,Koje su karakteristike Srpskog Vojvodstva?,Kako se opisuje zemlja Srpsko Vojvodstvo i koj...,Koje prirodne lepote i resursi se mogu pronaći...,"[Srpsko Vojvodstvo, Bačka, Banat, livade, konji]","{'long_query': 5, 'medium_query': 5, 'short_qu..."
6,"„Pripovedi nam, pripovedi kako si postradao —...",Kako su patili?,Možete li nam ispričati više o zapovedanju i p...,Kako su naši ljudi patili od zapovedanja i ned...,"[povreda, patnja, zapovedanje, gospoda, razume...","{'long_query': 3, 'medium_query': 5, 'short_qu..."
7,Prva vetrenjača pojavila se u Iranu u 9. veku....,Ko se smatra ocem algebre?,Koje naučne oblasti su bile značajne za Irance...,Kada se prva vetrenjača pojavila i kako su Ira...,"[vetrenjača, Iran, 9. vek, nauka, algebra]","{'long_query': 3, 'medium_query': 4, 'short_qu..."
8,Veliki broj iranskih jela kombinacija su riže...,Koje su karakteristike iranskih jela?,Kako se pripremaju tradicionalna iranska jela ...,Koje su osnovne sastojke i začine koji se kori...,"[iranska jela, riža, meso, češnjak, povrće]","{'long_query': 4, 'medium_query': 5, 'short_qu..."
9,"Iz neke navike je i ovaj prošireni ""-{ASCII}-""...",Šta su kodne strane?,Kako su kodne strane povezane sa ASCII karakte...,Kako su se razvijale kodne strane na računarim...,"[ASCII, kodne strane, PC437, jezici, karakteri]","{'long_query': 3, 'medium_query': 5, 'short_qu..."


In [6]:
inputs = convert_dataset(data, QueryType.LONG.value)

In [10]:
for i in inputs:
    print(i)

<InputExample> label: 0.6, texts:  Kao da je nečastivi bio u meni tako se pomamim, kad vidim čoveka pred vratima od košare, koja su bila otvorena — a ja sam ih mojom rukom zaključao na bravu, pre nego što sam ušao večerati; Kako se osećao autor kada je video čoveka pred vratima od košare koje je zaključao?
<InputExample> label: 0.4, texts:  „Jesam ga s obadve ruke obuhvatio i rastrgao bih ga kao jastreb tiče, da se u taj čas drugo opet nije dogodilo“ „Što to?“ „Kako sam ga lako o zemlju spustio, mislio sam da su mu se izmrvile sve koščice — jest da, nije tako lako umoriti pasji rod: u glavu treba besnu vašku, jer će te ujesti i onda kad si je u polak utukao; Kako je autor reagovao nakon što je rastrgao psa i šta je primetio o njegovoj reakciji?
<InputExample> label: 1.0, texts:  Zatrčim se k njemu, no, čini mi se, on se nadao tome kolaču, jer me dočeka s jednom drenovačom  te me lupi u po glave Sam me je Bog sačuvao i sveti njegov dan, koji slavismo, što mrtav ne padoh; Kako je izgleda

In [15]:
ddd = convert_to_hf_dataset(inputs)
ddd

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 16
})

In [26]:
for t in ddd['score']:
    print(type(t))

<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>
<class 'float'>


In [28]:
tr, dev, ev, _ = get_train_and_eval_datasets(dataset_name="datasets/TRAIN11k.parquet")

In [29]:
tr

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 9013
})

In [25]:
from pprint import pprint

pprint(tr['sentence1'][2])
pprint(tr['sentence2'][2])
pprint(tr['score'][2])

('Prva vetrenjača pojavila se u Iranu u 9. veku. Iranci su značajno doprineli '
 'razumevanju mnogih naučnih područja poput astronomije, medicine, matematike '
 'i filozofije. Muhamed el Horezmi se smatra ocem algebre.')
('Kada se prva vetrenjača pojavila i kako su Iranci doprineli razvoju nauke u '
 '9. veku?')
0.6


In [31]:
dev

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 1126
})

In [30]:
ev

Dataset({
    features: ['sentence1', 'sentence2', 'score'],
    num_rows: 1128
})

In [33]:
for i in range(100):
    pprint(tr['sentence1'][i])
    pprint(tr['sentence2'][i])
    pprint(tr['score'][i])

('Treba se u popunosti ponuditi oblikovnoj snazi primera, ali bez dijaloga sa '
 'samim sobom ne možemo stići do suštine Ispitivanje samog sebe dovodi do '
 'udvajanja U jednom pismu Montenj brani svoje pisanje kao da on nije njemu '
 'kriv Razlog pisanju nalazi se u tmini tela, crnoj žuči melanholije koja '
 'potiskuje volju Jedini voljni izbor Montenja je njegova samoća, jednaka '
 'slobodi Ispitivanje samog sebe dovodi do udvajanja U jednom pismu Montenj '
 'brani svoje pisanje kao da on nije njemu kriv Razlog pisanju nalazi se u '
 'tmini tela, crnoj žuči melanholije koja potiskuje volju')
'Kako Montenj opisuje svoju volju i samoću u kontekstu pisanja?'
0.6
(', 1995) Pored toga, kvasac se u fabrikama etanola prodaje kao nusproizvod '
 'čime se povećava ekonomičnost procesa proizvodnje etanola (Ge i sar, 2006) '
 'Obzirom da je dobro poznato da kvasac predstavlja vredan '
 'proteinsko-vitaminski dodatak u ishrani stoke, ispitan je sadrđaj proteina u '
 'sirovim SRŠR, kao i u iskoriš

In [35]:
from transformers import BertModel

# Load the model
model_name = "jerteh/Jerteh-355"  # Replace this with the correct model identifier
model = BertModel.from_pretrained(model_name)
model = model.half()
# Access the max sequence length
max_seq_length = model.config.max_position_embeddings

print(f"The maximum sequence length for the model is: {max_seq_length}")

You are using a model of type roberta to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at jerteh/Jerteh-355 and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.

The maximum sequence length for the model is: 514
