In [26]:
import logging
import math
import random
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import List, Tuple

import pandas
import pyarrow.parquet as pq
import sentence_transformers.losses as losses
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.evaluation import (
    EmbeddingSimilarityEvaluator,
    SimilarityFunction,
)
from sentence_transformers.readers import InputExample
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainerCallback, TrainerControl, TrainerState

# Configure the root logger once so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)


class QueryType(Enum):
    """Named query categories.

    A member's ``.value`` is the plain string handed to the dataset helpers as
    ``question_type`` (e.g. ``QueryType.SHORT.value``).
    """

    SHORT = "short_query"
    MEDIUM = "medium_query"
    LONG = "long_query"


def load_pandas_df(file: Path) -> pandas.DataFrame:
    """Read the Parquet file at ``file`` and return its contents as a pandas DataFrame.

    Args:
        file: Location of the Parquet file to read.

    Returns:
        The table read by ``pyarrow.parquet.read_table``, converted with ``to_pandas()``.
    """
    return pq.read_table(file).to_pandas()


def convert_to_hf_dataset(
    dataframe: pandas.DataFrame, question_type: str, negative_offset: int = 10
) -> Dataset:
    """Build a sentence-pair dataset of positive and negative examples from ``dataframe``.

    For every row, two records are emitted, in this order:

    * a positive pair ``(context, first quarter of that context's words)`` with score ``1``;
    * a negative pair ``(context, first quarter of another row's context)`` with score ``0``.

    The "other row" is the one ``negative_offset`` positions further down the frame,
    wrapping around at the end. Rows are addressed strictly by *position*: the frame's
    index labels are ignored, so a shuffled or non-integer index (for example the
    output of ``train_test_split``) cannot make a row be paired with itself.

    Args:
        dataframe: Frame with a ``context`` column of strings.
        question_type: Currently unused; kept so existing callers keep working.
        negative_offset: Positional distance between a row and the row supplying its
            negative example. If it is a multiple of the row count, a distance of 1
            is used instead so that the negative never comes from the row itself.

    Returns:
        A ``Dataset`` with columns ``sentence1``, ``sentence2`` and ``score``. A frame
        with a single row yields only its positive pair (there is no other row to
        draw a negative from); an empty frame yields an empty dataset.
    """

    def leading_quarter(text: str) -> str:
        # First quarter (by space-separated word count) of ``text``; empty for < 4 words.
        words = text.split(" ")
        return " ".join(words[: len(words) // 4])

    contexts = dataframe["context"].tolist()
    size = len(contexts)

    # Positional step to the negative row. A step of 0 would pair a row with itself,
    # so fall back to 1; with fewer than two rows no negative exists at all.
    if size > 1:
        step = negative_offset % size or 1
    else:
        step = 0

    data_dict = {
        "sentence1": [],
        "sentence2": [],
        "score": [],
    }
    for position, context in enumerate(contexts):
        data_dict["sentence1"].append(context)
        data_dict["sentence2"].append(leading_quarter(context))
        data_dict["score"].append(1)

        if step == 0:
            continue

        negative_context = contexts[(position + step) % size]
        data_dict["sentence1"].append(context)
        data_dict["sentence2"].append(leading_quarter(negative_context))
        data_dict["score"].append(0)

    # Create a Hugging Face Dataset
    return Dataset.from_dict(data_dict)


def sanity_check(train_df, eval_df):
    """Print the share of each ``dataset`` value in the train frame, then in the eval frame.

    Both frames must have a ``dataset`` column. The counts for both frames are
    computed before anything is printed.
    """
    train_counts, eval_counts = (
        frame["dataset"].value_counts() for frame in (train_df, eval_df)
    )
    for counts in (train_counts, eval_counts):
        print(counts / counts.sum())

def get_train_and_eval_datasets(
    dataset_name: Path,
    question_type: str,
) -> Tuple[Dataset, Dataset]:
    """Load a Parquet file, split it 80/20, and build the train and eval datasets.

    Args:
        dataset_name: Location of the Parquet file to load.
        question_type: Forwarded unchanged to ``convert_to_hf_dataset``.

    Returns:
        ``(train_dataset, eval_dataset)``.
    """
    full_frame = load_pandas_df(file=dataset_name)

    # Fixed random_state keeps the split reproducible across runs.
    train_frame, eval_frame = train_test_split(
        full_frame, test_size=0.2, random_state=42
    )

    # Prints the per-source proportions of both splits for a quick visual comparison.
    sanity_check(train_frame, eval_frame)

    return (
        convert_to_hf_dataset(train_frame, question_type),
        convert_to_hf_dataset(eval_frame, question_type),
    )

In [27]:
# Build the train/eval pair datasets from the training Parquet file.
train_dataset, eval_dataset = get_train_and_eval_datasets(
    dataset_name="datasets/TRAIN11k_fixed_v2.parquet",
    question_type=QueryType.SHORT.value,
)

dataset
wiki          0.632590
science       0.180130
news          0.150464
literature    0.036816
Name: count, dtype: float64
dataset
wiki          0.647034
science       0.173709
news          0.145113
literature    0.034144
Name: count, dtype: float64


In [28]:
from pprint import pprint

# Eyeball the first ten training records: anchor text, paired text, then the label.
for row_idx in range(10):
    record = train_dataset[row_idx]
    pprint(record['sentence1'])
    print(record['sentence2'])
    print(record['score'])

('To sredstvo mogu biti sami akteri kao nosioci i prenosioci informacija ali '
 'može biti i okruženje Okruženje može biti zajedničko, neposredno, kada su '
 'akteri u istom prostoru, ali se informacija prenosi i među razdvojenim '
 'okruženjima, f zičkom vezom, putem kojim se kreću prenosioci informacija - '
 'akteri u kretanju kroz prostor, predmeti koje se prenose ili ono što '
 'smatramo čistom informacionom razmenom iako ima određeni f zički okvir: zvuk '
 'koji se prenosi kroz materiju, električni signal u telefonskim provodnicima, '
 'i nosioci elektromagnetnog zračenja u prostoru ili optičkim provodnicima')
To sredstvo mogu biti sami akteri kao nosioci i prenosioci informacija ali može biti i okruženje Okruženje može biti zajedničko, neposredno,
1
('To sredstvo mogu biti sami akteri kao nosioci i prenosioci informacija ali '
 'može biti i okruženje Okruženje može biti zajedničko, neposredno, kada su '
 'akteri u istom prostoru, ali se informacija prenosi i među razdvojenim '
 '