In [41]:
import pandas as pd
import os
from dotenv import load_dotenv
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import pandas
import openai
from openai import APIError
import os
import json
import re
import numpy as np
from sklearn.cluster import KMeans
from pprint import pprint
from pathlib import Path
import tiktoken
from typing import List, Dict
from sentence_transformers import SentenceTransformer,  models, util
from sentence_transformers.readers import InputExample
from enum import Enum
from torch.utils.data import DataLoader, random_split
from datetime import datetime
import math
import sentence_transformers.losses  as losses
from sentence_transformers import SentenceTransformerTrainingArguments
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SimilarityFunction
from sentence_transformers import SentenceTransformerTrainer
from sklearn.model_selection import train_test_split


In [42]:
def load_df(file: Path) -> pandas.DataFrame:
    loaded_table = pq.read_table(file)
    return loaded_table.to_pandas()

In [43]:
class QueryType(Enum):
    SHORT = 'short_query'
    MEDIUM = 'medium_query'
    LONG = 'long_query'

In [44]:


def convert_dataset(dataframe: pandas.DataFrame, question_type: str) -> List[InputExample]:
    dataset_samples = []
    for _, row in dataframe.iterrows():
        score = float(row['scores'][question_type]) / 5.0
        sample = InputExample(texts=[row['context'], row[question_type]],
                                 label=score)
        dataset_samples.append(sample)
    return dataset_samples

In [45]:
def get_train_and_eval_datasets():
    df = load_df(file=Path("datasets/train.parquet"))
    training_samples = convert_dataset(df, QueryType.LONG.value)
    # Convert the dataset to a DataLoader ready for training
    # Assuming training_samples is your entire dataset
    dataset_size = len(training_samples)
    train_size = int(0.8 * dataset_size)
    test_size = dataset_size - train_size

    # Split the dataset
    train_dataset, test_dataset = random_split(training_samples, [train_size, test_size])

    # Create DataLoaders
    train_batch_size = 16
    test_batch_size = 16  # You can change this as needed

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=test_batch_size)
    return train_dataloader, test_dataloader

In [46]:
def make_sentence_transformer(model_name :str) -> SentenceTransformer:
    max_seq_length = 128
    word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
    # Apply mean pooling to get one fixed sized sentence vector
    """Performs pooling (max or mean) on the token embeddings.
    Iit generates from a variable sized sentence a fixed sized sentence embedding, 
    allows to use the CLS token if it is returned by the underlying word embedding model.
    We can concatenate multiple poolings together.
    - word_embedding_dimension: Dimensions for the word embeddings
    - pooling_mode_cls_token: Use the first token (CLS token) as text representations
    - pooling_mode_max_tokens: Use max in each dimension over all tokens.
    - pooling_mode_mean_tokens: Perform mean-pooling
    """
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_cls_token=False,
                                pooling_mode_max_tokens=False,
                                pooling_mode_mean_tokens=True)
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [47]:
model_name = "nesto"
model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

args = SentenceTransformerTrainingArguments(
        # Required parameter:
        output_dir=model_save_path,
        # Optional training parameters:
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        warmup_ratio=0.1,
        fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
        bf16=False,  # Set to True if you have a GPU that supports BF16
        # batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
        # Optional tracking/debugging parameters:
        eval_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=2,
        logging_steps=100,
        run_name="proba",  # Will be used in W&B if `wandb` is installed
    )

In [48]:
def train_a_model(model_name:str, args: SentenceTransformerTrainingArguments):
    num_epochs = 10
    train_dataset, eval_dataset = get_train_and_eval_datasets()
    sentence_transformer = make_sentence_transformer(model_name)
    warmup_steps = math.ceil(len(train_dataset) * num_epochs  * 0.1) #10% of train data for warm-up
    train_loss = losses.CosineSimilarityLoss(model=sentence_transformer)
    train_loss = losses.MatryoshkaLoss(sentence_transformer, train_loss, [768, 512, 256, 128, 64])
  

    # # 6. (Optional) Create an evaluator & evaluate the base model
    dev_evaluator = EmbeddingSimilarityEvaluator(
        sentences1=eval_dataset["sentence1"],
        sentences2=eval_dataset["sentence2"],
        scores=eval_dataset["score"],
        main_similarity=SimilarityFunction.COSINE,
        name="sts-dev",
    )

    dev_evaluator(sentence_transformer)

    # 7. Create a trainer & train
    trainer = SentenceTransformerTrainer(
        model=sentence_transformer,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        loss=train_loss,
        evaluator=dev_evaluator,
    )
    trainer.train()

    # # (Optional) Evaluate the trained model on the test set
    test_evaluator = EmbeddingSimilarityEvaluator(
        sentences1=eval_dataset["sentence1"],
        sentences2=eval_dataset["sentence2"],
        scores=eval_dataset["score"],
        main_similarity=SimilarityFunction.COSINE,
        name="sts-dev",
    )
    # test_evaluator(model)

    # 8. Save the trained model
    sentence_transformer.save_pretrained("output/mpnet-base-all-nli-triplet/final")

    # 9. (Optional) Push it to the Hugging Face Hub
    # model.push_to_hub("mpnet-base-all-nli-triplet")

In [49]:
train_a_model("google-bert/bert-base-multilingual-cased", args=args)

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.huggingface.co/bert-base-multilingual-cased/876f584f15ebf14887dec17539c114bb99a032e96b9e72507a51c41e205337fc?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1724065160&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNDA2NTE2MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9iZXJ0LWJhc2UtbXVsdGlsaW5ndWFsLWNhc2VkLzg3NmY1ODRmMTVlYmYxNDg4N2RlYzE3NTM5YzExNGJiOTlhMDMyZTk2YjllNzI1MDdhNTFjNDFlMjA1MzM3ZmM%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=SVy9kmakMObE91dLTlC36XAJtYAupUTXTnGwAwatJ3WL3F-Icezlo%7El1m7i0eDq2RlMd%7E8IZq5goUHFCxumE1BCS92FrVJmSMsvxyy-qLUccy5pmWItn6eyFd2uEeFE5uCXix0jVLWrG-IX5exmm6vLY0SVi5VtsTsyKgU4wwPNYSd2h73Gu%7EKkLj%7ETx81bQdSXwrqKyI3NdLs8xxaraG7r671EHLtIcBveDv0eEaLg6muBPFOV61dPCSESmaJ-0qV%7EOt90wDl4ch1YB7x-j%7E8EAjT9OTjeX4N2Me34IGy-aZvp-CuuWOLpLHqasg22mUVp%7ETpiWQcdUcrsnepWwrQ__

ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'cdn-lfs.huggingface.co\', port=443): Max retries exceeded with url: /bert-base-multilingual-cased/876f584f15ebf14887dec17539c114bb99a032e96b9e72507a51c41e205337fc?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1724065160&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNDA2NTE2MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9iZXJ0LWJhc2UtbXVsdGlsaW5ndWFsLWNhc2VkLzg3NmY1ODRmMTVlYmYxNDg4N2RlYzE3NTM5YzExNGJiOTlhMDMyZTk2YjllNzI1MDdhNTFjNDFlMjA1MzM3ZmM~cmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=SVy9kmakMObE91dLTlC36XAJtYAupUTXTnGwAwatJ3WL3F-Icezlo~l1m7i0eDq2RlMd~8IZq5goUHFCxumE1BCS92FrVJmSMsvxyy-qLUccy5pmWItn6eyFd2uEeFE5uCXix0jVLWrG-IX5exmm6vLY0SVi5VtsTsyKgU4wwPNYSd2h73Gu~KkLj~Tx81bQdSXwrqKyI3NdLs8xxaraG7r671EHLtIcBveDv0eEaLg6muBPFOV61dPCSESmaJ-0qV~Ot90wDl4ch1YB7x-j~8EAjT9OTjeX4N2Me34IGy-aZvp-CuuWOLpLHqasg22mUVp~TpiWQcdUcrsnepWwrQ__&Key-Pair-Id=K3ESJI6DHPFC7 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000277A9E59C90>: Failed to resolve \'cdn-lfs.huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 2341fd9c-480a-4637-abde-31c127718558)')