In [None]:
import pandas as pd
import os 

In [None]:
# --- Training run settings -------------------------------------------------
output_directory = "./hf_output"   # where the Trainer writes its output
epochs = 3
batch_size = 32                    # used for both training and evaluation batches
evaluation_strategy = "epoch"
evaluation_function = "f1"         # name of the metric loaded with `evaluate`

# --- Dataset columns -------------------------------------------------------
input_column = "receipt_text"      # text fed to the tokenizer
label_column = "coicop_level_1"    # target category

In [None]:
# Root folder of the SSI data. The historical location stays the default, but it
# can be overridden by setting the SSI_DATA_DIRECTORY environment variable so the
# notebook also runs on machines where that mount does not exist.
data_directory = os.environ.get("SSI_DATA_DIRECTORY", "/netappdata/ssi_tdjg/data/ssi/")

# Sub-folder that holds the extracted feature files.
features_directory = os.path.join(data_directory, "feature_extraction")

In [None]:
# Full path of the parquet file that is loaded in the next step.
hf_labse_features_filename = os.path.join(
    features_directory,
    "ssi_hf_labse_unique_values.parquet",
)

In [None]:
# Read the parquet file and keep only the text column and the label column.
hf_labse_features = pd.read_parquet(
    hf_labse_features_filename,
    engine="pyarrow",
).loc[:, [input_column, label_column]]

# Preview the first rows.
hf_labse_features.head()

In [None]:
# From: https://huggingface.co/docs/transformers/training

from typing import Tuple
from sklearn.model_selection import train_test_split
from datasets import Dataset


def split_data(dataframe: pd.DataFrame, coicop_level: str = "coicop_level_1", test_size: float = 0.2, random_state: int = 42) -> Tuple[Dataset, Dataset]:
    """Split a labelled dataframe into stratified train and test ``Dataset``s.

    The rows are split with ``sklearn``'s ``train_test_split``, stratified on
    ``coicop_level``. Each split is converted to a ``datasets.Dataset`` that
    carries an extra ``label`` column typed as a ``ClassLabel``.

    The label vocabulary is built once from the *full* dataframe (sorted string
    values of ``coicop_level``), so train and test always share the same
    class-name -> integer mapping, even if a class is absent from one split.

    Args:
        dataframe: Source data; must contain the ``coicop_level`` column.
        coicop_level: Name of the column holding the category to predict.
        test_size: Fraction of rows assigned to the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        A ``(train, test)`` tuple of ``Dataset`` objects.
    """
    # Local import: only `Dataset` is imported from `datasets` at file level.
    from datasets import ClassLabel

    train_dataframe, test_dataframe = train_test_split(
        dataframe,
        test_size=test_size,
        stratify=dataframe[coicop_level],
        random_state=random_state,
    )

    # One shared vocabulary for both splits, derived from all rows.
    class_names = sorted(dataframe[coicop_level].dropna().astype(str).unique())
    label_feature = ClassLabel(names=class_names)

    def _to_dataset(frame: pd.DataFrame) -> Dataset:
        # Integer code of each row's class within `class_names`; values that are
        # not in the vocabulary (e.g. missing labels) receive the code -1.
        codes = pd.Categorical(frame[coicop_level].astype(str), categories=class_names).codes
        # `assign` returns a new frame, so the slice produced by
        # `train_test_split` is never written to (no SettingWithCopyWarning).
        labelled = frame.assign(label=codes.astype("int64"))
        return Dataset.from_pandas(labelled).cast_column("label", label_feature)

    return _to_dataset(train_dataframe), _to_dataset(test_dataframe)

In [None]:
# Stratified train/test split on the configured label column.
train_df, test_df = split_data(
    dataframe=hf_labse_features,
    coicop_level=label_column,
)

In [None]:
from transformers import AutoTokenizer
from functools import partial

# Hugging Face model id; the same id is used for the tokenizer and the classifier.
model_name = "sentence-transformers/LaBSE"

# Tokenizer that matches the pretrained model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(data, text_column: str = "receipt_text", padding: str = "max_length", truncation=True):
    """Tokenize the texts found in ``data[text_column]``.

    Uses the module-level ``tokenizer``.

    Args:
        data: Mapping-like batch; ``data[text_column]`` is handed to the tokenizer.
        text_column: Key of the text field inside ``data``.
        padding: Padding strategy forwarded to the tokenizer.
        truncation: Truncation setting forwarded to the tokenizer.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    # Forward the caller's choices instead of a hard-coded padding value; the
    # truncation flag in particular used to be silently dropped, so over-long
    # texts were never truncated.
    return tokenizer(data[text_column], padding=padding, truncation=truncation)

# Bind the configured text column so the function can be passed to `Dataset.map`.
map_function = partial(tokenize_function, text_column=input_column)

# Tokenize both splits in batches (train first, then test).
train_df, test_df = (
    split.map(map_function, batched=True) for split in (train_df, test_df)
)

In [None]:
# Drop the raw text column now that it has been tokenized. The membership check
# keeps this cell re-runnable: removing a column that is already gone would raise.
if input_column in train_df.column_names:
    train_df = train_df.remove_columns([input_column])
if input_column in test_df.column_names:
    test_df = test_df.remove_columns([input_column])

In [None]:
# Display the training split to check its columns and row count.
train_df

In [None]:
# Display the test split to check its columns and row count.
test_df

In [None]:
# Count the distinct label values in the loaded data; used as the size of the
# classification head.
label_values = hf_labse_features.loc[:, label_column]
number_of_categories = label_values.nunique()
number_of_categories

In [None]:
from transformers import AutoModelForSequenceClassification

# Pretrained encoder with a classification head sized to the number of categories.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=number_of_categories,
)

In [None]:
from transformers import TrainingArguments


# Training configuration, built from the settings defined at the top of the notebook.
training_args = TrainingArguments(
    output_dir=output_directory,
    num_train_epochs=epochs,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy=evaluation_strategy,
)

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    """Turn model outputs into the configured evaluation metric.

    Uses the module-level ``metric`` object.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each row
            is the arg-max over the last axis of ``logits``.

    Returns:
        The result of ``metric.compute`` for the predictions and labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    extra_kwargs = {}
    # f1 / precision / recall default to binary averaging, which is rejected for
    # multiclass targets. Request macro averaging when there are more than two
    # classes; the binary case and other metrics (which may not accept an
    # `average` argument) are called exactly as before.
    averaged_metrics = ("f1", "precision", "recall")
    if getattr(metric, "name", None) in averaged_metrics and np.shape(logits)[-1] > 2:
        extra_kwargs["average"] = "macro"

    return metric.compute(predictions=predictions, references=labels, **extra_kwargs)

In [None]:

import evaluate

# Load the metric named in the configuration; `compute_metrics` reads this object.
metric = evaluate.load(path=evaluation_function)

In [None]:
from transformers import Trainer

# Wire the model, the training configuration, both splits and the metric
# callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_df,
    eval_dataset=test_df,
)

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()