In [1]:
import pandas as pd
import os 

In [2]:
# Root directory of the SSI data; the extracted features live in a sub-directory of it.
# NOTE(review): hard-coded absolute path — confirm it exists on the machine running this notebook.
data_directory = "/netappdata/ssi_tdjg/data/ssi/"
features_directory = os.path.join(data_directory, "feature_extraction")

In [3]:
# Full path of the parquet file that is read into `hf_labse_features`.
hf_labse_features_filename = os.path.join(features_directory, "ssi_hf_labse_unique_values.parquet")

In [4]:
# Name of the column used as the classification target in the rest of the notebook.
coicop_level = "coicop_level_1"

In [5]:
# Only the receipt text and the target column are kept for the rest of the
# notebook, so ask the parquet reader for just those two columns instead of
# loading every column and narrowing the frame afterwards. This also binds
# `hf_labse_features` exactly once, which keeps the cell idempotent.
hf_labse_features = pd.read_parquet(
    hf_labse_features_filename,
    engine="pyarrow",
    columns=["receipt_text", coicop_level],
)
hf_labse_features.head()

Unnamed: 0,receipt_text,coicop_level_1
0,tilda rice,1
1,lassie built,1
2,vliesrijst,1
3,lassie rijst,1
5,pablos quino,1


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

def train_classifier(train_dataframe: pd.DataFrame, clf = None, coicop_level: str = "coicop_level_1"):
    """Fit a classifier on the ``features`` column of ``train_dataframe``.

    Args:
        train_dataframe: Frame with a ``features`` column (one feature vector
            per row) and a target column named by ``coicop_level``.
        clf: Estimator exposing ``fit(X, y)``. When ``None`` a new
            ``RandomForestClassifier()`` is created for this call.
        coicop_level: Name of the target column.

    Returns:
        The fitted estimator (the object that was passed in, when one was given).
    """
    if clf is None:
        # Build the default per call: a default instantiated in the signature
        # is created once at definition time and then shared (and re-fitted)
        # by every call that relies on it.
        clf = RandomForestClassifier()
    features = train_dataframe["features"].values.tolist()
    targets = train_dataframe[coicop_level].values.tolist()
    clf.fit(features, targets)
    return clf

def predict(clf, test_dataframe: pd.DataFrame, coicop_level: str = "coicop_level_1") -> pd.DataFrame:
    """Evaluate a fitted classifier on ``test_dataframe``.

    The classifier predicts from the ``features`` column; the predictions are
    compared against the ``coicop_level`` column with ``classification_report``.

    Returns:
        The classification report as a DataFrame, transposed so that each
        entry of the report is a row.
    """
    feature_rows = test_dataframe.features.values.tolist()
    predicted = clf.predict(feature_rows)
    expected = test_dataframe[coicop_level].values.tolist()
    report = classification_report(expected, predicted, output_dict=True)
    return pd.DataFrame(report).T

def train_and_predict(dataframe: pd.DataFrame, clf = None, coicop_level: str = "coicop_level_1", test_size: float = 0.2, random_state: int = 42):
    """Split ``dataframe``, fit a classifier on the train part and score it on the test part.

    Args:
        dataframe: Frame with a ``features`` column and the target column.
        clf: Estimator to fit. When ``None`` a new ``RandomForestClassifier()``
            is created for this call.
        coicop_level: Name of the target column; also used to stratify the split.
        test_size: Share of the rows held out for testing.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        The classification report DataFrame produced by ``predict``.
    """
    if clf is None:
        # Created here rather than in the signature so that calls do not share
        # (and re-fit) a single estimator instance.
        clf = RandomForestClassifier()
    train_dataframe, test_dataframe = train_test_split(
        dataframe,
        test_size=test_size,  # previously hard-coded to 0.2, which ignored the argument
        stratify=dataframe[coicop_level],
        random_state=random_state,
    )
    clf = train_classifier(train_dataframe, clf, coicop_level)
    return predict(clf, test_dataframe, coicop_level)

In [7]:
# From: https://huggingface.co/docs/transformers/training

from typing import Tuple
from datasets import Dataset


def split_data(dataframe: pd.DataFrame, coicop_level: str = "coicop_level_1", test_size: float = 0.2, random_state: int = 42) -> Tuple[Dataset, Dataset]:
    """Split ``dataframe`` into train and test ``Dataset`` objects.

    The split is stratified on the ``coicop_level`` column. In each part the
    target is copied into a ``label`` column, the frame is converted with
    ``Dataset.from_pandas`` and ``label`` is passed through
    ``class_encode_column``.

    NOTE(review): the two parts are class-encoded independently; presumably
    the resulting label ids only agree when both parts contain the same set
    of classes — confirm.

    Returns:
        ``(train, test)`` as a tuple of ``Dataset`` objects.
    """
    def _to_encoded_dataset(frame: pd.DataFrame) -> Dataset:
        # Copy the target into "label", then convert and class-encode it.
        frame["label"] = frame[coicop_level]
        return Dataset.from_pandas(frame).class_encode_column("label")

    train_part, test_part = train_test_split(
        dataframe,
        test_size=test_size,
        stratify=dataframe[coicop_level],
        random_state=random_state,
    )

    # Train is converted first, then test (same order as before).
    train_dataset = _to_encoded_dataset(train_part)
    test_dataset = _to_encoded_dataset(test_part)
    return train_dataset, test_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Stratified train/test split of the receipt texts. Note that, despite the
# `_df` suffix, `split_data` returns `Dataset` objects, not pandas DataFrames.
train_df, test_df = split_data(hf_labse_features, coicop_level=coicop_level)

Casting to class labels: 100% 219268/219268 [00:00<00:00, 454232.19 examples/s]
Casting to class labels: 100% 54818/54818 [00:00<00:00, 452738.72 examples/s]


In [9]:
from transformers import AutoTokenizer

# Checkpoint name; used for the tokenizer here and for the classification model loaded later.
model_name = "sentence-transformers/LaBSE"

# Tokenizer loaded from the same checkpoint; `tokenize_function` reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(data, text_column: str = "receipt_text", padding: str = "max_length", truncation=True):
    """Tokenize the texts stored under ``text_column`` with the module-level ``tokenizer``.

    Args:
        data: Mapping that holds the texts under ``text_column`` (for example
            a batch handed over by ``Dataset.map(..., batched=True)``).
        text_column: Key of the texts to tokenize.
        padding: Padding strategy forwarded to the tokenizer.
        truncation: Truncation setting forwarded to the tokenizer.

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    receipt_texts = data[text_column]
    # Forward the arguments instead of hard-coding them: previously `padding`
    # was fixed to "max_length" and `truncation` was never passed on, so both
    # parameters were silently ignored.
    return tokenizer(receipt_texts, padding=padding, truncation=truncation)

# Tokenize both splits in batches with `tokenize_function`.
# NOTE(review): this rebinds `train_df` / `test_df`; the cell relies on the
# "receipt_text" column, which a later cell removes — re-running it after that will fail.
train_df = train_df.map(tokenize_function, batched=True)
test_df = test_df.map(tokenize_function, batched=True)

Map: 100% 219268/219268 [00:48<00:00, 4524.75 examples/s]
Map: 100% 54818/54818 [00:12<00:00, 4561.00 examples/s]


In [10]:
# Drop the raw text column from both splits now that they have been tokenized.
train_df = train_df.remove_columns(["receipt_text"])
test_df = test_df.remove_columns(["receipt_text"])

In [11]:
# Show the columns and row count of the training split.
train_df

Dataset({
    features: ['coicop_level_1', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 219268
})

In [12]:
# Show the columns and row count of the test split.
test_df

Dataset({
    features: ['coicop_level_1', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 54818
})

In [13]:
# Number of distinct values in the target column; passed as `num_labels` when the model is loaded.
number_of_categories = hf_labse_features[coicop_level].nunique()
number_of_categories

9

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the `model_name` checkpoint for sequence classification with one label per target category.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=number_of_categories)

In [None]:
from transformers import TrainingArguments

# The same per-device batch size is used for training and evaluation.
batch_size = 64
# NOTE(review): newer transformers releases appear to have renamed
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="hf_output", 
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)

In [None]:
def compute_metrics(eval_pred, average: str = "macro"):
    """Compute the F1 score for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            arg-max of the logits over the last axis.
        average: Averaging mode forwarded to the metric. The metric's default
            is the binary mode, which cannot score the multi-class target of
            this notebook, so a multi-class mode is passed explicitly.

    Returns:
        The result of ``metric.compute`` for the predictions and labels.
    """
    # `np` and `metric` are notebook-level names; they must be defined before
    # the trainer first calls this function.
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average=average)

In [None]:
import numpy as np
import evaluate

# F1 metric object that `compute_metrics` calls through the global name `metric`.
metric = evaluate.load("f1")

In [None]:
from transformers import Trainer

# Bundle the model, the training arguments, both splits and the metric callback.
# The test split is also used as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
    compute_metrics=compute_metrics
)

In [None]:
# Start training with the configuration assembled in `trainer`.
trainer.train()