In [None]:
# This script seeks a better alternative for the current labels used in the FuisionBody.label_embedding_model_body. 
# For this purpose, it evaluattes an alternative embeddings of class descriptions, against the currently implemented default, that embeds label-descriptions.

from fusionsent import FusionSentModel, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import numpy as np
import openai #Please note that openai is not listed in our requirements.txt file. Run $'pip install openai', to install the package.
import torch
import json
import os

In [2]:
# Setting environment variables
cwd = os.path.abspath(os.getcwd())
os.environ['WORLD_SIZE'] = str(torch.cuda.device_count())
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'

# Load and Prepare All Datasets

*1. Download original data.*

In [None]:
# The below are the exact datasets used for training in the original setfit paper.
# If not existent already, we will load them all, and store them locally in order to add label descriptions.
dataset_ids_binary_label: list[str] = ["CR", "emotion", "enron_spam"]
dataset_ids_nonbinary_label: list[str] = ["sst5", "amazon_counterfactual",  "emotion", "ag_news"]
dataset_ids = dataset_ids_binary_label + dataset_ids_nonbinary_label
data_dir_original = "./data/original"
datasets_original = {} 

for dataset_id in dataset_ids:
    print(f"Loading dataset: '{dataset_id}'")
    datasets_original[dataset_id] = {}
    for split in ["train", "test"]:
        try:
            dataset_split = load_dataset(f"SetFit/{dataset_id}", split=split)
            datasets_original[dataset_id][split] = dataset_split
        except ValueError as e:
            print(f"Could not load dataset '{dataset_id}'. An error occurred: {e}")
            datasets_original.pop(dataset_id)
            break
print("-- Done --")

*2. Generate label descriptions via OpenAI and save them to files.*

In [None]:
# ToDo: Fix generation for datasets 'enron_spam', and 'ag_news'.
data_dir_label_descriptions = "./data/label_descriptions"
label_description_file_template = "{}_label_descriptions.json"
os.makedirs(data_dir_label_descriptions, exist_ok=True)

openai_api_key = "your-openai-key"
open_ai_model ="gpt-4-0125-preview"
regenerate = False

def get_label_description(dataset_name: str, label: str, label_text: str, examples: list[str]) -> str:
    try:
        client = openai.OpenAI(api_key=openai_api_key)
        completion = client.chat.completions.create(
            model=open_ai_model,
            messages= [
                {
                    "role": "system", 
                    "content": """
                        You are a scientific research assistant, in the area of Natrual Language Processing.
                        Your purpose is to write comprhesnive, concise, and short descriptions for a given label of a dataset.
                        For each label, you will be provided some examples of data samples that are annoted with the resp. label.
                        Rules:
                        1. Be consise in your descriptions.
                        2. Each decitpion should be exactly one sentence long.
                        Not complying with the rules will result in termination. 
                        """
                },
                {
                    "role": "user",
                    "content": f"""
                        Dataset name: '{dataset_name}'\n
                        Label key: '{label}'\n
                        Label name: '{label_text}'\n
                        ---\n\n
                        Example Samples annotated with '{label_text}':\n\n
                        {examples}\n\n
                        ---\n\n
                        Please describe the essence of the label '{label}': '{label_text}' in one sentence:
                    """
                }
            ]
        )
        if completion.choices and completion.choices[0].message and completion.choices[0].message.content:
            response = completion.choices[0].message.content
            print(f"Obtained description for {dataset_id}/{label_text}: {response}")
            return response
        else:
            raise Exception("Invalid response from OpenAI: No content in the response.")
    except Exception as e:
        raise Exception(f"Unexpected error with the response from OpenAI: {str(e)}")

for dataset_id in dataset_ids:
    description_file_path = os.path.join(data_dir_label_descriptions, label_description_file_template.format(dataset_id))
    if (not regenerate) and os.path.exists(description_file_path):
        print(f"Skipped label generation for '{dataset_id}' dataset (File already exists).")
        continue
    # Samples from SetFit/enron_spam are too large.
    if dataset_id == "enron_spam":
        continue

    # Process the dataset to get label-to-data mapping
    label_to_data = {}
    label_to_label_text = {}
    for item in datasets_original[dataset_id]["train"]:
        label = item['label']
        text = item['text']
        if label not in label_to_data:
            label_to_data[label] = []
        if label not in label_to_label_text:
            label_to_label_text[label] = item["label_text"]
        label_to_data[label].append(text)

    # Sample the 5 examples or less (because of open ai token rate limits) per label and generate label descriptions
    label_to_description = {}
    hasEncounteredError = False
    for label, examples in label_to_data.items():
        sampled_examples: list[str] = np.random.choice(examples, size=5, replace=False).tolist()
        while sum([len(t) for t in sampled_examples]) > 100:
            sampled_examples = sampled_examples[:-1]
        #print(sum([len(t) for t in sampled_examples]))
        try:
            description = get_label_description(dataset_id, label, label_to_label_text[label], examples)
        except Exception as e:
            hasEncounteredError=True
            break
        label_to_description[label] = description

    if hasEncounteredError:
        print(f"An error occurred during label description generation for datatset '{dataset_id}'. Skipping...")
        continue

    # Save the label-to-description mappings
    with open(description_file_path, 'w') as f:
        json.dump(label_to_description, f, indent=2, ensure_ascii=False)
    
    print(f"Saved label descriptions for '{dataset_id}' dataset.")

*3. Format the datasets in order to pass them into the DualSen model*

In [25]:
formatted_datasets = {}
def format_dataset(original_dataset, label_to_description=None) -> Dataset:
    """
    Creates a Dataset object with label encoding and optional label descriptions.
    """
    input_texts = [d['text'] for d in original_dataset]
    raw_labels = [d['label'] for d in original_dataset]

    # Check if labels are binary (single value) or multi-class (list of labels)
    if all(raw_label in [0,1] and not isinstance(raw_label, list) for raw_label in raw_labels):
        # Binary case
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(raw_labels)
    else:
        # Multi-class case
        label_encoder = MultiLabelBinarizer()
        labels = label_encoder.fit_transform([raw_label] for raw_label in raw_labels)

    # Either select label text or label description for the 'label_description' text
    if label_to_description is None:
        label_descriptions = [[d['label_text']] for d in original_dataset]
    else:
        label_descriptions = [[label_to_description[str(d['label'])]] for d in original_dataset]

    # Limit to 250 elements for testing.
    # TODO: Deal with error in setfit.
    #   Error occurrs in setfit.sampler, line 29: 'idxs = np.stack(np.triu_indices(n, k), axis=-1)'
    #   with n being the sample size, k=1 if sampled with replacedmed, 0 otherwise.
    #   Reason: Out-of memory. Latest numpy+setfit versions do not fix this.
    input_texts = input_texts[:250]
    labels = labels[:250]
    label_descriptions = label_descriptions[:250]
    print("Warning: Limiting dataset size to 250 elements for testing!")
    print(labels[0])
    print(label_descriptions[0])
    return Dataset.from_dict({
        "text": input_texts,
        "label": labels,
        "label_description": label_descriptions
    })

for dataset_id in dataset_ids:
    # Load label descriptions
    description_file_path = os.path.join(data_dir_label_descriptions, label_description_file_template.format(dataset_id))
    try:
        with open(description_file_path, 'r') as f:
            label_to_description = json.load(f)
    except FileNotFoundError:
        print(f"Skipping formatting dataset '{dataset_id}': Description file not found.")
    
    # Format train and validation datasets, one with the descriptions in "label_description", and one with the label texts instead.
    try:    
        formatted_datasets[dataset_id] = {}
        formatted_datasets[dataset_id]["label_description"] = {
            "train": format_dataset(datasets_original[dataset_id]["train"], label_to_description),
            "test": format_dataset(datasets_original[dataset_id]["test"], label_to_description)
        }
        formatted_datasets[dataset_id]["label_text"] = {
            "train": format_dataset(datasets_original[dataset_id]["train"]),
            "test": format_dataset(datasets_original[dataset_id]["test"])
        }
        print(f"Sucessfully formatted dataset '{dataset_id}'.")

    except KeyError as e:
        print(f"Skipping formatting dataset '{dataset_id}': Key 'train' and/or 'test' not found.")
        formatted_datasets.pop(dataset_id)

1
["The label '1', denoted as 'positive', applies to data samples expressing favorable, satisfactory, or beneficial opinions, experiences, or outcomes."]
1
["The label '1', denoted as 'positive', applies to data samples expressing favorable, satisfactory, or beneficial opinions, experiences, or outcomes."]
1
['positive']
1
['positive']
Sucessfully formatted dataset 'CR'.
[1 0 0 0 0 0]
["The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed."]
[1 0 0 0 0 0]
["The essence of the label '0': 'sadness' is characterized by feelings of hopelessness, disappointment, melancholy, and vulnerability, often accompanied by a sense of isolation or being overwhelmed."]
[1 0 0 0 0 0]
['sadness']
[1 0 0 0 0 0]
['sadness']
Sucessfully formatted dataset 'emotion'.
Skipping formatting dataset 'enron_spam': Description file not found.
1
["The label 'joy' encompasses e

# Train & Evaluate FusionSent Model 

*1. Set up the model, tokenizer, and training arguments.*

In [23]:
model_id = "malteos/scincl"
tokenizer = AutoTokenizer.from_pretrained(model_id)
training_args = TrainingArguments(
        batch_sizes=(10,15),
        num_epochs=(1,3),
        sampling_strategies="undersampling",
        use_setfit_body=False #In this experiment, we only want to evaluate different lavel_embedding submodels, so we dont need the 'setfit' body.
    )

def getFreshModel()->FusionSentModel:
    return FusionSentModel.from_pretrained(pretrained_model_name_or_path=model_id, multi_target_strategy="one-vs-rest")

*2. Train and evaluate one dataset after another.*

*Please choose an appropriate subset of all the datasets in `target_datasets`.*

In [None]:
target_datatsets = dataset_ids[:1] #Select applicable datasets (only first for testing)

for datatset_id in target_datatsets:
    for dataset_key, dataset in formatted_datasets[dataset_id].items():
        # Define Trainer and start training
        trainer = Trainer(
            model=getFreshModel(),
            args=training_args,
            train_dataset=dataset["train"],
            eval_dataset=dataset["test"],
            eval_metrics={
                'metric_names': ['f1', 'precision', 'recall', 'accuracy'],
                'metric_args': {'average': 'micro'}
            }
        )
        print(f"Training FusionSent on dataset '{dataset_id}', with {dataset_key}.")
        trainer.train()
        # Evaluate the current model
        eval_scores = trainer.evaluate(
            x_eval=[item['text'] for item in dataset["test"]],
            y_eval=[item['label'] for item in dataset["test"]]
        )
        print(f"Evaluation results for '{dataset_id}' with {dataset_key}: {eval_scores}")