# Load the test split

In [None]:
from datasets import load_dataset

def get_text_and_labels(x: dict) -> dict:
    """Build the classifier input text and integer label for one example.

    The text is the title, a newline, and then the items of ``x["abstract"]``
    joined by newlines. The label is ``x["is_selected"]`` converted to ``int``.

    Args:
        x: A single example with ``"title"``, ``"abstract"`` and
            ``"is_selected"`` keys. The abstract is presumably a list of
            strings (one per sentence or paragraph) -- TODO confirm against
            the dataset schema.

    Returns:
        A dict with the keys ``"text"`` and ``"label"``.
    """
    heading = x["title"]
    body = "\n".join(x["abstract"])
    return {
        "text": heading + "\n" + body,
        "label": int(x["is_selected"]),
    }

# Read the Arrow shard directly and register it under the "test" split name.
test_shard = "../datasets/abstracts.hf/test/data-00000-of-00001.arrow"
dataset = load_dataset("arrow", data_files={"test": test_shard})

# Run get_text_and_labels over every example; its "text" and "label" outputs
# become columns of the mapped dataset.
dataset = dataset.map(get_text_and_labels)

# Bare expression so the notebook cell displays the resulting dataset summary.
dataset

# Evaluate the model

The model is evaluated on the test split. The following metrics are computed: accuracy, precision, recall, and F1-score.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from evaluate import evaluator, combine

# The model and its tokenizer are loaded from the same local directory.
checkpoint_dir = "../trained-model"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, model_max_length=512)

# Bundle the four classification metrics so a single compute() call reports all of them.
metrics = combine(["accuracy", "precision", "recall", "f1"])

# label_mapping translates the label strings emitted by the model
# ("LABEL_0" / "LABEL_1") into the integer labels used by the dataset.
task_evaluator = evaluator("text-classification")
eval_result = task_evaluator.compute(
    model_or_pipeline=model,
    tokenizer=tokenizer,
    data=dataset["test"],
    metric=metrics,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
)

print(eval_result)