# Load the datasets

In [None]:
from datasets import load_dataset

# Load the five balanced datasets (numbered 1-5 on disk). Each one is read
# from its own "train", "validation" and "test" Arrow files.
datasets = [
  load_dataset(
    "arrow",
    data_files={
      "train": f"datasets/dataset_balanced{i + 1}.hf/train/data-00000-of-00001.arrow",
      "validation": f"datasets/dataset_balanced{i + 1}.hf/validation/data-00000-of-00001.arrow",
      "test": f"datasets/dataset_balanced{i + 1}.hf/test/data-00000-of-00001.arrow",
    },
  )
  for i in range(5)
]

# Prepare the datasets

In [None]:
def get_text_and_labels(x: dict) -> dict:
  """Build the model input text and integer label for one example.

  The text is the title, a newline, and then the items of ``x["abstract"]``
  joined by newlines. Assumes ``x["abstract"]`` is a sequence of strings
  (e.g. one entry per sentence) -- TODO confirm against the dataset schema.

  Args:
    x: Example providing the "title", "abstract" and "is_selected" fields.

  Returns:
    A dict with "text" (the combined string) and "label"
    (``int(x["is_selected"])``).
  """
  heading = x["title"] + "\n"
  body = "\n".join(x["abstract"])
  label = int(x["is_selected"])
  return {"text": heading + body, "label": label}

# Map get_text_and_labels over each loaded dataset, replacing the list in place.
datasets = [ds.map(get_text_and_labels) for ds in datasets]

# Notebook display: show the first processed dataset as the cell output.
datasets[0]

# Train the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Pretrained checkpoint name; the same value is used to load the tokenizer
# here and the sequence-classification model later in this cell.
checkpoint = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_text(x):
  """Tokenize the "text" field of ``x`` with truncation enabled.

  Args:
    x: Mapping with a "text" entry to pass to the module-level tokenizer.

  Returns:
    Whatever the tokenizer call returns for that text.
  """
  text = x["text"]
  return tokenizer(text, truncation=True)

# Only the first of the loaded datasets is tokenized and used for training here.
dataset = datasets[0].map(tokenize_text, batched=True)

# "test-trainer" is the output directory; all other training options are defaults.
training_args = TrainingArguments(output_dir="test-trainer")

# Two output labels -- presumably the 0/1 "label" derived from "is_selected".
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# NOTE(review): no evaluation strategy is set on training_args, so the
# validation split may not actually be evaluated during training -- confirm.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=dataset["train"],
  eval_dataset=dataset["validation"],
  processing_class=tokenizer,
)

trainer.train()
