# Set the base model checkpoint

The Microsoft BiomedBERT model is well suited for this task, as it is a masked language model trained on PubMed data.

https://huggingface.co/microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract

In [1]:
# Hugging Face Hub identifier of the base checkpoint; the tokenizer and the
# classification model are both loaded from this name.
checkpoint = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"

# Load the train and eval splits

In [None]:
from datasets import load_dataset

def get_text_and_labels(x: dict) -> dict:
    """Build the classifier input text and integer label for one record.

    The text is the record's ``title``, a newline, and then the entries of
    ``abstract`` joined by newlines. The label is ``is_selected`` converted
    to ``int``.
    """
    # The title always gets its own line, even when the abstract is empty.
    title_line = x["title"] + "\n"
    abstract_body = "\n".join(x["abstract"])
    text = title_line + abstract_body
    label = int(x["is_selected"])
    return {"text": text, "label": label}

# Load the pre-split Arrow files from disk; each key of data_files becomes a
# split of the resulting dataset.
dataset = load_dataset(
    "arrow",
    data_files={
        # Plain string literals: there is nothing to interpolate, so no f-prefix.
        "train": "datasets/abstracts.hf/train/data-00000-of-00001.arrow",
        "eval": "datasets/abstracts.hf/eval/data-00000-of-00001.arrow",
    },
)

# Add the "text" and "label" fields returned by get_text_and_labels to each record.
dataset = dataset.map(get_text_and_labels)

# Show the dataset summary as the cell output.
dataset

# Tokenize the dataset

In [None]:
from transformers import AutoTokenizer

# The max length must be set explicitly here because it is not defined in the checkpoint config.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)


def tokenize_text(x: dict):
    """Tokenize the ``text`` field of ``x`` with truncation enabled."""
    return tokenizer(x["text"], truncation=True)


# batched=True hands tokenize_text whole batches of rows rather than single rows.
dataset = dataset.map(tokenize_text, batched=True)

# Show the dataset summary as the cell output.
dataset

# Train the model

The model is trained for a single epoch because the training split already includes all of the negative examples along with repeated positive examples. Evaluation is performed on the evaluation split every 1,000 steps.

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Sequence-classification model with two output labels, loaded from the base checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# One epoch, batches of 16 per device, evaluation every 1,000 steps.
training_args = TrainingArguments(
    output_dir="training-logs",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="steps",
    eval_steps=1000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["eval"],
    processing_class=tokenizer,
)

trainer.train()

# Write the trained model to the "trained-model" directory.
trainer.save_model("trained-model")