In [107]:
from transformers import AutoTokenizer, BertForQuestionAnswering
from datasets import load_dataset

# Load the dataset
raw_datasets = load_dataset("pubmed_qa", "pqa_labeled")

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})

In [108]:
# Load the pre-trained model and tokenizer
model_name = "adsabs/astroBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at adsabs/astroBERT and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [110]:
def tokenize_function(examples):
    return tokenizer(examples["question"],  examples["final_decision"], padding="max_length", truncation=True, max_length=206)

In [111]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [112]:
tokenized_datasets = tokenized_datasets.rename_column("question", "text")
# tokenized_datasets = tokenized_datasets.rename_column("final_decision", "labels")

In [113]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) 
full_train_dataset = tokenized_datasets["train"]

In [114]:
full_train_dataset[10]

{'pubid': 25432938,
 'text': "Did Chile's traffic law reform push police enforcement?",
 'context': {'contexts': ["The objective of the current study is to determine to what extent the reduction of Chile's traffic fatalities and injuries during 2000-2012 was related to the police traffic enforcement increment registered after the introduction of its 2005 traffic law reform.",
   "A unique dataset with assembled information from public institutions and analyses based on ordinary least square and robust random effects models was carried out. Dependent variables were traffic fatality and severe injury rates per population and vehicle fleet. Independent variables were: (1) presence of new national traffic law; (2) police officers per population; (3) number of traffic tickets per police officer; and (4) interaction effect of number of traffic tickets per police officer with traffic law reform. Oil prices, alcohol consumption, proportion of male population 15-24 years old, unemployment, road

In [115]:
full_train_dataset = full_train_dataset.map(
    lambda example: {"label": 1 if example["final_decision"] == "yes" else 0},
    remove_columns=["context"],
)

In [116]:
print(full_train_dataset.column_names)
print(full_train_dataset.features["input_ids"])

['pubid', 'text', 'long_answer', 'final_decision', 'input_ids', 'token_type_ids', 'attention_mask', 'label']
Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None)


In [117]:
from transformers import Trainer, TrainingArguments
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qa_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=500,
    save_total_limit=2,
)

# training_args = TrainingArguments("test_trainer")

In [119]:

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("./qa_finetuned")
tokenizer.save_pretrained("./qa_finetuned")

  0%|          | 0/375 [01:19<?, ?it/s]
  0%|          | 0/375 [01:12<?, ?it/s]
  0%|          | 0/375 [00:00<?, ?it/s]

TypeError: BertForQuestionAnswering.forward() got an unexpected keyword argument 'labels'