In [9]:
from transformers import BertForQuestionAnswering, BertTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import torch
from torch.utils.data import Dataset

# Load the pre-trained model and tokenizer
model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm





Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Custom Dataset Class

# Load the dataset
df = pd.read_csv("squad_QA_data.csv")  # Ensure this file has 'context', 'question', 'answers'


class QADataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data.iloc[idx]

        inputs = self.tokenizer(
            example["question"],
            example["context"],
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )

        # Ensure answers are correctly structured
        answer_start = example["answer"][0]  # Access the first answer
        answer_text = example["answer"][0]  # Access the answer text
        start_positions = torch.tensor(answer_start)
        end_positions = torch.tensor(answer_start + len(answer_text))

        return {
            "input_ids": inputs["input_ids"].squeeze(),
            "attention_mask": inputs["attention_mask"].squeeze(),
            "start_positions": start_positions,
            "end_positions": end_positions,
        }



In [29]:
# Create datasets
train_dataset = QADataset(df, tokenizer)

# Use DataCollatorWithPadding to handle dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",  # Disable evaluation
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

trainer.train()




TypeError: new(): invalid data type 'str'