In [12]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW
from tqdm.notebook import tqdm
import pandas as pd

# Load the pre-trained BERT tokenizer (using the Fast version).
# The Fast variant is needed because SQuADDataset calls the tokenizer with
# `return_offsets_mapping=True` to map character offsets to token indices.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Example Dataset for SQuAD-style extractive question answering
class SQuADDataset(torch.utils.data.Dataset):
    """Encode (question, context) pairs and label the answer span in token space.

    Each row of ``dataframe`` must provide the columns ``question``,
    ``context``, ``answer_text`` and ``answer_start`` (character offset of the
    answer inside ``context``).

    Args:
        dataframe: pandas DataFrame holding the examples.
        tokenizer: a fast tokenizer whose ``encode_plus`` supports
            ``return_offsets_mapping=True`` and returns ``token_type_ids``.
        max_length: length every example is padded/truncated to (default 512).
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tensors for example ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``token_type_ids``
            (each of length ``max_length``) and scalar ``start_positions`` /
            ``end_positions`` token indices. When the answer cannot be located
            among the context tokens (for example because it was truncated
            away), both positions are set to 0, the first token of the
            sequence, instead of raising.
        """
        # Fetch the row once instead of once per column.
        row = self.data.iloc[idx]
        question = row['question']
        context = row['context']
        answer_start = int(row['answer_start'])
        answer_end = answer_start + len(row['answer_text'])  # exclusive char offset

        # Use the fast tokenizer to encode the input with offset mapping
        encoded_input = self.tokenizer.encode_plus(
            question,
            context,
            return_offsets_mapping=True,  # Only available in the Fast tokenizer
            padding='max_length',
            truncation=True,
            max_length=self.max_length
        )

        offset_mapping = encoded_input['offset_mapping']
        input_ids = encoded_input['input_ids']
        attention_mask = encoded_input['attention_mask']
        token_type_ids = encoded_input['token_type_ids']

        # Convert character positions to token positions.
        # Offsets are relative to the segment a token came from, so question
        # tokens cover the same character range as context tokens. Only tokens
        # of the second segment (token_type_id == 1, i.e. the context) may be
        # matched, otherwise a question token could be labelled as the answer.
        start_positions = None
        end_positions = None
        for i, ((start, end), segment) in enumerate(zip(offset_mapping, token_type_ids)):
            if segment != 1:
                continue
            if start <= answer_start < end:
                start_positions = i
            if start < answer_end <= end:
                end_positions = i

        # Answer not (fully) present in the encoded context: fall back to the
        # first token for both labels rather than crashing on tensor(None).
        if start_positions is None or end_positions is None:
            start_positions = 0
            end_positions = 0

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'start_positions': torch.tensor(start_positions, dtype=torch.long),
            'end_positions': torch.tensor(end_positions, dtype=torch.long)
        }

# Load your data into a pandas DataFrame
train_data = pd.DataFrame({
    'question': ["What is the capital of France?", "Who wrote Hamlet?"],
    'context': ["Paris is the capital of France.", "Shakespeare wrote Hamlet."],
    'answer_text': ["Paris", "Shakespeare"],
    'answer_start': [0, 0]
})

# Create dataset and dataloader
train_dataset = SQuADDataset(train_data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=2)

# Load pre-trained BERT model for Question Answering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

# Define optimizer.
# transformers.AdamW is deprecated in favour of the PyTorch implementation;
# eps and weight_decay are set explicitly to the defaults of the deprecated
# class so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Training loop
num_epochs = 3
model.train()

for epoch in range(num_epochs):
    train_loss = 0.0
    train_steps = 0

    # Training loop with tqdm progress bar
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Keep every tensor on the same device as the model (a no-op while the
        # model lives on the CPU). The batch keys match the model's keyword
        # arguments: input_ids, attention_mask, token_type_ids,
        # start_positions, end_positions.
        batch = {name: tensor.to(model.device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)

        # Passing start/end positions makes the model return the span loss.
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_steps += 1

    # Guard against an empty dataloader so the epoch summary never divides by zero.
    avg_train_loss = train_loss / max(train_steps, 1)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_train_loss:.4f}")

print("Training complete.")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/3 - Loss: 6.0894


Epoch 2/3:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 2/3 - Loss: 5.0559


Epoch 3/3:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 3/3 - Loss: 4.2274
Training complete.


In [13]:
# Save the trained model and the tokenizer to local directories.
model.save_pretrained('./saved_model')
# Last expression of the cell: the notebook displays its return value, which
# here is the tuple of tokenizer file paths that were written.
tokenizer.save_pretrained('./saved_tokenizer')

('./saved_tokenizer/tokenizer_config.json',
 './saved_tokenizer/special_tokens_map.json',
 './saved_tokenizer/vocab.txt',
 './saved_tokenizer/added_tokens.json',
 './saved_tokenizer/tokenizer.json')

In [14]:
def predict(question, context, model, tokenizer):
    """Extract the answer span for ``question`` from ``context``.

    The pair is encoded (truncated to 512 tokens), run through ``model`` in
    inference mode, and the tokens between the highest-scoring start and end
    positions are decoded back to text.

    Args:
        question (str): the question text.
        context (str): the passage expected to contain the answer.
        model: question-answering model exposing ``device``, ``training``,
            ``eval()``/``train()`` and returning ``start_logits``/``end_logits``.
        tokenizer: tokenizer providing ``encode_plus``,
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.

    Returns:
        str: the decoded answer; an empty string when the predicted end
        precedes the predicted start.
    """
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512, truncation=True)
    # Forward everything the tokenizer produced (including token_type_ids,
    # which separate question from context) on the model's own device.
    device = model.device
    model_inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference must run without dropout and without building a graph; the
    # caller's train/eval mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**model_inputs)
    finally:
        model.train(was_training)

    # Get the most likely start and end tokens (end is made exclusive)
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits)) + 1
    if end_index <= start_index:
        return ""

    answer_ids = model_inputs['input_ids'][0][start_index:end_index].tolist()
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(answer_ids)
    )
    return answer

# Sanity check of predict() on a single question/context pair.
# NOTE: this exact pair is also the first row of train_data, so a correct
# answer here only shows the model fit that example, not that it generalises.
question = "What is the capital of France?"
context = "Paris is the capital of France."
answer = predict(question, context, model, tokenizer)
print(f"Predicted answer: {answer}")


Predicted answer: paris


In [15]:
def token_iou(pred_tokens, true_tokens):
    """
    Token-level Intersection over Union (IoU) of a predicted and a reference span.

    Both inputs are reduced to sets first, so repeated tokens and token order
    do not influence the score.

    Args:
        pred_tokens (list): Tokens of the predicted answer.
        true_tokens (list): Tokens of the reference answer.

    Returns:
        float: |pred ∩ true| / |pred ∪ true|, or 0.0 when both inputs are empty.
    """
    predicted, expected = set(pred_tokens), set(true_tokens)
    combined = predicted | expected
    if not combined:
        # Nothing to compare: defined as zero overlap.
        return 0.0
    shared = predicted & expected
    return len(shared) / len(combined)


In [16]:
def _as_batch_tensor(value, device):
    """Turn a batch field (tensor, list of tensors, or plain list) into one tensor on ``device``."""
    if not torch.is_tensor(value):
        if isinstance(value, (list, tuple)) and len(value) > 0 and torch.is_tensor(value[0]):
            value = torch.stack(list(value))
        else:
            value = torch.as_tensor(value)
    return value.to(device)


def evaluate_model(model, dataloader, tokenizer):
    """Compute the mean token-level IoU between predicted and reference answer spans.

    Args:
        model: question-answering model exposing ``device`` and returning
            ``start_logits``/``end_logits``. It is switched to eval mode.
        dataloader: iterable of batches (dicts) with ``input_ids``,
            ``attention_mask``, ``start_positions``, ``end_positions`` and,
            optionally, ``token_type_ids``. Fields may be tensors (as produced
            by the default DataLoader collation) or lists of tensors.
        tokenizer: tokenizer providing ``convert_ids_to_tokens``.

    Returns:
        float: IoU averaged over all evaluated examples; 0.0 when the
        dataloader yields no examples.
    """
    model.eval()
    device = model.device
    total_iou = 0.0
    num_examples = 0

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids = _as_batch_tensor(batch['input_ids'], device)
        attention_mask = _as_batch_tensor(batch['attention_mask'], device)
        # Stacked (1, seq) items yield (batch, 1, seq); drop the extra axis.
        if input_ids.dim() == 3:
            input_ids = input_ids.squeeze(1)
        if attention_mask.dim() == 3:
            attention_mask = attention_mask.squeeze(1)
        start_positions = _as_batch_tensor(batch['start_positions'], device).tolist()
        end_positions = _as_batch_tensor(batch['end_positions'], device).tolist()

        model_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        # Segment ids are forwarded whenever the batch provides them, so
        # evaluation sees the same inputs as training.
        if 'token_type_ids' in batch:
            token_type_ids = _as_batch_tensor(batch['token_type_ids'], device)
            if token_type_ids.dim() == 3:
                token_type_ids = token_type_ids.squeeze(1)
            model_inputs['token_type_ids'] = token_type_ids

        with torch.no_grad():
            outputs = model(**model_inputs)

        # Get the most likely start and end tokens
        start_preds = torch.argmax(outputs.start_logits, dim=1).tolist()
        end_preds = torch.argmax(outputs.end_logits, dim=1).tolist()

        for i in range(input_ids.size(0)):
            pred_ids = input_ids[i, start_preds[i]:end_preds[i] + 1].tolist()
            true_ids = input_ids[i, start_positions[i]:end_positions[i] + 1].tolist()
            pred_tokens = tokenizer.convert_ids_to_tokens(pred_ids)
            true_tokens = tokenizer.convert_ids_to_tokens(true_ids)

            total_iou += token_iou(pred_tokens, true_tokens)
            num_examples += 1

    # IoU is accumulated per example, so it is averaged per example
    # (not per batch).
    if num_examples == 0:
        return 0.0
    return total_iou / num_examples