In [3]:
!pip install datasets
!pip install scikit-learn

from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Load SQuAD 1.1 dataset
squad = load_dataset("squad")

# Tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading scipy-1.13.1-cp39-cp39-macosx_10_9_x86_64.whl (39.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.4/39.4 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Preprocessing function
def preprocess(batch):
    inputs = tokenizer(
        batch["question"],
        batch["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
    )
    
    start_positions = []
    end_positions = []
    
    for answers in batch["answers"]:  # Each "answers" is a dictionary
        # Extract start position and compute end position
        if len(answers["answer_start"]) > 0:
            start = answers["answer_start"][0]
            end = start + len(answers["text"][0]) - 1
        else:  # No valid answer in the current example
            start = 0
            end = 0
        
        start_positions.append(start)
        end_positions.append(end)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [10]:
# Apply preprocessing
train_data = squad["train"].map(preprocess, batched=True)
val_data = squad["validation"].map(preprocess, batched=True)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [11]:
# Dataloader
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16)


In [12]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [20]:

for epoch in range(3):  # Number of epochs
    model.train()
    for batch in train_loader:
        # Ensure all inputs are converted to tensors and moved to the device
        inputs = {key: torch.tensor(val, dtype=torch.long).to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
        labels = {key: torch.tensor(val, dtype=torch.long).to(device) for key, val in batch.items() if key in ["start_positions", "end_positions"]}

        # Zero out gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(**inputs, start_positions=labels["start_positions"], end_positions=labels["end_positions"])

        # Compute loss and backpropagate
        loss = outputs.loss
        loss.backward()
        optimizer.step()





TypeError: only integer tensors of a single element can be converted to an index

In [None]:
# Evaluation loop
model.eval()
exact_matches = []
for batch in val_loader:
    with torch.no_grad():
        inputs = {key: torch.tensor(val).to(device) for key, val in batch.items() if key in ["input_ids", "attention_mask"]}
        labels = batch["answers"]  # Extract ground-truth answers
        outputs = model(**inputs)
        
        # Get predicted start and end logits
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        
        # Get the predicted start and end positions
        start_positions = torch.argmax(start_logits, dim=1).tolist()
        end_positions = torch.argmax(end_logits, dim=1).tolist()
        
        # Convert predictions to text spans
        input_ids = inputs["input_ids"].tolist()
        predicted_answers = [
            tokenizer.decode(input_id[start:end + 1], skip_special_tokens=True)
            for input_id, start, end in zip(input_ids, start_positions, end_positions)
        ]
        
        # Normalize predictions and ground truths
        for prediction, ground_truths in zip(predicted_answers, labels):
            ground_truth_texts = [gt_text.strip() for gt_text in ground_truths["text"]]
            exact_matches.append(
                any(normalize_text(prediction) == normalize_text(gt) for gt in ground_truth_texts)
            )

# Calculate and print EM score
em_score = sum(exact_matches) / len(exact_matches) * 100
print(f"Exact Match (EM) Score: {em_score:.2f}%")
