In [1]:
%pip install torch transformers datasets accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch

from transformers import (
    BertForQuestionAnswering,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator
)

In [3]:
from datasets import load_dataset
import numpy as np

In [4]:
# Load a small subset of SQuAD dataset
dataset = load_dataset("squad", split="train[:100]")  # Only 100 examples for quick training
eval_dataset = load_dataset("squad", split="validation[:20]")  # 20 examples for validation

In [5]:
# Initialize tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
# Tokenize function
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]
    
    # Tokenize inputs
    tokenized = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Get answer positions
    start_positions = []
    end_positions = []
    
    for i, offset in enumerate(tokenized.offset_mapping):
        answer = examples['answers'][i]
        start_char = answer['answer_start'][0]
        end_char = start_char + len(answer['text'][0])
        
        # Find start and end token positions
        start_token = None
        end_token = None
        
        for idx, (start, end) in enumerate(offset):
            if start <= start_char < end:
                start_token = idx
            if start <= end_char <= end:
                end_token = idx
                break
                
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length
            
        start_positions.append(start_token)
        end_positions.append(end_token)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    
    return tokenized

In [7]:
# Preprocess datasets
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset.column_names,
    batched=True
)

In [8]:
tokenized_eval_dataset = eval_dataset.map(
    preprocess_function,
    remove_columns=eval_dataset.column_names,
    batched=True
)

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert-qa-results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir='./logs',
)



In [10]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=default_data_collator,
)

In [11]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,4.617107
2,No log,4.495646
3,No log,4.510042


TrainOutput(global_step=75, training_loss=3.9665718587239582, metrics={'train_runtime': 302.2093, 'train_samples_per_second': 0.993, 'train_steps_per_second': 0.248, 'total_flos': 58791770265600.0, 'train_loss': 3.9665718587239582, 'epoch': 3.0})

In [12]:
# Save the model
model.save_pretrained("./bert-qa-finetuned")
tokenizer.save_pretrained("./bert-qa-finetuned")

('./bert-qa-finetuned/tokenizer_config.json',
 './bert-qa-finetuned/special_tokens_map.json',
 './bert-qa-finetuned/vocab.txt',
 './bert-qa-finetuned/added_tokens.json',
 './bert-qa-finetuned/tokenizer.json')

In [10]:
from transformers import BertForQuestionAnswering, BertTokenizerFast
import torch

def answer_question(question, context, model_path="./bert-qa-finetuned"):
    # Load the fine-tuned model and tokenizer
    tokenizer = BertTokenizerFast.from_pretrained(model_path)
    model = BertForQuestionAnswering.from_pretrained(model_path)
    
    # Tokenize input
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=384,
        truncation="only_second",
        padding="max_length"
    )
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the most likely beginning and end of answer
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits)
    
    # Convert tokens to string
    answer_tokens = inputs["input_ids"][0][answer_start:answer_end + 1]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
    
    # Get confidence scores
    start_confidence = torch.softmax(outputs.start_logits, dim=1)[0][answer_start].item()
    end_confidence = torch.softmax(outputs.end_logits, dim=1)[0][answer_end].item()
    confidence = (start_confidence + end_confidence) / 2
    
    return {
        "answer": answer,
        "confidence": f"{confidence:.2%}",
        "start": answer_start.item(),
        "end": answer_end.item()
    }

In [11]:
# Example usage
if __name__ == "__main__":
    # Example context and question
    context = """
    Python is a high-level programming language first released in 1991 by Guido van Rossum. 
    It emphasizes code readability with its notable use of significant indentation. 
    Python features a dynamic type system and automatic memory management.
    """
    
    questions = [
        "When was Python first released?",
        "Who created Python?",
        "What does Python emphasize?"
    ]
    
    print("Context:", context.strip(), "\n")
    
    for question in questions:
        print("Question:", question)
        result = answer_question(question, context)
        print("Answer:", result["answer"])
        print("Confidence:", result["confidence"])
        print()

Context: Python is a high-level programming language first released in 1991 by Guido van Rossum. 
    It emphasizes code readability with its notable use of significant indentation. 
    Python features a dynamic type system and automatic memory management. 

Question: When was Python first released?
Answer: 1991
Confidence: 17.18%

Question: Who created Python?
Answer: 1991
Confidence: 16.56%

Question: What does Python emphasize?
Answer: 1991
Confidence: 17.86%

