In [1]:
%pip install -q datasets  python-dotenv 

Note: you may need to restart the kernel to use updated packages.


In [11]:
%pip install torch transformers[torch] accelerate

zsh:1: no matches found: transformers[torch]
Note: you may need to restart the kernel to use updated packages.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
from dotenv import load_dotenv
import os

# Read environment variables from the local env.txt file
load_dotenv(dotenv_path="env.txt")

True

In [16]:
import torch

In [2]:
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

In [3]:
# Keep the run fast: take only small slices of SQuAD
# (first 100 training rows, first 20 validation rows)
dataset = load_dataset(path="squad", split="train[:100]")
eval_dataset = load_dataset(path="squad", split="validation[:20]")

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

In [4]:
# DistilBERT checkpoint shared by the tokenizer and the QA model
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_name
)
model = DistilBertForQuestionAnswering.from_pretrained(
    pretrained_model_name_or_path=model_name
)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch dict with parallel lists under "question", "context"
            and "answers". Each answers entry holds "text" and
            "answer_start" lists; only the first answer is used.

    Returns:
        The tokenizer output for the batch with "start_positions" and
        "end_positions" added (token indices of the first and last answer
        token). Both are 0 when an example has no answer or when the answer
        does not lie entirely inside the (possibly truncated) context.
    """
    # `stride` is not passed: it only shapes overflowing windows, and those
    # are disabled here.
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=256,
        return_offsets_mapping=True,
        padding="max_length",
        return_overflowing_tokens=False,  # One feature per example
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized["offset_mapping"]):
        answer = examples["answers"][i]

        # No answer recorded for this example: label it with position 0.
        if not answer["answer_start"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # Offsets of question tokens are relative to the question string and
        # special/padding tokens report (0, 0), so only tokens belonging to
        # the second sequence (the context) may be matched against the
        # answer's character span.
        sequence_ids = tokenized.sequence_ids(i)
        context_indices = [
            idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1
        ]
        if not context_indices:
            start_positions.append(0)
            end_positions.append(0)
            continue
        context_start = context_indices[0]
        context_end = context_indices[-1]

        # Answer was cut off by truncation (or lies outside the context).
        if (
            offsets[context_start][0] > start_char
            or offsets[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that starts at or before the answer start.
        idx = context_start
        while idx <= context_end and offsets[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # First context token (scanning backwards) that ends at or after the
        # answer end.
        idx = context_end
        while idx >= context_start and offsets[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    # Add answer positions to tokenized output
    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions

    return tokenized

In [6]:
# Tokenize the training subset in batches of 32 and drop the raw columns
tokenized_dataset = dataset.map(
    function=preprocess_function,
    remove_columns=dataset.column_names,
    batched=True,
    batch_size=32,
)

In [7]:
# Same preprocessing for the validation subset (batches of 32, raw columns dropped)
tokenized_eval_dataset = eval_dataset.map(
    function=preprocess_function,
    remove_columns=eval_dataset.column_names,
    batched=True,
    batch_size=32,
)

In [9]:
# Minimal single-epoch, CPU-only training setup
training_args = TrainingArguments(
    # Where the Trainer writes its outputs
    output_dir="./quick-qa-results",
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Run on CPU regardless of available accelerators
    use_cpu=True,
    # Logging only: no evaluation passes, no checkpoints, no external reporters
    logging_steps=10,
    eval_strategy="no",
    save_strategy="no",
    report_to="none",
)

In [10]:
# Build the Trainer around the tokenized training subset
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
)

In [11]:
# Run fine-tuning, then write the model weights and tokenizer files to the
# same directory so both can be reloaded together later
trainer.train()
model.save_pretrained(save_directory="./quick-qa-model")
tokenizer.save_pretrained(save_directory="./quick-qa-model")

Step,Training Loss
10,5.446
20,5.2105


('./quick-qa-model/tokenizer_config.json',
 './quick-qa-model/special_tokens_map.json',
 './quick-qa-model/vocab.txt',
 './quick-qa-model/added_tokens.json',
 './quick-qa-model/tokenizer.json')

In [12]:
def load_qa_model(model_path="./quick-qa-model"):
    """Load a saved DistilBERT QA model together with its tokenizer.

    Args:
        model_path: Directory (or model identifier) passed to both
            ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    qa_tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
    qa_model = DistilBertForQuestionAnswering.from_pretrained(model_path)
    return qa_model, qa_tokenizer

In [14]:

def answer_question(question, context, model, tokenizer, max_answer_tokens=30):
    """Extract the answer to ``question`` from ``context``.

    The answer is the highest-scoring token span (start logit + end logit)
    whose end is not before its start and whose length does not exceed
    ``max_answer_tokens``. Picking start and end by independent argmax, as
    a naive implementation would, can yield an end before the start and
    therefore an empty answer.

    Args:
        question: Question text.
        context: Passage to search for the answer.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer used to encode the pair and decode the answer.
        max_answer_tokens: Upper bound on the answer length in tokens.

    Returns:
        The decoded answer string, or "" when no span is admissible.
    """
    # Tokenize input
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        max_length=256,
        truncation="only_second",
        padding=True
    )

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Single example in the batch: work on 1-D logit vectors.
    start_logits = outputs.start_logits[0]
    end_logits = outputs.end_logits[0]
    seq_len = start_logits.shape[0]

    # span_scores[s, e] is the score of the span starting at s and ending at e.
    span_scores = start_logits.unsqueeze(1) + end_logits.unsqueeze(0)

    # Admissible spans: start <= end and at most max_answer_tokens long.
    positions = torch.arange(seq_len, device=start_logits.device)
    span_extent = positions.unsqueeze(0) - positions.unsqueeze(1)  # end - start
    allowed = (span_extent >= 0) & (span_extent < max_answer_tokens)

    # When the tokenizer output can tell sequences apart (fast tokenizers),
    # keep only spans that lie entirely in the context so the answer is never
    # drawn from the question or from special tokens.
    try:
        sequence_ids = inputs.sequence_ids(0)
    except (AttributeError, ValueError):
        sequence_ids = None
    if sequence_ids is not None:
        in_context = torch.tensor(
            [seq_id == 1 for seq_id in sequence_ids],
            dtype=torch.bool,
            device=start_logits.device,
        )
        allowed = allowed & in_context.unsqueeze(1) & in_context.unsqueeze(0)

    span_scores = span_scores.masked_fill(~allowed, float("-inf"))
    best_start, best_end = divmod(int(torch.argmax(span_scores)), seq_len)

    # Every candidate was masked out (e.g. empty context).
    if not bool(allowed[best_start, best_end]):
        return ""

    # Convert token positions to string
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    answer = tokenizer.convert_tokens_to_string(tokens[best_start:best_end + 1])

    return answer

In [17]:
# Reload the fine-tuned model and tokenizer from the default save directory
model, tokenizer = load_qa_model()

# Short passage and a question whose answer appears in it
context = """
Python is a high-level programming language created by Guido van Rossum and released in 1991. 
Python's design emphasizes code readability with its notable use of significant whitespace. 
Its language constructs and object-oriented approach aim to help programmers write clear, logical code.
"""

question = "Who created Python?"

# Extract the answer and show it next to the question
answer = answer_question(
    question=question,
    context=context,
    model=model,
    tokenizer=tokenizer,
)
print(f"\nQuestion: {question}")
print(f"Answer: {answer}")


Question: Who created Python?
Answer: 1991
