In [None]:
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import evaluate
import numpy as np

In [None]:
# 1. Load the pretrained question-answering checkpoint and its tokenizer.
# Both are resolved from the same checkpoint name so that the vocabulary
# matches the model weights. Swap in any other extractive-QA checkpoint here.
model_name = "distilbert-base-cased-distilled-squad"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# 2. Load the ScienceQA dataset, requesting the "text" configuration.
# NOTE(review): the dataset id and config name are taken as-is; confirm they
# resolve on the Hub and that the splits expose question/context/answer columns.
scienceqa = load_dataset(path="scienceqa", name="text")

In [None]:
# 3. Data Preprocessing for Question Answering
def _answer_char_span(answer, context):
    """Return the answer's ``(start, end)`` character span in ``context``, or None.

    ``end`` is exclusive. The recorded ``answer_start`` is used when it really
    points at the answer text; otherwise the text is searched for in the context.
    """
    text = answer["text"]
    if isinstance(text, (list, tuple)):
        text = text[0] if text else None
    if not isinstance(text, str) or not text:
        return None  # no usable answer text (e.g. unanswerable example)

    start = answer.get("answer_start")
    if isinstance(start, (list, tuple)):
        start = start[0] if start else None

    # A missing or stale offset falls back to a plain substring search.
    if not isinstance(start, int) or context[start:start + len(text)] != text:
        start = context.find(text)
        if start < 0:
            return None
    return start, start + len(text)


def _token_span(offsets, sequence_ids, start_char, end_char):
    """Map a context character span onto ``(first_token, last_token)`` indices, or None.

    Only tokens whose sequence id is 1 (the context segment) are considered, so
    question, special and padding tokens can never be selected.
    """
    context_idxs = [k for k, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_idxs:
        return None
    first, last = context_idxs[0], context_idxs[-1]

    # The answer was (partly) cut off by truncation.
    if offsets[first][0] > start_char or offsets[last][1] < end_char:
        return None

    tok_start = first
    while tok_start < last and offsets[tok_start + 1][0] <= start_char:
        tok_start += 1
    tok_end = last
    while tok_end > first and offsets[tok_end - 1][1] >= end_char:
        tok_end -= 1
    return tok_start, tok_end


def preprocess_function(examples):
    """Tokenize question/context pairs and attach answer token spans.

    Written for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict with ``question``, ``context`` and ``answer``
            columns. NOTE(review): each answer is assumed to be SQuAD-style,
            ``{"text": [...], "answer_start": [...]}`` -- confirm against the
            dataset schema.

    Returns:
        The tokenizer output (``offset_mapping`` removed) extended with
        ``start_positions`` and ``end_positions``, one integer per example.
        Examples whose answer cannot be located inside the (possibly
        truncated) context are labelled ``(0, 0)``, i.e. the first token.
    """
    questions = [q.strip() for q in examples["question"]]
    contexts = examples["context"]
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,  # Adjust as needed
        truncation="only_second",  # Truncate the context, never the question
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for i, offsets in enumerate(offset_mapping):
        token_span = None
        char_span = _answer_char_span(examples["answer"][i], contexts[i])
        if char_span is not None:
            token_span = _token_span(offsets, inputs.sequence_ids(i), *char_span)
        if token_span is None:
            token_span = (0, 0)  # Point both labels at the first (CLS) token
        start_positions.append(token_span[0])
        end_positions.append(token_span[1])

    # One label list per column, as the QA model's forward pass expects.
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Tokenize every split in batches. The raw columns (named after the train
# split's schema) are dropped so only the model inputs and labels remain.
processed_scienceqa = scienceqa.map(
    function=preprocess_function,
    remove_columns=scienceqa["train"].column_names,
    batched=True,
)

In [None]:
# 4. Fine-tuning and Evaluation Metrics
# Load the SQuAD metric module from the `evaluate` library.
metric = evaluate.load(path="squad")

In [None]:
def compute_metrics(p):
    """Score predicted answer spans against the labelled spans.

    The evaluation loop hands over logits and label positions only, not the
    input ids, so answer *text* cannot be reconstructed here. The scores are
    therefore computed on token spans directly: ``exact_match`` requires both
    boundaries to agree, and ``f1`` is the token-overlap F1 between the
    predicted and the labelled span.

    Args:
        p: Evaluation output with ``predictions = (start_logits, end_logits)``
            and ``label_ids = (start_positions, end_positions)``.

    Returns:
        Dict with ``exact_match`` and ``f1`` on a 0-100 scale, averaged over
        all examples (both 0.0 for an empty evaluation set).
    """
    start_logits, end_logits = p.predictions
    start_true = np.asarray(p.label_ids[0])
    end_true = np.asarray(p.label_ids[1])
    start_pred = np.argmax(start_logits, axis=-1)
    end_pred = np.argmax(end_logits, axis=-1)

    if len(start_true) == 0:
        return {"exact_match": 0.0, "f1": 0.0}

    exact = (start_pred == start_true) & (end_pred == end_true)

    # Span lengths in tokens; an inverted span (end < start) counts as empty.
    pred_len = np.maximum(end_pred - start_pred + 1, 0)
    true_len = np.maximum(end_true - start_true + 1, 0)
    overlap = np.maximum(
        np.minimum(end_pred, end_true) - np.maximum(start_pred, start_true) + 1, 0
    )

    # F1 = 2PR / (P + R) with P = overlap / pred_len and R = overlap / true_len.
    total_len = pred_len + true_len
    f1 = np.where(total_len > 0, 2.0 * overlap / np.maximum(total_len, 1), 0.0)

    return {
        "exact_match": 100.0 * float(np.mean(exact)),
        "f1": 100.0 * float(np.mean(f1)),
    }

In [None]:
# Training Arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm which spelling the installed version expects.
training_args = TrainingArguments(
    output_dir="./scienceqa-qa",  # Where checkpoints and logs are written
    evaluation_strategy="epoch",  # Evaluate at the end of every epoch
    per_device_train_batch_size=8,  # Batch size (adjust based on resources)
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Number of training epochs
    learning_rate=5e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay
    warmup_steps=500,  # Warmup steps
    # Mixed precision needs a CUDA device; enabling it unconditionally breaks
    # CPU-only runs, so switch it on only when a GPU is actually present.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,  # Set to True to push to hub
)

In [None]:
# Trainer: wires the model, the hyper-parameters, the tokenized splits and the
# metric callback together. The "validation" split must exist in the processed
# dataset, since it is used for the per-epoch evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=processed_scienceqa["validation"],
    train_dataset=processed_scienceqa["train"],
)

In [None]:
# Fine-tune the model: runs the Trainer's training loop with the arguments and
# datasets it was constructed with. As the last expression of the cell, the
# call's return value is displayed as the cell output.
trainer.train()

In [None]:
# Persist the fine-tuned model to its own directory, separate from the
# checkpoint directory configured in the training arguments.
trainer.save_model(output_dir="./scienceqa-qa-fine-tuned")