In [1]:
import torch
print(torch.__version__)

2.5.1+cu124


In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
from transformers import  AutoTokenizer, TrainingArguments, Trainer , AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate
import numpy as np

In [6]:
# 1. Download Pretrained Model and Tokenizer
model_name = "distilbert/distilbert-base-cased-distilled-squad"  # Or "bert-base-uncased" for TinyBERT, but DistilBERT for QA
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

In [7]:
# 2. Download ScienceQA Dataset (Text Only)
scienceqa_train = load_dataset("derek-thomas/ScienceQA", "default", split="train", streaming=True)
scienceqa_validation = load_dataset("derek-thomas/ScienceQA", "default", split="validation", streaming=True)

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

In [8]:
# from transformers import DistilBertModel
# model_name = "distilbert-base-uncased"
# model = DistilBertModel.from_pretrained(model_name, torch_dtype=torch.float16, attn_implementation="sdpa")

In [9]:
# 3. Data Preprocessing for Question Answering (Modified for Streaming)
def preprocess_function(examples):  # No longer batched
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = []
    for i, offsets in enumerate(offset_mapping):
        answer = examples["answer"][i]
        start_char = answer["text"][0]
        end_char = answer["text"][-1]
        start_token_idx = None
        end_token_idx = None
        for idx, (start, end) in enumerate(offsets):
            if start <= start_char <= end:
                start_token_idx = idx
            if start <= end_char <= end:
                end_token_idx = idx
        if start_token_idx is None or end_token_idx is None:
            answers.append({'start_positions': 0, 'end_positions': 0})
            continue
        answers.append({
            'start_positions': start_token_idx,
            'end_positions': end_token_idx,
        })
    inputs.update(answers)
    return inputs

In [10]:
import accelerate

accelerate.__version__

'1.3.0'

In [16]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./scienceqa-qa",  # Output directory
    evaluation_strategy="epoch",  # Evaluation strategy
    per_device_train_batch_size=8,  # Batch size (adjust based on resources)
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Number of training epochs
    learning_rate=5e-5,  # Learning rate
    weight_decay=0.01,  # Weight decay
    warmup_steps=500,  # Warmup steps
    fp16=True,  # Use mixed precision training if GPU supports it
    push_to_hub=False, # Set to True to push to hub
    max_steps=5
)



In [17]:
# 4. Fine-tuning and Evaluation Metrics
metric = evaluate.load("squad")  # Use the SQuAD metric

def compute_metrics(p):
    start_logits, end_logits = p.predictions
    start_positions = p.label_ids[0]
    end_positions = p.label_ids[1]
    start_pred = np.argmax(start_logits, axis=-1)
    end_pred = np.argmax(end_logits, axis=-1)

    # Need to convert to a format that metric expects
    formatted_predictions = []
    formatted_references = []

    for i in range(len(start_positions)):
        prediction = {'prediction_text': tokenizer.decode(start_pred[i], end_pred[i]), 'id': str(i)}
        reference = {'answers': [{'text': tokenizer.decode(start_positions[i], end_positions[i]), 'answer_start': 0}], 'id': str(i)}
        formatted_predictions.append(prediction)
        formatted_references.append(reference)

    return metric.compute(predictions=formatted_predictions, references=formatted_references)


In [18]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=scienceqa_train.map(
        preprocess_function, batched=True, remove_columns=scienceqa_train.column_names, batch_size=10 # Add batch size here
    ),  # <--- MAP CALL ADDED HERE (TRAIN)
    eval_dataset=scienceqa_validation.map(
        preprocess_function, batched=True, remove_columns=scienceqa_validation.column_names, batch_size=10 # Add batch size here
    ),  # <--- MAP CALL ADDED HERE (VALIDATION)
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [19]:
# 4. Fine-tuning and Evaluation Metrics
metric = evaluate.load("squad")  # Use the SQuAD metric

In [20]:
def compute_metrics(p):
    start_logits, end_logits = p.predictions
    start_positions = p.label_ids[0]
    end_positions = p.label_ids[1]
    start_pred = np.argmax(start_logits, axis=-1)
    end_pred = np.argmax(end_logits, axis=-1)

    # Need to convert to a format that metric expects
    formatted_predictions = []
    formatted_references = []

    for i in range(len(start_positions)):
        prediction = {'prediction_text': tokenizer.decode(start_pred[i], end_pred[i]), 'id': str(i)}
        reference = {'answers': [{'text': tokenizer.decode(start_positions[i], end_positions[i]), 'answer_start': 0}], 'id': str(i)}
        formatted_predictions.append(prediction)
        formatted_references.append(reference)

    return metric.compute(predictions=formatted_predictions, references=formatted_references)

In [21]:
# Fine-tune the model
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:


Abort: 

In [None]:
# Save the fine-tuned model
trainer.save_model("./scienceqa-qa-fine-tuned") #save model