In [None]:
!pip install evaluate -q

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import evaluate
from transformers import logging

# Restrict transformers logging to errors so cell output stays readable
logging.set_verbosity_error()

# Prefer CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


In [None]:
# Fetch SQuAD v1.1 (train + validation splits)
dataset = load_dataset("squad")

# Fixed-seed shuffle, then keep only the first N rows of each split so the
# experiments below run quickly and see the same examples on every run
train_size = 2000
val_size = 500
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(train_size))
)
small_val_dataset = (
    dataset["validation"]
    .shuffle(seed=42)
    .select(range(val_size))
)

print("Dataset splits:", dataset.keys())
print("Example training record:", small_train_dataset[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset splits: dict_keys(['train', 'validation'])
Example training record: {'id': '573173d8497a881900248f0c', 'title': 'Egypt', 'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.', 'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?', 'answers': {'text': ['84%'], 'answer_start': [468]}}


In [None]:
# Load GPT-2 tokenizer
gpt2_model_name = "gpt2"
gpt2_tokenizer = AutoTokenizer.from_pretrained(gpt2_model_name)

# Add a padding token (GPT-2 does not have one)
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token


def preprocess_gpt2(examples, max_length=256):
    """Turn a batch of SQuAD rows into fixed-length causal-LM training rows.

    A decoder-only model can only learn to answer if the answer is part of the
    sequence it is trained on, so each row is laid out as

        "question: <q> context: <c>" + " <first answer>" + EOS

    which matches the prompt format used for generation elsewhere in this
    notebook. The prompt is truncated (never the answer) so the answer always
    fits inside ``max_length`` tokens.

    Args:
        examples: batched SQuAD columns (``question``, ``context``, ``answers``).
        max_length: length every returned sequence is padded/truncated to.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; every list
        has exactly ``max_length`` entries. ``labels`` equals ``input_ids`` on
        the answer (+EOS) tokens and is -100 on prompt and padding positions,
        so a loss that honours -100 is computed on the answer only.

    Note:
        A collator that rebuilds labels from ``input_ids`` (for example
        ``DataCollatorForLanguageModeling(mlm=False)``) replaces these labels;
        the answer is still part of the training sequence in that case.
    """
    eos_id = gpt2_tokenizer.eos_token_id
    pad_id = gpt2_tokenizer.pad_token_id
    batch = {"input_ids": [], "attention_mask": [], "labels": []}

    for question, context, answers in zip(
        examples["question"], examples["context"], examples["answers"]
    ):
        prompt_ids = gpt2_tokenizer("question: " + question + " context: " + context)["input_ids"]
        # Leading space so the answer tokenizes as a continuation of the prompt
        answer_ids = gpt2_tokenizer(" " + answers["text"][0])["input_ids"] + [eos_id]

        # Reserve room for the answer first, then give the rest to the prompt
        answer_ids = answer_ids[:max_length]
        prompt_ids = prompt_ids[: max_length - len(answer_ids)]

        input_ids = prompt_ids + answer_ids
        labels = [-100] * len(prompt_ids) + answer_ids
        n_pad = max_length - len(input_ids)

        batch["input_ids"].append(input_ids + [pad_id] * n_pad)
        batch["attention_mask"].append([1] * len(input_ids) + [0] * n_pad)
        batch["labels"].append(labels + [-100] * n_pad)

    return batch

# Tokenize both subsets batch-wise and drop the raw text columns
train_gpt2 = small_train_dataset.map(
    preprocess_gpt2,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
val_gpt2 = small_val_dataset.map(
    preprocess_gpt2,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

# Decode the first 100 tokens of one row as a quick sanity check
print("Sample tokenized GPT-2 input:")
first_row_ids = train_gpt2[0]["input_ids"]
print(gpt2_tokenizer.decode(first_row_ids[:100]))


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Sample tokenized GPT-2 input:
question: What percentage of Egyptians polled support death penalty for those leaving Islam? context: The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84


In [None]:
# Sanity check: inputs and labels must have the same length in every row
print("Input IDs and Labels dimensions for first 10 samples:")
for i in range(10):
    row = train_gpt2[i]
    n_input_ids = len(row["input_ids"])
    n_labels = len(row["labels"])
    print(f"Sample {i}: Input IDs length = {n_input_ids}, Labels length = {n_labels}")


Input IDs and Labels dimensions for first 10 samples:
Sample 0: Input IDs length = 256, Labels length = 256
Sample 1: Input IDs length = 256, Labels length = 256
Sample 2: Input IDs length = 256, Labels length = 256
Sample 3: Input IDs length = 256, Labels length = 256
Sample 4: Input IDs length = 256, Labels length = 256
Sample 5: Input IDs length = 256, Labels length = 256
Sample 6: Input IDs length = 256, Labels length = 256
Sample 7: Input IDs length = 256, Labels length = 256
Sample 8: Input IDs length = 256, Labels length = 256
Sample 9: Input IDs length = 256, Labels length = 256


In [None]:
# Collator for causal language modeling (mlm=False -> next-token objective).
# NOTE: with mlm=False this collator builds `labels` from `input_ids`.
data_collator_gpt2 = DataCollatorForLanguageModeling(
    tokenizer=gpt2_tokenizer,
    mlm=False,
)

# Pre-trained GPT-2 with its language-modeling head
gpt2_model = AutoModelForCausalLM.from_pretrained(gpt2_model_name)

# Hyperparameters for a short fine-tuning run
training_args_gpt2 = TrainingArguments(
    output_dir="./gpt2-squad",
    # --- schedule / batching ---
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    # --- optimisation ---
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=50,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    # --- evaluation / logging / checkpoints ---
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    report_to=[],                     # no wandb / tensorboard reporting
)

# Wire model, data and arguments together
trainer_gpt2 = Trainer(
    args=training_args_gpt2,
    model=gpt2_model,
    data_collator=data_collator_gpt2,
    train_dataset=train_gpt2,
    eval_dataset=val_gpt2,
)

# Train GPT-2 (left disabled; uncomment to fine-tune)
# trainer_gpt2.train()



In [None]:
# Test the trained (or pre-trained) GPT-2 with a QA example
sample_input = "question: Who wrote the novel '1984'? context: The novel '1984' was written by George Orwell in 1949."
inputs = gpt2_tokenizer(sample_input, return_tensors="pt").to(device)

# Number of prompt tokens; generate() returns prompt + continuation for
# decoder-only models, so this is where the actual answer starts.
prompt_length = inputs["input_ids"].shape[1]

outputs = gpt2_model.generate(
    **inputs,                                   # input_ids AND attention_mask
    max_new_tokens=50,                          # budget for the continuation only
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=gpt2_tokenizer.pad_token_id,   # GPT-2 has no pad id of its own
)

# Decode only the newly generated tokens so the prompt is not echoed back
generated_ids = outputs[0][prompt_length:]
print("Generated Answer:", gpt2_tokenizer.decode(generated_ids, skip_special_tokens=True).strip())


Generated Answer: question: Who wrote the novel '1984'? context: The novel '1984' was written by George Orwell in 1949. He was only 21-23, but he had never completed his thesis before he died. The author's name was George Orwell.


In [None]:
!pip install evaluate -q

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DefaultDataCollator,
    Trainer,
    TrainingArguments,
)
import evaluate
from transformers import logging

# Restrict transformers logging to errors so cell output stays readable
logging.set_verbosity_error()

# Prefer CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Fetch SQuAD v1.1 (train + validation splits)
dataset = load_dataset("squad")

# Fixed-seed shuffle, then keep the first N rows of each split
train_size = 2000
val_size = 500
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(train_size))
)
small_val_dataset = (
    dataset["validation"]
    .shuffle(seed=42)
    .select(range(val_size))
)

print("Example training record:", small_train_dataset[0])


Example training record: {'id': '573173d8497a881900248f0c', 'title': 'Egypt', 'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.', 'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?', 'answers': {'text': ['84%'], 'answer_start': [468]}}


In [None]:
bert_model_name = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

max_length = 384   # tokens per feature (question + one context window)
doc_stride = 128   # token overlap between consecutive context windows

def preprocess_bert(examples):
    """Tokenize SQuAD rows into extractive-QA features with span labels.

    Long contexts are split into overlapping windows, so one example can
    produce several features. For each feature the first answer's character
    span is mapped to start/end token indices.

    Labelling rules:
      * only context tokens (sequence id 1) are considered, because offsets of
        question tokens are relative to the question string and could
        otherwise match the answer's character positions by coincidence;
      * when the answer is not fully contained in the feature's context
        window (or the example has no answer), both positions point at the
        [CLS] token.

    Args:
        examples: batched SQuAD columns (``question``, ``context``, ``answers``).

    Returns:
        The tokenizer output (without ``offset_mapping`` and
        ``overflow_to_sample_mapping``) extended with ``start_positions`` and
        ``end_positions``, one entry per feature.
    """
    inputs = bert_tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_mapping = inputs.pop("overflow_to_sample_mapping")

    start_positions, end_positions = [], []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(bert_tokenizer.cls_token_id)

        # None for special/padding tokens, 0 for the question, 1 for the context
        sequence_ids = inputs.sequence_ids(i)
        answers = examples["answers"][sample_mapping[i]]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        answer_start = answers["answer_start"][0]
        answer_end = answer_start + len(answers["text"][0])

        # First and last context token of this window
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # Answer (partly) outside this window -> label the feature with [CLS]
        if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk inwards from both ends to the tokens that bracket the answer
        token_start = context_start
        while token_start <= context_end and offsets[token_start][0] <= answer_start:
            token_start += 1
        token_end = context_end
        while token_end >= context_start and offsets[token_end][1] >= answer_end:
            token_end -= 1

        start_positions.append(token_start - 1)
        end_positions.append(token_end + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Build QA features for both subsets; the raw text columns are dropped
train_bert = small_train_dataset.map(
    preprocess_bert,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
val_bert = small_val_dataset.map(
    preprocess_bert,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

print("Keys after preprocessing:", train_bert.column_names)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Keys after preprocessing: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']


In [None]:
# Features are already padded to a fixed length, so plain stacking is enough
data_collator_bert = DefaultDataCollator()

# Pre-trained BERT encoder with a span-prediction (start/end) head
bert_model = AutoModelForQuestionAnswering.from_pretrained(bert_model_name)

# Hyperparameters for the fine-tuning run
training_args_bert = TrainingArguments(
    output_dir="./bert-squad",
    # --- schedule / batching ---
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- optimisation ---
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    # --- evaluation / logging / checkpoints ---
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
    report_to=[],
)

# Wire model, data and arguments together
trainer_bert = Trainer(
    args=training_args_bert,
    model=bert_model,
    tokenizer=bert_tokenizer,
    data_collator=data_collator_bert,
    train_dataset=train_bert,
    eval_dataset=val_bert,
)

# Fine-tune BERT
trainer_bert.train()



  trainer_bert = Trainer(


{'loss': 5.0306, 'grad_norm': 23.749914169311523, 'learning_rate': 2.97e-05, 'epoch': 0.3968253968253968}
{'loss': 2.8066, 'grad_norm': 25.00146484375, 'learning_rate': 2.264851485148515e-05, 'epoch': 0.7936507936507936}
{'eval_loss': 2.319882869720459, 'eval_runtime': 2.7624, 'eval_samples_per_second': 183.174, 'eval_steps_per_second': 23.168, 'epoch': 0.7936507936507936}
{'loss': 2.119, 'grad_norm': 38.59650802612305, 'learning_rate': 1.5222772277227723e-05, 'epoch': 1.1904761904761905}
{'loss': 1.6015, 'grad_norm': 26.141923904418945, 'learning_rate': 7.797029702970298e-06, 'epoch': 1.5873015873015874}
{'eval_loss': 1.8514269590377808, 'eval_runtime': 2.8186, 'eval_samples_per_second': 179.523, 'eval_steps_per_second': 22.707, 'epoch': 1.5873015873015874}
{'loss': 1.4907, 'grad_norm': 29.81026840209961, 'learning_rate': 3.712871287128713e-07, 'epoch': 1.9841269841269842}
{'train_runtime': 135.8508, 'train_samples_per_second': 29.68, 'train_steps_per_second': 3.71, 'train_loss': 2.60

TrainOutput(global_step=504, training_loss=2.6010605872623502, metrics={'train_runtime': 135.8508, 'train_samples_per_second': 29.68, 'train_steps_per_second': 3.71, 'train_loss': 2.6010605872623502, 'epoch': 2.0})

In [None]:
metric = evaluate.load("squad")

def compute_metrics(eval_pred):
    """Compute SQuAD exact-match / F1 from start/end logits.

    Assumes the evaluated dataset is ``val_bert``, i.e. the features produced
    from ``small_val_dataset`` with ``max_length``/``doc_stride``. Because long
    contexts yield several features per example, the validation set is
    re-tokenized here to recover the feature -> example mapping and the
    character offsets; the highest-scoring span across an example's features
    is used as its prediction.

    Args:
        eval_pred: ``(predictions, labels)`` where ``predictions`` is
            ``(start_logits, end_logits)``, each indexed by feature.

    Returns:
        dict with ``exact_match`` and ``f1`` as reported by the squad metric.

    Raises:
        ValueError: if the number of predictions does not match the number of
            features derived from ``small_val_dataset``.
    """
    predictions, _ = eval_pred
    start_logits, end_logits = predictions[0], predictions[1]

    questions = list(small_val_dataset["question"])
    contexts = list(small_val_dataset["context"])
    encoded = bert_tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = encoded["overflow_to_sample_mapping"]

    if len(sample_mapping) != len(start_logits):
        raise ValueError(
            f"Got {len(start_logits)} predictions for {len(sample_mapping)} features; "
            "compute_metrics expects predictions for val_bert."
        )

    # Best (score, text) seen so far for every example index
    best = {}
    for feature_index, sample_index in enumerate(sample_mapping):
        start = int(start_logits[feature_index].argmax())
        end = int(end_logits[feature_index].argmax())
        score = float(start_logits[feature_index][start]) + float(end_logits[feature_index][end])

        sequence_ids = encoded.sequence_ids(feature_index)
        offsets = encoded["offset_mapping"][feature_index]
        if start <= end and sequence_ids[start] == 1 and sequence_ids[end] == 1:
            # Slice the original context so casing and spacing are preserved
            text = contexts[sample_index][offsets[start][0]:offsets[end][1]]
        else:
            # [CLS], question tokens or an inverted span -> no answer here
            text = ""

        if sample_index not in best or score > best[sample_index][0]:
            best[sample_index] = (score, text)

    pred_records, ref_records = [], []
    for sample_index, example in enumerate(small_val_dataset):
        pred_records.append({
            "id": example["id"],
            "prediction_text": best.get(sample_index, (0.0, ""))[1],
        })
        ref_records.append({"id": example["id"], "answers": example["answers"]})

    return metric.compute(predictions=pred_records, references=ref_records)

# Example usage after training (the Trainer above was built without
# compute_metrics, so attach it before evaluating):
# trainer_bert.compute_metrics = compute_metrics
# results = trainer_bert.evaluate()
# print("BERT QA Results:", results)


In [None]:
question = "Who wrote the novel '1984'?"
context = "The novel '1984' was written by George Orwell in 1949."

inputs = bert_tokenizer(question, context, return_tensors="pt").to(device)

# Training leaves the model in train mode; switch to eval so dropout is off
# and the prediction is deterministic. no_grad avoids building a graph.
bert_model.eval()
with torch.no_grad():
    outputs = bert_model(**inputs)

# Most likely start token and (exclusive) end token of the answer span
start_idx = int(torch.argmax(outputs.start_logits))
end_idx = int(torch.argmax(outputs.end_logits)) + 1

answer = bert_tokenizer.convert_tokens_to_string(
    bert_tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_idx:end_idx])
)

print("Predicted Answer:", answer)


Predicted Answer: george orwell


In [None]:
!pip install evaluate -q

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)
import evaluate
from transformers import logging

# Restrict transformers logging to errors so cell output stays readable
logging.set_verbosity_error()

# Prefer CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Fetch SQuAD v1.1 (train + validation splits)
dataset = load_dataset("squad")

# Fixed-seed shuffle, then keep the first N rows of each split
train_size = 2000
val_size = 500
small_train_dataset = (
    dataset["train"]
    .shuffle(seed=42)
    .select(range(train_size))
)
small_val_dataset = (
    dataset["validation"]
    .shuffle(seed=42)
    .select(range(val_size))
)

print("Example training record:", small_train_dataset[0])


Example training record: {'id': '573173d8497a881900248f0c', 'title': 'Egypt', 'context': 'The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to a 2010 Pew Global Attitudes survey, 84% of Egyptians polled supported the death penalty for those who leave Islam; 77% supported whippings and cutting off of hands for theft and robbery; and 82% support stoning a person who commits adultery.', 'question': 'What percentage of Egyptians polled support death penalty for those leaving Islam?', 'answers': {'text': ['84%'], 'answer_start': [468]}}


In [None]:
t5_model_name = "t5-small"
t5_tokenizer = AutoTokenizer.from_pretrained(t5_model_name)

def preprocess_t5(examples):
    """Tokenize SQuAD rows into fixed-length seq2seq inputs and labels.

    The encoder input is "question: <q> context: <c>" (max 512 tokens) and the
    target is the first reference answer (max 64 tokens).

    Padding positions in the labels are replaced by -100 so the loss ignores
    them; leaving the pad id in place would train the model mostly on
    predicting padding.

    Args:
        examples: batched SQuAD columns (``question``, ``context``, ``answers``).

    Returns:
        The tokenizer output for the inputs, extended with ``labels``.
    """
    # Input: "question: ... context: ..."
    inputs = ["question: " + q + " context: " + c for q, c in zip(examples["question"], examples["context"])]
    targets = [a["text"][0] for a in examples["answers"]]

    model_inputs = t5_tokenizer(
        inputs, max_length=512, truncation=True, padding="max_length"
    )

    labels = t5_tokenizer(
        targets, max_length=64, truncation=True, padding="max_length"
    )

    # Mask padding in the targets so it does not contribute to the loss
    pad_id = t5_tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize both subsets batch-wise and drop the raw text columns
train_t5 = small_train_dataset.map(
    preprocess_t5,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
val_t5 = small_val_dataset.map(
    preprocess_t5,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)

# Decode the first 100 tokens of one row as a quick sanity check
print("Sample tokenized T5 input:")
first_row_ids = train_t5[0]["input_ids"]
print(t5_tokenizer.decode(first_row_ids[:100]))


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Sample tokenized T5 input:
question: What percentage of Egyptians polled support death penalty for those leaving Islam? context: The Pew Forum on Religion & Public Life ranks Egypt as the fifth worst country in the world for religious freedom. The United States Commission on International Religious Freedom, a bipartisan independent agency of the US government, has placed Egypt on its watch list of countries that require close monitoring due to the nature and extent of violations of religious freedom engaged in or tolerated by the government. According to 


In [None]:
# Load T5 model first: the collator below needs the model instance, and
# referencing `t5_model` before this line raises NameError on a fresh kernel.
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_model_name)

# Data collator
data_collator_t5 = DataCollatorForSeq2Seq(tokenizer=t5_tokenizer, model=t5_model)

# Training arguments (compatible with your version)
training_args_t5 = TrainingArguments(
    output_dir="./t5-squad",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=1,
    warmup_steps=50,
    gradient_accumulation_steps=4,
    fp16=torch.cuda.is_available(),   # mixed precision only when a GPU is present
    report_to=[],                     # disable wandb/tensorboard
)

trainer_t5 = Trainer(
    model=t5_model,
    args=training_args_t5,
    train_dataset=train_t5,
    eval_dataset=val_t5,
    tokenizer=t5_tokenizer,
    data_collator=data_collator_t5,
)

# Train (comment if just testing pipeline)
# trainer_t5.train()

  trainer_t5 = Trainer(


In [None]:
question = "Who wrote the novel '1984'?"
context = "The novel '1984' was written by George Orwell in 1949."

# Same "question: ... context: ..." layout used during preprocessing
input_text = f"question: {question} context: {context}"
inputs = t5_tokenizer(input_text, return_tensors="pt").to(device)

# Sample a single answer with top-k / nucleus sampling
generation_kwargs = dict(
    max_length=50,
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
)
outputs = t5_model.generate(inputs["input_ids"], **generation_kwargs)

predicted_answer = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Predicted Answer:", predicted_answer)


Predicted Answer: George Orwell


In [None]:
import evaluate

# Load the official SQuAD metric (reports exact_match and F1). The evaluate_*
# helpers later in this notebook pass their predictions/references to it.
squad_metric = evaluate.load("squad")


In [None]:
def evaluate_gpt2(num_samples=20):
    """Score GPT-2 on the first ``num_samples`` validation examples.

    Each example is turned into a "question: ... context: ..." prompt and
    answered with greedy decoding. Only the newly generated tokens are used as
    the prediction: ``generate`` returns prompt + continuation for a
    decoder-only model, and scoring the echoed prompt would swamp the answer.

    Args:
        num_samples: number of validation examples to evaluate (capped at the
            size of ``small_val_dataset``).

    Returns:
        dict with ``exact_match`` and ``f1`` from the SQuAD metric.
    """
    max_new_tokens = 64
    # Leave room for the continuation inside the model's position limit
    max_positions = getattr(gpt2_model.config, "max_position_embeddings", 1024)
    max_prompt_tokens = max_positions - max_new_tokens

    preds, refs = [], []
    for i in range(min(num_samples, len(small_val_dataset))):
        example = small_val_dataset[i]
        q = example["question"]
        c = example["context"]
        true_ans = example["answers"]

        input_text = f"question: {q} context: {c}"
        inputs = gpt2_tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            max_length=max_prompt_tokens,
        ).to(device)
        prompt_length = inputs["input_ids"].shape[1]

        outputs = gpt2_model.generate(
            **inputs,                                   # input_ids AND attention_mask
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=gpt2_tokenizer.eos_token_id,   # GPT-2 has no pad id of its own
        )
        # Drop the prompt tokens; keep only what the model produced
        pred_text = gpt2_tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        preds.append({"id": str(i), "prediction_text": pred_text})
        refs.append({"id": str(i), "answers": true_ans})

    return squad_metric.compute(predictions=preds, references=refs)

print("GPT-2 Evaluation:", evaluate_gpt2())


GPT-2 Evaluation: {'exact_match': 0.0, 'f1': 3.913101540426598}


In [None]:
def evaluate_bert(num_samples=20):
    """Score the BERT QA model on the first ``num_samples`` validation examples.

    For each example the most likely start and end tokens are taken
    independently and the tokens in between are decoded as the answer. An
    inverted span (end before start) yields an empty prediction.

    The model is put in eval mode and run under ``torch.no_grad()``: after
    ``Trainer.train()`` it is still in train mode, which would leave dropout
    active and make the scores noisy.

    Args:
        num_samples: number of validation examples to evaluate (capped at the
            size of ``small_val_dataset``).

    Returns:
        dict with ``exact_match`` and ``f1`` from the SQuAD metric.
    """
    preds, refs = [], []
    bert_model.eval()

    for i in range(min(num_samples, len(small_val_dataset))):
        example = small_val_dataset[i]
        q = example["question"]
        c = example["context"]
        true_ans = example["answers"]

        # Truncate only the context so the question is never cut off
        inputs = bert_tokenizer(
            q, c, return_tensors="pt", truncation="only_second", max_length=384
        ).to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)

        start_idx = int(torch.argmax(outputs.start_logits))
        end_idx = int(torch.argmax(outputs.end_logits)) + 1
        # skip_special_tokens keeps [CLS]/[SEP] out of the predicted text
        pred_text = bert_tokenizer.decode(
            inputs["input_ids"][0][start_idx:end_idx], skip_special_tokens=True
        )

        preds.append({"id": str(i), "prediction_text": pred_text})
        refs.append({"id": str(i), "answers": true_ans})

    return squad_metric.compute(predictions=preds, references=refs)

print("BERT Evaluation:", evaluate_bert())


BERT Evaluation: {'exact_match': 60.0, 'f1': 65.80555555555556}


In [None]:
def evaluate_t5(num_samples=20):
    """Score T5 on the first ``num_samples`` rows of ``small_val_dataset``.

    Every row is formatted as "question: ... context: ...", answered with
    greedy decoding (at most 64 new tokens), and the decoded output is compared
    against the reference answers with the SQuAD metric.

    Returns:
        The result of ``squad_metric.compute`` for the collected predictions.
    """
    predictions = []
    references = []

    for idx in range(num_samples):
        record = small_val_dataset[idx]
        prompt = "question: " + record["question"] + " context: " + record["context"]

        encoded = t5_tokenizer(prompt, return_tensors="pt").to(device)
        generated = t5_model.generate(
            encoded["input_ids"],
            max_new_tokens=64,
            do_sample=False,
        )
        answer_text = t5_tokenizer.decode(generated[0], skip_special_tokens=True)

        # The running index doubles as the id that pairs prediction and reference
        sample_id = str(idx)
        predictions.append({"id": sample_id, "prediction_text": answer_text})
        references.append({"id": sample_id, "answers": record["answers"]})

    return squad_metric.compute(predictions=predictions, references=references)

print("T5 Evaluation:", evaluate_t5())


T5 Evaluation: {'exact_match': 85.0, 'f1': 90.11904761904762}
