In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install transformers datasets evaluate



In [3]:
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch
import math
import evaluate

2025-07-09 14:26:12.781344: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752071172.804473     166 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752071172.811557     166 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Load the raw WikiText-2 configuration and show which splits it provides
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [5]:
# GPT-2 tokenizer; the second line reuses the end-of-sequence token as the padding token,
# so no new token is added to the vocabulary by this cell.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of a batch.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by a batched ``map`` call.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)

# Tokenize every split in batches and drop the raw "text" column afterwards
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)

In [7]:
# Inspect the tokenized test split (bare expression, so the notebook displays its repr)
tokenized_datasets["test"]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4358
})

In [8]:
block_size = 128  # number of tokens per training example


def group_texts(examples):
    """Concatenate a batch of tokenized rows and re-cut it into fixed-size blocks.

    Every feature in ``examples`` (``input_ids``, ``attention_mask``, ...) is
    flattened into one long list, truncated to a multiple of ``block_size``
    (the remainder at the end of the batch is dropped) and split into
    consecutive chunks of ``block_size`` items.

    Args:
        examples: Mapping from feature name to a list of per-row lists. It must
            contain ``"input_ids"``, whose flattened length decides the cut-off.

    Returns:
        dict with the same keys as ``examples``, each a list of
        ``block_size``-long chunks, plus ``"labels"`` holding a copy of the
        ``"input_ids"`` chunk list.
    """
    # Flatten in linear time; ``sum(rows, [])`` re-copies the accumulator for
    # every row and is quadratic in the batch length.
    concatenated = {
        key: [token for row in examples[key] for token in row]
        for key in examples.keys()
    }

    # Keep only whole blocks.
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size

    # Split each feature into chunks
    result = {
        key: [tokens[start:start + block_size] for start in range(0, total_length, block_size)]
        for key, tokens in concatenated.items()
    }

    # Duplicate input_ids to labels
    result["labels"] = result["input_ids"].copy()
    return result


# Re-chunk every tokenized split into fixed-length blocks
lm_datasets = tokenized_datasets.map(
    function=group_texts,
    batched=True,
)

In [9]:
# Start from the pretrained GPT-2 checkpoint
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Size the embedding matrix to the tokenizer's vocabulary. The pad token reuses the existing
# EOS token, so this is not expected to add rows — NOTE(review): confirm the call is needed.
model.resize_token_embeddings(new_num_tokens=len(tokenizer))

Embedding(50257, 768)

In [10]:
# Batch collator for language modeling with masked-LM masking switched off (mlm=False)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [15]:
# Trainer configuration, grouped by concern
training_args = TrainingArguments(
    # where artifacts and logs go
    output_dir="./gpt2-nextword-wikitext2",
    logging_dir="./logs",
    report_to="none",
    # which phases are enabled
    do_train=True,
    do_eval=True,
    # batch sizes and schedule length
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    # optimizer settings
    learning_rate=5e-5,
    weight_decay=0.01,
    # logging and checkpoint cadence
    logging_steps=100,
    save_steps=500,
    save_total_limit=1,
)

In [16]:
# Load the `perplexity` metric module through the evaluate library
# NOTE(review): perplexity_metric is not referenced by any other visible cell; perplexity is
# computed by hand in compute_metrics — confirm whether this load is still wanted.
perplexity_metric = evaluate.load(path="perplexity", module_type="metric")

# Custom top-k accuracy
def top_k_accuracy(logits, labels, k=5, ignore_index=-100):
    """Return top-k accuracy for next-token prediction.

    The logits at position ``n - 1`` are scored against the label at position
    ``n``. Positions whose label equals ``ignore_index`` are excluded from the
    average, matching the ``ignore_index`` used for the cross-entropy loss;
    previously they were counted as misses.

    Args:
        logits: Tensor indexed as ``[..., position, vocab]``.
        labels: Integer tensor indexed as ``[..., position]``.
        k: Number of highest-scoring candidates that count as a hit.
        ignore_index: Label value marking positions to skip.

    Returns:
        Fraction of scored positions whose label is among the top ``k``
        predictions, as a Python float; ``nan`` when no position is scored.
    """
    # Shift so that tokens < n are predicting token n
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    valid = shift_labels != ignore_index
    if not valid.any():
        # Nothing to score: be explicit rather than averaging an empty tensor.
        return float("nan")

    # Get top-k predictions and test whether the label is among them
    top_k = torch.topk(shift_logits, k, dim=-1).indices
    correct = (top_k == shift_labels.unsqueeze(-1)).any(dim=-1)
    return correct[valid].float().mean().item()

# Custom compute_metrics function for Trainer
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    logits = torch.tensor(logits)
    labels = torch.tensor(labels)

    # Perplexity
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    perplexity = math.exp(loss.item())

    # Top-k accuracy
    top5_acc = top_k_accuracy(logits, labels, k=5)

    return {"perplexity": perplexity, "top-5 accuracy": top5_acc}

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [18]:
# Run fine-tuning with the settings in training_args; the recorded output is a TrainOutput summary
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,1.951
200,1.8443
300,1.86
400,1.8483
500,1.8309
600,1.8055
700,1.8302
800,1.8239
900,1.8036
1000,1.8321


TrainOutput(global_step=4667, training_loss=1.7658229604737008, metrics={'train_runtime': 619.4464, 'train_samples_per_second': 30.135, 'train_steps_per_second': 7.534, 'total_flos': 1219384590336000.0, 'train_loss': 1.7658229604737008, 'epoch': 1.0})

In [40]:
# Persist the model to a local directory (only the model; the tokenizer is not saved by this call)
model.save_pretrained("gpt2-wikitext-trial-1")

In [41]:
# Reload the model from the directory written by the save_pretrained cell.
# NOTE(review): this only rebinds the name `model`; `trainer` was constructed with the earlier
# model object, so later trainer.evaluate() calls do not use this reloaded instance.
model = GPT2LMHeadModel.from_pretrained("gpt2-wikitext-trial-1")

In [42]:
# Evaluate on the eval_dataset the Trainer was constructed with (the validation split).
# NOTE(review): the output recorded below for this cell is a CUDA out-of-memory error.
trainer.evaluate()

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/gpt2/modeling_gpt2.py", line 1238, in forward
    loss = self.loss_function(
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/loss/loss_utils.py", line 64, in ForCausalLMLoss
    loss = fixed_cross_entropy(logits, shift_labels, num_items_in_batch, ignore_index, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/loss/loss_utils.py", line 36, in fixed_cross_entropy
    loss = nn.functional.cross_entropy(source, target, ignore_index=ignore_index, reduction=reduction)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/functional.py", line 3494, in cross_entropy
    return torch._C._nn.cross_entropy_loss(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 50.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 42.12 MiB is free. Process 28529 has 14.70 GiB memory in use. Of the allocated memory 14.12 GiB is allocated by PyTorch, and 122.40 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [44]:
# Smoke-test the evaluation path on just the first two validation blocks;
# metrics from two rows are not representative of the full split.
small_eval_dataset = lm_datasets["validation"].select(indices=range(2))
trainer.evaluate(eval_dataset=small_eval_dataset)

{'eval_loss': 2.6255345344543457,
 'eval_perplexity': 13.811846509664377,
 'eval_top-5 accuracy': 0.6811023354530334,
 'eval_runtime': 0.2382,
 'eval_samples_per_second': 8.395,
 'eval_steps_per_second': 4.197,
 'epoch': 1.0}