In [2]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
model.to("cuda") # Model loaded below first
print("Model moved to CUDA successfully.")
print(torch.cuda.memory_allocated(0) / (1024 ** 2))
print(torch.cuda.memory_reserved(0) / (1024 ** 2))

True
NVIDIA GeForce RTX 3050 Laptop GPU
Model moved to CUDA successfully.
487.46875
542.0


In [15]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("wikitext", "wikitext-2-v1") # was too large to train so just used for testing

In [16]:
total_size = sum(split.num_bytes for split in ds_builder.info.splits.values()) / (1024 ** 2)
print(f"Total dataset size: ~{total_size:.2f} MB")
print("Dataset description:", ds_builder.info.description)
print("\nFeatures:\n", ds_builder.info.features)
print("\nSplits:\n", list(ds_builder.info.splits.keys()))


Total dataset size: ~12.71 MB
Dataset description: 

Features:
 {'text': Value(dtype='string', id=None)}

Splits:
 ['test', 'train', 'validation']


In [3]:
from datasets import load_dataset_builder
ds_builder_alt = load_dataset_builder("stas/openwebtext-10k")

In [4]:
total_size = sum(split.num_bytes for split in ds_builder_alt.info.splits.values()) / (1024 ** 2)
print(f"Total dataset size: ~{total_size:.2f} MB")
print("Dataset description:", ds_builder_alt.info.description)
print("\nFeatures:\n", ds_builder_alt.info.features)
print("\nSplits:\n", list(ds_builder_alt.info.splits.keys()))

Total dataset size: ~47.37 MB
Dataset description: An open-source replication of the WebText dataset from OpenAI.

This is a small subset representing the first 10K records from the original dataset - created for testing.

The full 8M-record dataset is at https://huggingface.co/datasets/openwebtext


Features:
 {'text': Value(dtype='string', id=None)}

Splits:
 ['train']


In [17]:
ds_builder.download_and_prepare()
ds = ds_builder.as_dataset()

In [5]:
ds_builder_alt.download_and_prepare()
ds_alt = ds_builder_alt.as_dataset()

In [1]:
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

In [6]:
tokenizer.pad_token = tokenizer.eos_token
def tokenize(examples):
    return tokenizer(examples["text"], max_length=256, padding="max_length", truncation=True)

In [None]:
def tokenize_test(example): # for measuring performance
    tokens = tokenizer(
        example["text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    tokens["labels"] = tokens["input_ids"].copy()

In [25]:
tokenized_dataset = ds.map(tokenize_test, batched=True)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset_alt = ds_alt.map(tokenize, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
tokenized_dataset_alt = tokenized_dataset_alt["train"].train_test_split(test_size=0.2, seed=42, shuffle=True)

In [28]:
from sklearn.metrics import top_k_accuracy_score
import torch
import numpy as np
import math

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Shift so model predicts token t+1
    shift_logits = torch.tensor(logits)[..., :-1, :].contiguous()
    shift_labels = torch.tensor(labels)[..., 1:].contiguous()

    # Flatten the tensors
    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
    shift_labels = shift_labels.view(-1)

    # Mask out padding
    valid = shift_labels != -100
    y_true = shift_labels[valid].numpy()
    y_pred = shift_logits[valid].numpy()

    # Perplexity (cross-entropy)
    loss_fct = torch.nn.CrossEntropyLoss()
    loss = loss_fct(torch.tensor(y_pred), torch.tensor(y_true))
    perplexity = math.exp(loss.item())

    # Top-k accuracy
    topk_acc = top_k_accuracy_score(y_true, y_pred, k=5, labels=list(range(50257))) # GPT-2 vocab size is 50257

    return {
        "perplexity": perplexity,
        "top5_accuracy": topk_acc
    }

In [32]:
import torch
import gc

gc.collect()
torch.cuda.empty_cache()

In [10]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

training_args = TrainingArguments(
    output_dir="./NWP_results",
    eval_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_alt["train"],
    eval_dataset=tokenized_dataset_alt["test"],
    data_collator=data_collator,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,3.03341
2,3.131700,3.029742


TrainOutput(global_step=500, training_loss=3.13165234375, metrics={'train_runtime': 976.1486, 'train_samples_per_second': 16.391, 'train_steps_per_second': 0.512, 'total_flos': 2090336256000000.0, 'train_loss': 3.13165234375, 'epoch': 2.0})

In [11]:
model.save_pretrained("./gpt2-finetuned-nwp")
tokenizer.save_pretrained("./gpt2-finetuned-nwp")

('./gpt2-finetuned-nwp\\tokenizer_config.json',
 './gpt2-finetuned-nwp\\special_tokens_map.json',
 './gpt2-finetuned-nwp\\vocab.json',
 './gpt2-finetuned-nwp\\merges.txt',
 './gpt2-finetuned-nwp\\added_tokens.json',
 './gpt2-finetuned-nwp\\tokenizer.json')

In [13]:
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned-nwp")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-finetuned-nwp")
model.eval() # Set model to evaluation mode
model.to("cuda")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [33]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

test_ds = tokenized_dataset["test"].shuffle(seed=42).select(range(100))

training_args = TrainingArguments(output_dir="./dummy", per_device_eval_batch_size=1, fp16=True, eval_accumulation_steps=8)

trainer = Trainer(
    model=model,
    args=training_args,
)

# Predict on test set
outputs = trainer.predict(test_ds)

MemoryError: Unable to allocate 2.30 GiB for an array with shape (96, 128, 50257) and data type float32

In [29]:
assert outputs.predictions is not None, "Predictions are None"
assert outputs.label_ids is not None, "Label IDs are None"
metrics = compute_metrics((outputs.predictions, outputs.label_ids))
print("Evaluation Metrics:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 2553055600 bytes.

In [31]:
prompt = "I like watching"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(inputs["input_ids"], max_new_tokens=2)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I like watching the movies
