In [None]:
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned-nwp-final")
tokenizer = AutoTokenizer.from_pretrained("./gpt2-finetuned-nwp-final")
model.eval() # Set model to evaluation mode

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [5]:
print(model.config)

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.53.0",
  "use_cache": true,
  "vocab_size": 50257
}



In [18]:
from datasets import load_dataset

ds = load_dataset("wikitext", "wikitext-2-v1")

def tokenize(examples):
    tokens = tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    tokens["labels"] = tokens["input_ids"][:]  # Creates a shallow copy
    return tokens

tokenized_dataset = ds.map(tokenize, batched=True)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [19]:
test_ds = tokenized_dataset["test"].shuffle(seed=42).select(range(10))  # Use a smaller subset for testing

In [20]:
from transformers import Trainer, TrainingArguments
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()

training_args = TrainingArguments(output_dir="./dummy", per_device_eval_batch_size=1, fp16=True, eval_accumulation_steps=2, remove_unused_columns=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
)

# Predict on test set
outputs = trainer.predict(test_ds)

In [21]:
from sklearn.metrics import top_k_accuracy_score
import torch
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    # Convert to torch tensors (on CPU)
    logits = torch.tensor(logits, dtype=torch.float32)
    labels = torch.tensor(labels, dtype=torch.long)

    # Shift logits and labels for next-token prediction
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    # Flatten
    shift_logits = shift_logits.view(-1, shift_logits.size(-1))
    shift_labels = shift_labels.view(-1)

    # Mask padding (if any)
    valid = shift_labels != -100
    y_true = shift_labels[valid].cpu().numpy()
    y_pred = shift_logits[valid].cpu().numpy()

    # Compute top-5 accuracy
    top5 = top_k_accuracy_score(y_true, y_pred, k=5, labels=np.arange(50257))

    return {"top5_accuracy": top5}

# Use after prediction
metrics = compute_metrics((outputs.predictions, outputs.label_ids))
print(f"Top-5 Accuracy: {metrics['top5_accuracy']:.4f}")


Top-5 Accuracy: 0.1996


In [None]:
from evaluate import load

perplexity = load("perplexity")

if "text" in test_ds.features:
    raw_test_texts = [t for t in test_ds["text"] if t.strip() != ""]
else:
    raise ValueError("Test dataset does not have a 'text' column")

# Avoid sending empty predictions
if not raw_test_texts:
    raise ValueError("No valid text samples found in test dataset.")

results = perplexity.compute(
    predictions=raw_test_texts,
    model_id="./gpt2-finetuned-nwp-final",
    device="cuda" if torch.cuda.is_available() else "cpu",
    batch_size=4
)

print("Raw results:", results)
if "perplexities" in results:
    print("Mean Perplexity:", sum(results["perplexities"]) / len(results["perplexities"]))
else:
    print("Unexpected output format:", results)


  0%|          | 0/2 [00:00<?, ?it/s]

Raw results: {'perplexities': [18.486928939819336, 5.389094352722168, 22.24803924560547, 23.929136276245117, 15.664121627807617, 21.0636043548584, 14.778168678283691], 'mean_perplexity': 17.365584782191686}
Mean Perplexity: 17.365584782191686
