In [1]:
!pip install transformers==4.53.1 datasets evaluate --quiet


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2LMHeadModel
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import math 

In [3]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1") 

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad token for GPT-2 

In [5]:
def tokenize_function(example):
    return tokenizer(example["text"],  truncation=True, padding="max_length", max_length=64)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])   

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Causal language modeling for GPT-2
)

In [7]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  

Embedding(50257, 768)

In [8]:
training_args = TrainingArguments(
    output_dir="./results",

    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    logging_dir="./logs",
    save_total_limit=1,
    eval_strategy='epoch',
)  

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train() 

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,3.3347,3.489219




TrainOutput(global_step=9180, training_loss=3.454555469920173, metrics={'train_runtime': 49380.2548, 'train_samples_per_second': 0.744, 'train_steps_per_second': 0.186, 'total_flos': 1199265103872000.0, 'train_loss': 3.454555469920173, 'epoch': 1.0})

In [10]:
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")


Perplexity: 32.76


In [11]:
input_text = "Machine learning helps"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(input_ids, max_new_tokens=10, do_sample=True)
print("Input:", input_text)
print("Output:", tokenizer.decode(output[0]))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input: Machine learning helps
Output: Machine learning helps solve problems in complex domain models , where the theory
