In [1]:
!pip install -U transformers datasets accelerate

Collecting datasets
  Downloading datasets-4.4.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.2-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.2 pya

In [2]:
import torch
# Report the installed PyTorch build and whether a CUDA device is visible, one per line.
print(torch.__version__, torch.cuda.is_available(), sep="\n")

2.9.0+cu126
True


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM  # type: ignore
from transformers import Trainer, TrainingArguments # type: ignore
from datasets import load_dataset # type: ignore
import numpy as np

In [27]:
model_name = "gpt2"

# The tokenizer has no dedicated padding token here, so end-of-sequence is reused as padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Keep the model config in agreement with the tokenizer's padding choice.
model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id

# Raw WikiText-2 splits from the Hugging Face hub.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")

In [32]:
def _has_text(example):
    """Return True when the example's "text" is present and not only whitespace."""
    text = example["text"]
    return text is not None and len(text.strip()) > 0


# Drop missing and blank lines from every split.
dataset = dataset.filter(_has_text)


Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [33]:
# Per-split (rows, columns) after the blank-line filter.
split_shapes = dataset.shape
print(split_shapes)
# TODO: drop the parts of the dataset that are not needed here

{'test': (2891, 1), 'train': (23767, 1), 'validation': (2461, 1)}


In [34]:
# Peek at one training example to see what the raw text looks like.
sample_row = dataset["train"][5]
print(sample_row["text"])

 As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text . The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked . The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player . Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs . Alongside the main story missions are character @-@ specific sub missions relating to different squad members . After the game 's completion , additional episodes 

In [39]:
def tokenize(batch):
    """Tokenize the "text" entries of ``batch``, truncating each to at most 128 tokens.

    Uses the notebook-level ``tokenizer`` and returns its output unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128)
    return encoded

# Tokenize every split in batches; the raw "text" column is dropped afterwards.
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])


Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

Map:   0%|          | 0/23767 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [40]:
def _has_tokens(example):
    """Return True when the example has at least one token id."""
    return len(example["input_ids"]) > 0


# Discard examples that tokenized to nothing, then have the dataset return torch tensors.
tokenized_dataset = tokenized_dataset.filter(_has_tokens)

tokenized_dataset.set_format(type="torch")

Filter:   0%|          | 0/2891 [00:00<?, ? examples/s]

Filter:   0%|          | 0/23767 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2461 [00:00<?, ? examples/s]

In [41]:
# Per-split (rows, columns) of the tokenized data.
tokenized_shapes = tokenized_dataset.shape
print(tokenized_shapes)

{'test': (2891, 2), 'train': (23767, 2), 'validation': (2461, 2)}


In [38]:
from transformers import DataCollatorForLanguageModeling

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [47]:
# Fine-tuning configuration. Evaluation, logging and checkpointing all run once per
# epoch so the checkpoint with the lowest eval_loss can be restored when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_strategy="epoch",
    save_strategy="epoch",
    # Mixed precision needs a CUDA device; fall back to full precision on CPU
    # instead of failing when no GPU is available.
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated in Trainer and triggers a warning;
    # `processing_class` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator
)

# Last expression on purpose: the notebook displays the returned TrainOutput.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.6535,3.466905
2,2.625,3.491901


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=5942, training_loss=2.63923131942107, metrics={'train_runtime': 838.3809, 'train_samples_per_second': 56.697, 'train_steps_per_second': 7.087, 'total_flos': 3092815798272000.0, 'train_loss': 2.63923131942107, 'epoch': 2.0})

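A large model like GPT-2 is already mildly overfitting from epoch 1, since our dataset is relatively small: in epoch 2 the training loss keeps falling (2.6535 → 2.6250) while the validation loss rises (3.4669 → 3.4919).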

In [48]:
import math

# Evaluate the trainer's current model on its configured eval dataset.
eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]

# Perplexity is reported as the exponential of the evaluation loss.
perplexity = math.exp(eval_loss)
for report_line in (
    f"Validation loss: {eval_loss:.4f}",
    f"Perplexity: {perplexity:.2f}",
):
    print(report_line)


Validation loss: 3.4669
Perplexity: 32.04


In [49]:
save_dir = "./gpt2_wikitext_finetuned"

# Write the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded with from_pretrained().
trainer.save_model(output_dir=save_dir)
tokenizer.save_pretrained(save_directory=save_dir)

('./gpt2_wikitext_finetuned/tokenizer_config.json',
 './gpt2_wikitext_finetuned/special_tokens_map.json',
 './gpt2_wikitext_finetuned/vocab.json',
 './gpt2_wikitext_finetuned/merges.txt',
 './gpt2_wikitext_finetuned/added_tokens.json',
 './gpt2_wikitext_finetuned/tokenizer.json')

In [50]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Prefer the GPU when one is visible, otherwise run inference on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload both artifacts from the directory written by the save step.
tokenizer = AutoTokenizer.from_pretrained(save_dir)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(save_dir)
model.eval()  # inference mode: disables dropout
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [51]:
prompt = "Artificial intelligence is"

# Encode the prompt and move the tensors to the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Sampling is stochastic: fix the RNG so re-running this cell on the same setup
# yields the same continuation.
torch.manual_seed(42)

outputs = model.generate(
    **inputs,
    max_new_tokens=20,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Artificial intelligence is being driven to produce artificial intelligence , by machines that can abstract their thoughts from their neural information and build


In [52]:
# Sampling is stochastic: fix the RNG so re-running this cell on the same setup
# yields the same continuation.
torch.manual_seed(42)

# Same prompt tensors (`inputs`) as the previous cell, now with nucleus sampling,
# a lower temperature and a repetition penalty.
outputs = model.generate(
    **inputs,
    max_new_tokens=40,
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Artificial intelligence is becoming more widespread , and AI can be used in a variety of applications . In particular it has been used to create customized products that are able perform tasks like diagnosis or search for diseases . 
 =
