***Fine-Tuning***

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import math
import torch

# 1. Вибір моделі та токенізатора
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [2]:
# 2. Підготовка датасету
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset("wh_fb.txt", tokenizer)



In [3]:
# 3. Датаколлатор для маскування
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [4]:
# 4. Параметри тренування
training_args = TrainingArguments(
    output_dir="./finetuned_gpt2",
    overwrite_output_dir=True,
    num_train_epochs=100,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=2e-5,
    logging_steps=200,
    no_cuda=False,
    fp16=True,
    report_to="none",
)


In [5]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset
)

In [6]:
# 6. Тренування
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
200,3.0447
400,2.3938
600,1.9942
800,1.6176
1000,1.2981
1200,1.0339
1400,0.8351
1600,0.6874
1800,0.5724
2000,0.4942


TrainOutput(global_step=3000, training_loss=1.0576979700724285, metrics={'train_runtime': 587.8391, 'train_samples_per_second': 41.338, 'train_steps_per_second': 5.103, 'total_flos': 1536397148160000.0, 'train_loss': 1.0576979700724285, 'epoch': 96.78688524590164})

In [7]:
# 7. Збереження моделі
trainer.save_model("./finetuned_gpt2")
tokenizer.save_pretrained("./finetuned_gpt2")

('./finetuned_gpt2\\tokenizer_config.json',
 './finetuned_gpt2\\special_tokens_map.json',
 './finetuned_gpt2\\vocab.json',
 './finetuned_gpt2\\merges.txt',
 './finetuned_gpt2\\added_tokens.json')

In [8]:
# 8. Оцінка Perplexity
def compute_perplexity(model, tokenizer, text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        perplexity = math.exp(loss.item())
    return perplexity

sample_text = "Once upon a time"
perplexity = compute_perplexity(model, tokenizer, sample_text)
print(f"Perplexity: {perplexity:.4f}")


Perplexity: 15.9479


In [9]:
# 9. Генерація тексту
def generate_text(prompt, model, tokenizer, max_length=100):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_p=0.8,
        temperature=0.8
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "Gotrek looks at Felix and says"
generated_text = generate_text(prompt, model, tokenizer)
print(f"Generated text:\n{generated_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated text:
Gotrek looks at Felix and says: ‘I wonder if luck has anything to do with it.’

‘Felix doesn—t have much to say, really. Just get on with things. ’ He glances around at the hall and thinks for a moment, then changes his mind. He changes the subject.
Snorri thinks the same. It is not unusual these days. Last month Gotrek and Snorri had a brief encounter in the Wast
