# Тонкая настройка предобученной модели

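Теперь соберём все части вместе: подготовим данные, загрузим и квантизуем предобученную модель, дообучим её с помощью LoRA и сгенерируем текст.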

In [1]:
%pip install --quiet transformers datasets peft bitsandbytes accelerate

Note: you may need to restart the kernel to use updated packages.


# Подготовка данных для обучения

In [2]:
from datasets import load_dataset, Dataset

# Load the "abobster/pushkin_new" dataset by its Hub identifier; the result is a
# DatasetDict with `train` and `test` splits, each holding a single `text` column.
raw_datasets = load_dataset("abobster/pushkin_new")

def preprocess(dataset):
    """Re-split a batch of raw text rows into whole poems wrapped in BOS/EOS markers.

    The rows in ``dataset['text']`` are joined with newlines, the joined text is
    split on the ``</s>`` end-of-poem marker, and every non-empty fragment is
    stripped of surrounding whitespace and wrapped as ``<s>...</s>``.

    Args:
        dataset: Batch mapping with a ``'text'`` entry holding a list of strings
            (the form ``map(..., batched=True)`` passes to the function).

    Returns:
        dict: ``{'text': [...]}`` with one wrapped string per poem. The number
        of poems may differ from the number of input rows.
    """
    fragments = '\n'.join(dataset['text']).split('</s>')
    # Text that ends with '</s>' (or contains only whitespace between two markers)
    # yields empty fragments; skip them so no content-free '<s></s>' sample is
    # added to the training data.
    poems = ['<s>' + poem + '</s>' for poem in map(str.strip, fragments) if poem]
    return {'text': poems}

# batch_size=-1 passes each split to preprocess() as one single batch, so a poem
# whose lines are spread over several rows is reassembled before being re-split.
raw_datasets = raw_datasets.map(preprocess, batched=True, batch_size=-1)

# Display the resulting splits and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 522
    })
    test: Dataset({
        features: ['text'],
        num_rows: 60
    })
})

# Загрузка и квантизация предобученной модели и токенайзера

In [3]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
import torch

# Imported here so the cell brings its own new dependency into scope.
from transformers import BitsAndBytesConfig

model_name = "ai-forever/ruGPT-3.5-13B"

# Load the base model with 8-bit weights (bitsandbytes). The quantization settings
# are passed through `quantization_config`; passing `load_in_8bit=True` directly to
# `from_pretrained` is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)

# LoRA adapters are attached only to the `c_attn` modules; the base weights stay frozen.
peft_config = LoraConfig(
    r=8,  # PEFT's default rank, spelled out; consistent with the 6,553,600 trainable parameters reported below
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["c_attn"],
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Total parameter count (base model + adapters), printed in millions
model_size = sum(t.numel() for t in model.parameters())
print(f"model_size: {model_size/1000**2:.1f}M")

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

trainable params: 6,553,600 || all params: 12,860,016,640 || trainable%: 0.05096105381089149
model_size: 12860.0M


In [4]:
from transformers import AutoTokenizer

# Same checkpoint name as the one the model was loaded from
model_name = "ai-forever/ruGPT-3.5-13B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Show which special tokens (bos/eos/unk/pad/mask) this tokenizer defines
print(tokenizer.special_tokens_map)

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'mask_token': '<mask>'}


# Токенизация с разбиением на блоки

In [5]:
tokenizer.padding_side = "right"

block_size = 256  # Larger blocks do not fit into GPU memory


def tokenize_into_blocks(batch):
    """Tokenize a batch of texts as one stream cut into blocks of at most `block_size` tokens.

    All texts of the batch are concatenated into a single string; the tokenizer
    truncates it to `block_size` tokens and returns the overflow as further rows.
    """
    joined_text = ''.join(batch['text'])
    return tokenizer(
        [joined_text],
        max_length=block_size,
        truncation=True,
        return_overflowing_tokens=True,
        add_special_tokens=False,
        return_length=True,
        padding=True,
    )


# The batch size is far larger than either split, so each split is tokenized as one batch.
tokenized_datasets = raw_datasets.map(
    tokenize_into_blocks,
    batched=True,
    batch_size=1000000,
    remove_columns='text',
)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 630
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 61
    })
})

# Дообучение модели

In [6]:
from transformers.trainer import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback

batch_size = 1  # A larger per-device batch does not fit into GPU memory

args = TrainingArguments(
    # Output and logging
    output_dir='.results',
    report_to='tensorboard',
    # Batching: batch_size sequences per forward pass, gradients accumulated over 128 passes
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=128,
    # Evaluate, log and checkpoint once per epoch; restore the best checkpoint at the end.
    # NOTE(review): newer transformers releases name this argument `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Upper bound on optimizer steps; early stopping (below) may end training sooner
    max_steps=10000,
    # Optimizer and learning-rate schedule
    optim="adamw_torch",
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    warmup_steps=30,
    # Numeric precision and compilation
    bf16=True,
    fp16=False,
    torch_compile=False,
)

# mlm=False selects causal language modeling rather than masked language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

trainer.save_model('.7_fine_tuning')

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
0,2.265,1.733189
1,1.7952,1.69809
2,1.767,1.654749
3,1.7285,1.615976
4,1.692,1.585894
5,1.6624,1.55052
6,1.6227,1.504128
7,1.5712,1.471451
8,1.5327,1.435055
9,1.5029,1.427919




# Генерация текста

In [7]:
from transformers import GenerationConfig

generation_config = GenerationConfig(
    # Special token ids are taken from the tokenizer
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # Length limit
    max_new_tokens=200,
    # Sampling settings
    do_sample=True,
    temperature=0.2,
    top_k=30,
    top_p=0.9,
    # Repetition control
    repetition_penalty=1.15,
    no_repeat_ngram_size=15,
)

# Write the generation settings into the same directory the fine-tuned model was saved to
generation_config.save_pretrained('.7_fine_tuning')

In [8]:
# Switch to inference mode first: if the model is still in training mode, dropout
# (including the LoRA dropout configured for the adapters) stays active while generating.
model.eval()

# No prompt is passed, so generation starts from the BOS token set in generation_config.
outputs = model.generate(generation_config=generation_config)

print(tokenizer.decode(outputs[0]))

<s>
И, внемля ей, утешен я душой.
Но ты, мой друг, всегда ли будешь рада
Моим советам? Не забудешь ль их?
Не слишком часто станешь им внимать?</s>
