# Тонкая настройка предобученной модели

Следующий код написан на основе туториала: https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/

In [1]:
%pip install --quiet transformers datasets peft bitsandbytes accelerate

Note: you may need to restart the kernel to use updated packages.


# Подготовка данных для обучения

In [2]:
from datasets import load_dataset, Dataset

# Fetch the "abobster/pushkin_new" dataset with `datasets.load_dataset`. The
# DatasetDict displayed at the end of this cell has `train` and `test` splits
# with a single `text` column.
raw_datasets = load_dataset("abobster/pushkin_new")

def preprocess(dataset):
    """Regroup a batch of text rows into one ``<s>...</s>`` string per poem.

    The rows of ``dataset['text']`` are joined with newlines and the result is
    split on the ``</s>`` terminator; each fragment is stripped of surrounding
    whitespace and wrapped in explicit ``<s>`` / ``</s>`` markers.

    Fragments that are empty after stripping are dropped. This matters because
    text ending in ``</s>`` always leaves an empty trailing fragment, which
    would otherwise become a meaningless ``'<s></s>'`` training example.

    Args:
        dataset: A batch mapping with a ``'text'`` key holding a list of
            strings, as passed by ``map(..., batched=True)``. Presumably each
            row is one line of verse -- confirm against the dataset.

    Returns:
        ``{'text': poems}`` where ``poems`` holds one wrapped string per
        non-empty poem. The number of rows may differ from the input.
    """
    fragments = '\n'.join(dataset['text']).split('</s>')
    stripped = (fragment.strip() for fragment in fragments)
    # Skip empty fragments (trailing '</s>', doubled '</s></s>', empty batch).
    poems = ['<s>' + body + '</s>' for body in stripped if body]
    return {'text': poems}

# Apply `preprocess` to every split in one batch: per the `Dataset.map` docs,
# `batch_size <= 0` hands the whole split to the function at once.
# https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/main_classes#datasets.Dataset.map
raw_datasets = raw_datasets.map(
    preprocess,
    batched=True,
    batch_size=-1,
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 522
    })
    test: Dataset({
        features: ['text'],
        num_rows: 60
    })
})

# Загрузка и квантование предобученной модели и токенизатора

In [3]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM
import torch

# Checkpoint to fine-tune.
model_name = "ai-forever/ruGPT-3.5-13B"

# Load the base model with 8-bit weights (`load_in_8bit=True`) and bfloat16
# as the requested torch dtype.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    load_in_8bit=True,
)

# LoRA adapters are attached to the `c_attn` modules only (presumably the
# attention projection of a GPT-2-style model -- confirm for this checkpoint).
# The rank `r` is not set here, so the PEFT default applies.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["c_attn"],
    lora_alpha=16,
    lora_dropout=0.1,
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Total parameter count (base + adapters), printed in millions.
model_size = sum(param.numel() for param in model.parameters())
print(f"model_size: {model_size/1000**2:.1f}M")

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

trainable params: 6,553,600 || all params: 12,860,016,640 || trainable%: 0.05096105381089149
model_size: 12860.0M


In [4]:
from transformers import AutoTokenizer

model_name = "ai-forever/ruGPT-3.5-13B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the unknown token as the padding token; the map printed below shows
# both as '<|endoftext|>'.
tokenizer.pad_token = tokenizer.unk_token
# NOTE: the tokenization cell further down switches this to "left".
tokenizer.padding_side = "right"

# Ask the tokenizer not to add BOS/EOS on its own.
for flag_name in ("add_bos_token", "add_eos_token"):
    setattr(tokenizer, flag_name, False)

print(tokenizer.special_tokens_map)

{'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'mask_token': '<mask>'}


# Токенизация с разбиением на блоки

In [5]:
tokenizer.padding_side = "left"

block_size = 256  # anything larger does not fit (presumably in GPU memory)


def tokenize_into_blocks(batch):
    """Tokenize a whole split as one token stream cut into fixed-size blocks.

    All strings in ``batch['text']`` are concatenated into a single string so
    that blocks are packed back to back; passing the rows separately would
    instead start a new block for every row. ``return_overflowing_tokens``
    makes the tokenizer return every ``block_size``-token chunk instead of
    discarding what is cut off by truncation, and ``padding=True`` pads the
    shorter final chunk.

    Args:
        batch: A batch mapping with a ``'text'`` list of strings, as passed by
            ``map(..., batched=True)``.

    Returns:
        The tokenizer output; the result displayed below has ``input_ids``,
        ``attention_mask``, ``length`` and ``overflow_to_sample_mapping``.
    """
    return tokenizer(
        [''.join(batch['text'])],
        max_length=block_size,
        truncation=True,
        return_overflowing_tokens=True,
        # The <s> / </s> markers are already written into the text itself.
        add_special_tokens=False,
        return_length=True,
        padding=True,
    )


# batch_size=-1 passes each split to the function as a single batch, the same
# convention as the preprocessing `map` call. A fixed large number would stop
# packing the whole split into one stream once the split outgrew it.
tokenized_datasets = raw_datasets.map(
    tokenize_into_blocks,
    batched=True,
    batch_size=-1,
    remove_columns='text',
)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 630
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 61
    })
})

# Дообучение модели

In [6]:
from transformers.trainer import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback

batch_size = 1  # anything larger does not fit (presumably in GPU memory)

# Apart from report_to, output_dir, load_best_model_at_end and max_steps, the
# values are taken from
# https://github.com/IlyaGusev/rulm/blob/master/self_instruct/configs/gigasaiga_13b.json
args = TrainingArguments(
    # Outputs and reporting.
    output_dir='.results',
    report_to='tensorboard',
    # Batching: 1 sequence per device step, accumulated over 128 steps.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=128,
    # Optimisation schedule. max_steps is only an upper bound: the
    # early-stopping callback below is expected to end training sooner.
    max_steps=10000,
    learning_rate=0.0003,
    lr_scheduler_type="cosine",
    warmup_steps=30,
    optim="adamw_torch",
    # Numeric precision / compilation.
    bf16=True,
    fp16=False,
    torch_compile=False,
    # Evaluate and checkpoint every 10 steps, log every 5; reload the best
    # checkpoint when training ends.
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=10,
    logging_steps=5,
    load_best_model_at_end=True,
)

# Causal-LM collation (mlm=False), i.e. no masked-language-model objective.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    # Early stopping with a patience of 3 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

trainer.train()

trainer.save_model('.7_fine_tuning')

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
10,1.8319,1.744358
20,1.7811,1.658523
30,1.6893,1.593784
40,1.6258,1.498931
50,1.5575,1.469339
60,1.5247,1.455569
70,1.4686,1.454757
80,1.4274,1.461012
90,1.3856,1.465868
100,1.3616,1.473294




# Генерация текста

In [33]:
from transformers import GenerationConfig

# Generation starts from the bare beginning-of-poem marker.
prefix = '<s>'

# Sampling settings: low temperature with top-k / top-p filtering, plus a
# repetition penalty and a ban on repeating any 15-gram.
generation_config = GenerationConfig(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    max_new_tokens=100,
    no_repeat_ngram_size=15,
    repetition_penalty=1.15,
    temperature=0.2,
    top_k=30,
    top_p=0.9,
)

# Training may leave the model in train mode, in which the LoRA dropout
# (lora_dropout=0.1) stays active and perturbs sampling; switch to inference
# mode explicitly.
model.eval()

# Put the inputs on whatever device the model actually lives on instead of
# assuming 'cuda'.
inputs = tokenizer(prefix, return_tensors='pt').to(model.device)

outputs = model.generate(
    **inputs,
    generation_config=generation_config,
)

print(tokenizer.decode(outputs[0]))

<s>
И, как поэт, я не могу
Без слез оставить это дело.
Я знаю: век уж мой измерен;
Но чтоб продлилась жизнь моя,
Я утром должен быть уверен,
Что с вами днем увижусь я…</s>
