# Тонкая настройка предобученной модели

In [None]:
# Install the Hugging Face stack used by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned; the recorded model.config output and the documentation
# links in later cells reference transformers 4.34.0 — consider pinning for reproducibility.
%pip --quiet install transformers tokenizers datasets accelerate

In [1]:
from datasets import load_dataset

# Fetch the "abobster/pushkin_new" dataset; the result is a DatasetDict keyed by split.
raw_datasets = load_dataset(path="abobster/pushkin_new")

# Leave the object as the cell's last expression so the notebook renders its summary.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19785
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1974
    })
})

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to fine-tune. Alternative checkpoints that can be swapped in:
#   'ai-forever/mGPT'
#   'ai-forever/rugpt3medium_based_on_gpt2'
#   'ai-forever/rugpt3large_based_on_gpt2'
model_name = 'ai-forever/rugpt3small_based_on_gpt2'

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token (presumably the checkpoint defines none — confirm).
tokenizer.pad_token = tokenizer.eos_token

# Total number of parameters, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"model_name: {model_name}")
print(f"model_size: {model_size/1000**2:.1f}M")

print(tokenizer)

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


model_name: ai-forever/rugpt3small_based_on_gpt2
model_size: 125.2M
GPT2TokenizerFast(name_or_path='ai-forever/rugpt3small_based_on_gpt2', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50257: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [3]:
# Display the loaded checkpoint's configuration; its n_ctx value is what tokenize()
# passes to the tokenizer as max_length.
model.config

GPT2Config {
  "_name_or_path": "ai-forever/rugpt3small_based_on_gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 2048,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.34.0",
  "use_cache": true,
  "vocab_size": 50264
}

In [5]:
def tokenize(element):
    """Convert a batch of text rows into token chunks for causal-LM training.

    Every row in ``element['text']`` is joined with newlines into one string, each
    ``</s>`` marker is replaced by ``<|endoftext|>``, and the resulting text is
    tokenized as a single sequence. The sequence is truncated to
    ``model.config.n_ctx`` tokens and the overflow is returned as additional rows.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        element: a batch mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        The tokenizer output for the merged text.
    """
    # Merge the rows into one text; '</s>' presumably separates poems in this dataset — confirm.
    corpus = '\n'.join(element['text']).strip()
    corpus = corpus.replace('</s>', '<|endoftext|>')

    # https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
    return tokenizer(
        [corpus],
        truncation=True,
        max_length=model.config.n_ctx,
        return_overflowing_tokens=True,
    )

# tokenize() merges every row of a batch into one text, so each split has to arrive as a
# single batch. batch_size=None asks `map` for exactly that regardless of dataset size,
# whereas a fixed large number would silently cut bigger datasets into several batches.
tokenized_datasets = raw_datasets.map(tokenize, batched=True, batch_size=None, remove_columns='text')

tokenized_datasets

Map:   0%|          | 0/19785 [00:00<?, ? examples/s]

Map:   0%|          | 0/1974 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 80
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 8
    })
})

In [None]:
from transformers import (
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

batch_size = 1

args = TrainingArguments(  # https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.TrainingArguments
    report_to='tensorboard',
    output_dir='results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Upper bound only; the early-stopping callback below is expected to end training sooner.
    max_steps=10000,
    use_cpu=False,
    # Evaluate, log and checkpoint on the same schedule (required by load_best_model_at_end).
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # Keep disk usage bounded: a checkpoint is written every epoch, and with
    # load_best_model_at_end the best checkpoint is retained in addition to the latest.
    save_total_limit=2,
    # Accumulate gradients over all training batches so there is roughly one optimizer
    # update per epoch; max(1, ...) keeps the value valid if batch_size exceeds the row count.
    gradient_accumulation_steps=max(1, tokenized_datasets['train'].num_rows // batch_size),
)

# Causal-LM collation (mlm=False): labels are derived from input_ids.
# NOTE(review): pad_token was set to the EOS token, and this collator ignores pad positions
# in the loss, so EOS targets are probably masked out too — confirm this is intended.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(  # https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.Trainer
    model,
    args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    # Stop once the monitored metric has failed to improve for 3 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()
trainer.save_model('4_fine_tuning')

In [8]:
prefix = 'О сколько нам открытий чудных'

# Place the prompt tensors on whatever device the model lives on instead of hard-coding 'cuda'.
inputs = tokenizer(prefix, return_tensors='pt').to(model.device)

# Make sure dropout is disabled for inference; the model may have been left in
# training mode by Trainer.train().
model.eval()

outputs = model.generate(  # https://huggingface.co/docs/transformers/main_classes/text_generation
    **inputs,
    do_sample=True,  # sample from the distribution rather than decoding greedily
    max_new_tokens=100,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

print(tokenizer.decode(outputs[0]))

О сколько нам открытий чудных!
Так легко ль было любить,
Когда не с грехами и добросердечным
Предводителем был ты.
И что было с тобой? Чем вдохновенны были?
Зачем ты здесь? Почему в сердце твоей
Он горит так же живо, как ты,
И не знает: его любить
Мне теперь боязно… Я хотел
И с тобою, мой друг, проститься;
Но ты знаешь: дружбы нет с любовницей,
Но между нами, по
