# GPT с помощью модулей Huggingface

Модули:
* transformers
* tokenizers
* datasets

Примеры использования https://github.com/huggingface/notebooks/tree/main/examples

The Hugging Face Course https://github.com/huggingface/course/tree/main

Сайт компании https://huggingface.co/

История Huggingface https://ru.wikipedia.org/wiki/Hugging_Face

In [1]:
%pip --quiet install transformers tokenizers datasets accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Параметры модели
vocab_size=5000
block_size = 256
n_embd = 384
n_head = 6
n_layer = 6

# Загрузка датасетов

In [3]:
from datasets import load_dataset

raw_datasets = load_dataset("abobster/pushkin_new")

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19785
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1974
    })
})

# Обучение токенайзера

In [4]:
# https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb
from tokenizers import Tokenizer, trainers, models, pre_tokenizers, decoders, processors
from transformers import GPT2TokenizerFast

text = '\n'.join(raw_datasets['train']['text'])

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=['<|endoftext|>'])
tokenizer.train_from_iterator([text], trainer=trainer)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()
new_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='<|endoftext|>', bos_token='<|endoftext|>', eos_token='<|endoftext|>')

sample = 'Мороз и солнце\nдень чудесный\n'
print(tokenizer.encode(sample).ids)
print([tokenizer.decode([id]) for id in tokenizer.encode(sample).ids])





[4048, 254, 166, 2419, 90, 148, 240, 3010, 204, 90]
['Мор', 'оз', ' и', ' солнце', '\n', 'д', 'ень', ' чудес', 'ный', '\n']


# Токенизация с разбиением на блоки

In [5]:
# https://huggingface.co/learn/nlp-course/en/chapter7/6?fw=pt

def tokenize(element):
    poems = '\n'.join(element['text']).strip().split('</s>') # Объединение строк в стихотворения
    # poems = ['<|endoftext|>'.join(poems)]
    poems = [poem + '<|endoftext|>' for poem in poems]

    outputs = new_tokenizer(  # https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__
        poems,
        max_length=block_size,
        truncation=True,
        return_overflowing_tokens=True,
    )

    return outputs

tokenized_datasets = raw_datasets.map(tokenize, batched=True, batch_size=1000000, remove_columns='text')

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 975
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 105
    })
})

# Модель

In [6]:
from transformers import GPT2LMHeadModel, GPT2Config

# Параметры предыдущей модели
config = GPT2Config(  # https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config
    vocab_size=len(new_tokenizer),
    bos_token_id=new_tokenizer.bos_token_id,
    eos_token_id=new_tokenizer.eos_token_id,
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
)

# Модель GPT2
model = GPT2LMHeadModel(config).to('cuda')

model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 12.7M parameters


# Обучение

In [None]:
from transformers.trainer import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback

batch_size = 128

args = TrainingArguments(  # https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.TrainingArguments
    report_to='tensorboard', 
    output_dir='.results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end = True,
    max_steps=10000,
)

data_collator = DataCollatorForLanguageModeling(new_tokenizer, mlm=False)

trainer = Trainer(  # https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.Trainer
    model,  
    args, 
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'], 
    eval_dataset=tokenized_datasets['test'],
    callbacks=[EarlyStoppingCallback(3)],
)

trainer.train()

trainer.save_model('.results/2_transformers')

# Генерация

In [8]:
# model = GPT2LMHeadModel.from_pretrained('.results/2_transformers', local_files_only=True).to('cuda')

In [9]:
prefix = 'О сколько нам открытий чудных'

inputs = new_tokenizer(prefix, return_tensors='pt').to('cuda')

outputs = model.generate(  # https://huggingface.co/docs/transformers/main_classes/text_generation
    **inputs, 
    do_sample=True,
    max_new_tokens=100,
    eos_token_id=new_tokenizer.eos_token_id,
    pad_token_id=new_tokenizer.pad_token_id,
    )

print(new_tokenizer.decode(outputs[0]))

О сколько нам открытий чудных,
Оем к то, к ней и
То с ним на пирах и в ребные…
Гу в нас не пред ней?
«Все не я там на нем
С старца оберозом не так,
Он в окно ль
«Пут, не бо, за край! подавит:
Сосой, что ж вы;
А то, коптел,
Но с ним с ним, и
Кто
