# GPT с помощью модулей Hugging Face

Модули:
* transformers
* tokenizers
* datasets

Примеры использования: https://github.com/huggingface/notebooks/tree/main/examples

Курс Hugging Face (The Hugging Face Course): https://github.com/huggingface/course/tree/main

Сайт компании: https://huggingface.co/

История Hugging Face: https://ru.wikipedia.org/wiki/Hugging_Face

In [1]:
%pip --quiet install transformers tokenizers datasets accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Vocabulary parameters
vocab_size = 500  # vocabulary size requested from the BPE trainer

# Model parameters
block_size = 256  # tokens per training block; also passed to GPT2Config as n_positions / n_ctx
n_embd = 384  # passed to GPT2Config as n_embd
n_head = 6  # passed to GPT2Config as n_head
n_layer = 6  # passed to GPT2Config as n_layer
dropout = 0.2  # shared value for attn_pdrop, resid_pdrop and embd_pdrop

# Training parameters
batch_size = 128  # per-device batch size, used for both training and evaluation

# Загрузка и предобработка датасетов

In [3]:
from datasets import load_dataset, Dataset

# Load the "abobster/pushkin_new" dataset. The cells below rely on it having
# `train` and `test` splits with a `text` column.
raw_datasets = load_dataset("abobster/pushkin_new")

def preprocess(dataset):
    """Regroup raw text rows into whole poems wrapped in <BOS>/<EOS> markers.

    Meant to be used with ``Dataset.map(..., batched=True)``: the rows of the
    batch are glued back together with newlines and then cut on the ``</s>``
    separator, so the number of returned rows generally differs from the
    number of rows received.

    Args:
        dataset: batch mapping whose ``'text'`` entry is a list of strings.

    Returns:
        ``{'text': [...]}`` with one ``'<BOS>...<EOS>'`` string per poem.
        Segments that are empty after stripping (for example the one that
        follows a trailing ``</s>``) are dropped.
    """
    segments = '\n'.join(dataset['text']).split('</s>')
    stripped = (segment.strip() for segment in segments)
    # Skip empty segments: they would otherwise turn into content-free
    # '<BOS><EOS>' training samples.
    return {'text': ['<BOS>' + poem + '<EOS>' for poem in stripped if poem]}

# Apply preprocess to every split. batched=True hands preprocess lists of rows
# instead of single rows; batch_size=-1 is meant to deliver each split as one
# batch (the Dataset.map docs describe batch_size <= 0 that way), so poems are
# not cut at batch boundaries when preprocess joins the rows and re-splits
# them on '</s>'.
# Docs: https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/main_classes#datasets.Dataset.map
raw_datasets = raw_datasets.map(preprocess, batched=True, batch_size=-1)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 522
    })
    test: Dataset({
        features: ['text'],
        num_rows: 60
    })
})

# Обучение токенайзера

In [4]:
# https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb
from tokenizers import Tokenizer, trainers, models, pre_tokenizers, decoders, processors
from transformers import GPT2TokenizerFast

# Byte-level BPE tokenizer trained from scratch on the training split.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# The special tokens are handed to the trainer together with the requested
# vocabulary size.
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=['<BOS>', '<EOS>', '<PAD>'])
tokenizer.train_from_iterator(raw_datasets['train']['text'], trainer=trainer)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

# Wrap the trained tokenizer in a transformers tokenizer class and tell it
# which of the tokens play the pad / bos / eos roles.
new_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='<PAD>', bos_token='<BOS>', eos_token='<EOS>')

# Sanity check: print the ids of a sample and the text piece behind each id.
# The sample is encoded once and the loop variable no longer shadows the
# builtin `id`.
sample = 'Мороз и солнце\nдень чудесный\n'
sample_ids = tokenizer.encode(sample).ids
print(sample_ids)
print([tokenizer.decode([token_id]) for token_id in sample_ids])




[270, 178, 257, 168, 145, 171, 133, 190, 130, 92, 150, 243, 216, 251, 209, 206, 92]
['М', 'ор', 'оз', ' и', ' с', 'ол', 'н', 'ц', 'е', '\n', 'д', 'ень', ' ч', 'уд', 'ес', 'ный', '\n']


# Токенизация с разбиением на блоки

In [5]:
def tokenize_in_blocks(dataset):
    """Tokenize one batch of poems as a single stream cut into block_size pieces."""
    # All poems of the batch are concatenated before tokenizing, so the blocks
    # run across poem boundaries. Passing dataset['text'] instead of the
    # joined string would tokenize every poem separately.
    stream = ''.join(dataset['text'])
    return new_tokenizer(
        [stream],
        max_length=block_size,
        truncation=True,
        # Keep what does not fit into max_length as additional rows.
        return_overflowing_tokens=True,
        add_special_tokens=False,
        return_length=True,
        padding=True,
    )


# The original `text` column is removed because the number of rows changes.
tokenized_datasets = raw_datasets.map(
    tokenize_in_blocks,
    batched=True,
    batch_size=1000000,
    remove_columns='text',
)

tokenized_datasets

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 1106
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 106
    })
})

In [6]:
# Print the `length` column (requested via return_length=True) to check the
# size of every training block.
print(tokenized_datasets['train']['length'])

[256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256,

In [7]:
# Show the token ids of the first training block.
print(tokenized_datasets['train']['input_ids'][0])

[0, 299, 295, 400, 195, 165, 284, 255, 4, 402, 149, 378, 136, 200, 195, 92, 290, 286, 131, 149, 201, 363, 185, 246, 143, 165, 210, 131, 4, 92, 431, 358, 154, 413, 259, 248, 159, 476, 237, 200, 195, 4, 92, 431, 319, 154, 210, 139, 377, 483, 166, 217, 8, 92, 431, 469, 134, 183, 307, 162, 134, 130, 222, 174, 325, 144, 4, 92, 431, 469, 134, 249, 260, 134, 130, 158, 237, 132, 325, 144, 92, 176, 155, 138, 470, 167, 140, 155, 136, 157, 216, 175, 205, 157, 4, 92, 344, 183, 398, 416, 166, 135, 243, 160, 180, 133, 189, 157, 6, 92, 182, 209, 195, 135, 195, 146, 278, 161, 143, 425, 145, 247, 262, 92, 215, 150, 197, 153, 319, 205, 169, 165, 138, 228, 206, 187, 144, 150, 4, 92, 314, 296, 150, 161, 143, 233, 222, 366, 133, 143, 4, 361, 233, 319, 136, 144, 150, 4, 92, 176, 476, 138, 205, 393, 216, 228, 485, 163, 171, 211, 262, 92, 176, 150, 132, 134, 4, 227, 339, 150, 174, 144, 144, 145, 149, 208, 92, 182, 134, 319, 232, 258, 158, 133, 156, 305, 4, 482, 219, 272, 155, 208, 92, 277, 205, 154, 161, 179,

# Модель GPT2

In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config

# Parameters of the previous model (translated from the original comment; that
# model is not part of this file).
config = GPT2Config(  # https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config
    # len(new_tokenizer) rather than vocab_size: the model printout reports 501
    # embeddings, one more than the 500 requested from the BPE trainer.
    # NOTE(review): presumably GPT2TokenizerFast adds a default token — confirm.
    vocab_size=len(new_tokenizer),
    bos_token_id=new_tokenizer.bos_token_id,
    eos_token_id=new_tokenizer.eos_token_id,
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    attn_pdrop=dropout,
    resid_pdrop=dropout,
    embd_pdrop=dropout,
)

# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of failing on a hard-coded .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# GPT2 model with freshly initialised weights (built from the config only).
model = GPT2LMHeadModel(config).to(device)

# Total number of parameter elements, reported in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 10.9M parameters


# Обучение модели

In [9]:
from transformers.trainer import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback


# Training configuration.
# NOTE(review): no save_total_limit is set, so with save_strategy='epoch' every
# per-epoch checkpoint stays in output_dir — confirm the disk budget.
args = TrainingArguments(  # https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.TrainingArguments
    report_to='tensorboard', 
    output_dir='.results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate, log and save once per epoch.
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end = True,
    # Upper bound on training steps; the early-stopping callback below may end
    # training sooner.
    max_steps=10000,
)

# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(new_tokenizer, mlm=False)

# NOTE: this rebinds `trainer`, which held the BpeTrainer in the tokenizer cell.
trainer = Trainer(  # https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.Trainer
    model,  
    args, 
    data_collator=data_collator,
    tokenizer=new_tokenizer,
    train_dataset=tokenized_datasets['train'], 
    eval_dataset=tokenized_datasets['test'],
    # Early stopping with a patience of 2 evaluations.
    callbacks=[EarlyStoppingCallback(2)],
)

trainer.train()

# Persist the result to '.2_transformers'; the commented-out cell in the
# generation section reloads the model and the tokenizer from this directory.
trainer.save_model('.2_transformers')

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,5.8302,5.551425
2,5.4953,5.352473
3,5.3364,5.211414
4,5.2124,5.070385
5,5.0859,4.931642
6,4.9677,4.813374
7,4.8643,4.710827
8,4.773,4.624661
9,4.6919,4.551373
10,4.6229,4.487481


# Генерация

In [10]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# model = AutoModelForCausalLM.from_pretrained('.2_transformers', local_files_only=True).to('cuda')
# new_tokenizer = AutoTokenizer.from_pretrained('.2_transformers', local_files_only=True)

In [20]:
# Sample text from the model. No prompt is passed; the decoded output shown
# below the cell begins with <BOS>.
outputs = model.generate(  # https://huggingface.co/docs/transformers/main_classes/text_generation
    # Passed explicitly because the GPT2Config built above sets no pad token id.
    pad_token_id=new_tokenizer.pad_token_id,
    max_new_tokens=200,
    # Sample from the predicted distribution instead of always taking the most
    # likely token.
    do_sample=True,
    )

# Decode the first (and only) generated sequence back to text.
print(new_tokenizer.decode(outputs[0]))

<BOS>Вом потряходных
И пла раз речивет
Исли омалом, поднем,
Оспорозвозях отсницает
И Мароды пастом
И напире проходился;
И гробзабытный
И черины;
Но правистотовый ю,
Измисенной рек будетствеволько — этом.
Под соманный дал,
Солосходит.
Я жаряход клиста жел?
Я ретно, пиром
Те том нем с древрожу,
Преретной гластнимо дом
Душенная Голотови,
Постиц


# Параметры модели

In [12]:
# Display the module tree of the model.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(501, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.2, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.2, inplace=False)
          (resid_dropout): Dropout(p=0.2, inplace=False)
        )
        (ln_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.2, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=384, out_features=501, bias=False)
)

In [13]:
# Display the configuration stored on the model.
model.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.2,
  "bos_token_id": 0,
  "embd_pdrop": 0.2,
  "eos_token_id": 1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 256,
  "n_embd": 384,
  "n_head": 6,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.2,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.34.1",
  "use_cache": true,
  "vocab_size": 501
}