# GPT с помощью модулей Hugging Face

Модули:
* transformers
* tokenizers
* datasets

Примеры использования https://github.com/huggingface/notebooks/tree/main/examples

The Hugging Face Course https://github.com/huggingface/course/tree/main

Сайт компании https://huggingface.co/

История Hugging Face https://ru.wikipedia.org/wiki/Hugging_Face

In [1]:
%pip --quiet install transformers tokenizers datasets accelerate

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Tokenizer vocabulary: target size passed to the BPE trainer
vocab_size = 5000

# Model architecture
block_size = 256  # maximum sequence length in tokens
n_embd = 384  # embedding width
n_head = 6  # number of attention heads
n_layer = 6  # number of transformer blocks

# Training: per-device batch size for both training and evaluation
batch_size = 128

# Загрузка и предобработка датасетов

In [3]:
from datasets import load_dataset, Dataset

# Load the "abobster/pushkin_new" dataset; the notebook output shows it has
# 'train' and 'test' splits with a single 'text' column.
raw_datasets = load_dataset("abobster/pushkin_new")

def preprocess(dataset):
    """Re-split a batch of text rows into one training sample per poem.

    The rows in ``dataset['text']`` are joined with newlines and the result is
    split on the ``</s>`` separator. Every fragment is stripped of surrounding
    whitespace and wrapped in the ``<BOS>`` / ``<EOS>`` markers.

    Fragments that are empty after stripping (for example the tail left by a
    trailing ``</s>``, or two separators in a row) are dropped so that no
    content-free ``<BOS><EOS>`` sample reaches the tokenizer or the model.

    Args:
        dataset: Mapping with a ``'text'`` key holding a list of strings.

    Returns:
        A dict with a single ``'text'`` key holding the list of wrapped poems.
        Its length may differ from the number of input rows.
    """
    corpus = '\n'.join(dataset['text'])
    fragments = (fragment.strip() for fragment in corpus.split('</s>'))
    return {'text': [f'<BOS>{poem}<EOS>' for poem in fragments if poem]}

# Apply `preprocess` to every split. A non-positive batch_size hands the whole
# split to the function as a single batch, so poems that span several rows are
# re-assembled before being split on the separator.
# Dataset.map reference:
# https://huggingface.co/docs/datasets/v2.14.5/en/package_reference/main_classes#datasets.Dataset.map
raw_datasets = raw_datasets.map(
    preprocess,
    batched=True,
    batch_size=-1,
)

raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 522
    })
    test: Dataset({
        features: ['text'],
        num_rows: 60
    })
})

# Обучение токенайзера

In [4]:
# https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb
from tokenizers import Tokenizer, trainers, models, pre_tokenizers, decoders, processors
from transformers import GPT2TokenizerFast

# Concatenate all training poems into one string used to fit the tokenizer.
text = '\n'.join(raw_datasets['train']['text'])

# Byte-level BPE tokenizer trained from scratch on the training split.
# The special tokens are listed first; the recorded outputs show <BOS> = 0,
# <EOS> = 1 and padding id 2.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=['<BOS>', '<EOS>', '<PAD>'])
tokenizer.train_from_iterator([text], trainer=trainer)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

# Wrap the trained tokenizer so that it can be used with the transformers API.
# NOTE(review): the recorded model has 5001 embeddings, one more than
# vocab_size - presumably the wrapper adds its default unk token; confirm.
new_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer, pad_token='<PAD>', bos_token='<BOS>', eos_token='<EOS>')

# Sanity check: encode a sample once, then show the ids and the text piece
# each id decodes to.
sample = 'Мороз и солнце\nдень чудесный\n'
sample_ids = tokenizer.encode(sample).ids
print(sample_ids)
print([tokenizer.decode([token_id]) for token_id in sample_ids])




[4053, 257, 168, 2424, 92, 150, 243, 3015, 206, 92]
['Мор', 'оз', ' и', ' солнце', '\n', 'д', 'ень', ' чудес', 'ный', '\n']


# Токенизация с разбиением на блоки

In [5]:
# Pad on the left, so the real tokens sit at the end of every chunk.
new_tokenizer.padding_side = "left"
# Left disabled in the original notebook: setting add_bos_token / add_eos_token
# to False on the tokenizer, and tokenizing the concatenation
# [''.join(batch['text'])] as a single text instead of poem by poem.


def tokenize_batch(batch):
    """Tokenize a batch of poems into chunks of at most ``block_size`` tokens."""
    return new_tokenizer(
        batch['text'],
        max_length=block_size,
        truncation=True,
        # Keep the part of a long poem that exceeds max_length as extra chunks.
        return_overflowing_tokens=True,
        # <BOS>/<EOS> are already part of the text produced by preprocessing.
        add_special_tokens=False,
        return_length=True,
        padding=True,
    )


tokenized_datasets = raw_datasets.map(
    tokenize_batch,
    batched=True,
    batch_size=1000000,
    remove_columns='text',
)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 969
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'length', 'overflow_to_sample_mapping'],
        num_rows: 105
    })
})

In [6]:
# Token count of every training chunk (the 'length' column produced by
# return_length=True during tokenization).
print(tokenized_datasets['train']['length'])

[256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256,

In [7]:
# Token ids of the first training chunk; because padding_side is "left", any
# padding appears at the start of the sequence.
print(tokenized_datasets['train']['input_ids'][0])

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 299, 1322, 195, 3210, 4, 402, 3474, 92, 2545, 682, 603, 568, 165, 810, 4, 92, 431, 4849, 259, 584, 4617, 4, 92, 431, 1894, 139, 377, 483, 595, 8, 92, 431, 1628, 974, 2222, 3223, 4, 92, 431, 1628, 3381, 535, 3472, 92, 176, 155, 138, 470, 1258, 157, 746, 3114, 4, 92, 344, 1566, 416, 771, 932, 180, 3573, 157, 6, 92, 1229, 1451, 195, 4128, 425, 1394, 92, 698, 1091, 1666, 169, 2829, 2155, 4, 92, 314, 296, 570, 143, 233, 1125, 4, 361, 233, 319, 4692, 4, 92, 176, 2959, 393, 2897, 485, 4749, 92, 176, 3080, 4, 227, 1239, 1476, 144, 145, 589, 92, 755, 319, 2091, 1283, 518, 4, 482, 1916, 92, 4681, 179, 219, 199, 145, 2527, 548, 301, 4, 92, 1229, 1451, 195, 3039, 3622, 146, 564, 92, 2804, 2059, 684, 178, 234, 284, 301, 92, 277, 443, 604, 3895, 388, 1082, 1081, 4, 92, 1012, 732, 388, 1694, 200, 3359, 3354, 4, 92, 745, 2063, 874, 3322, 1343, 2596, 3354, 4, 92, 745, 1756, 3334, 2248, 2177, 6, 92, 1505, 859, 181, 303, 619, 220, 561, 4, 92, 3146, 236, 233, 320, 9

# Модель GPT2

In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Config

# Use the GPU when one is available; otherwise fall back to the CPU so that
# the notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Same parameters as the previous model.
# GPT2Config reference:
# https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config
config = GPT2Config(
    # Size of the wrapped tokenizer rather than the raw vocab_size setting, so
    # the embedding table covers every id the tokenizer can produce (5001 in
    # the recorded run).
    vocab_size=len(new_tokenizer),
    bos_token_id=new_tokenizer.bos_token_id,
    eos_token_id=new_tokenizer.eos_token_id,
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
)

# GPT-2 model built from the config (not from a pretrained checkpoint).
model = GPT2LMHeadModel(config).to(device)

# Report the total parameter count in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 12.7M parameters


# Обучение модели

In [10]:
from transformers.trainer import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling, EarlyStoppingCallback


# TrainingArguments reference:
# https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.TrainingArguments
args = TrainingArguments(
    output_dir='.results',
    report_to='tensorboard',
    max_steps=10000,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate, log and checkpoint once per epoch; the best checkpoint is
    # restored when training finishes.
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# mlm=False selects causal (next-token) language modeling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(new_tokenizer, mlm=False)

# Stop early once the monitored metric has not improved for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Trainer reference:
# https://huggingface.co/docs/transformers/v4.34.0/en/main_classes/trainer#transformers.Trainer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=new_tokenizer,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    callbacks=[early_stopping],
)

trainer.train()

# Persist the final model next to the checkpoints.
trainer.save_model('.results/2_transformers')

Epoch,Training Loss,Validation Loss
1,5.888,5.956108
2,5.8384,5.929408
3,5.7994,5.901773
4,5.7705,5.890111
5,5.7387,5.865388
6,5.7089,5.841716
7,5.6801,5.834176
8,5.6527,5.82071
9,5.6275,5.799744
10,5.6033,5.797341


# Генерация

In [11]:
# To sample from the saved model instead of the one in memory, reload it first:
# model = GPT2LMHeadModel.from_pretrained('.results/2_transformers', local_files_only=True).to('cuda')

# No prompt is passed; the recorded sample starts from <BOS>.
# Generation reference:
# https://huggingface.co/docs/transformers/main_classes/text_generation
generation_options = dict(
    max_new_tokens=100,
    eos_token_id=new_tokenizer.eos_token_id,
    pad_token_id=new_tokenizer.pad_token_id,
    do_sample=True,
)
outputs = model.generate(**generation_options)

# Decode the first (and only) generated sequence back to text.
generated_text = new_tokenizer.decode(outputs[0])
print(generated_text)

<BOS>Вижу я тес! я в тишине;
С ним и в нему над темные,
И даже к ней —
Лит. ты!
В сердце в темной и я бивной
И кой меня в минуты
Тылает, с улыбкой в томной
Как я что же!
Пебомах…
Он я ли в минутенья
Но в немить с бо
Как собла с тем! Солла


# Параметры модели

In [12]:
# Display the module structure of the model (the notebook echoes the value of
# the last expression in a cell).
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(5001, 384)
    (wpe): Embedding(256, 384)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=384, out_features=5001, bias=False)
)

In [14]:
# Display the full configuration the model was built with.
model.config

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 256,
  "n_embd": 384,
  "n_head": 6,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "use_cache": true,
  "vocab_size": 5001
}