Установим необходимые библиотеки

In [None]:
!pip install transformers datasets accelerate -U

Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

Умпортируем зависимости

In [None]:
import pandas as pd
import random
import os
import torch
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

Загрузим датасет и разделим его на выборки

In [None]:
data_path = "./books.txt"

with open(data_path, 'r', encoding='utf-8') as file:
    data = file.readlines()

data = [line.strip() for line in data if line.strip()]

data_size = len(data)
print('Total lines: ', data_size)

val_size = round(data_size / 2)

random.seed(1234)

val_indices = random.sample(range(data_size), val_size)
train_data = [data[i] for i in range(data_size) if i not in val_indices]
val_data = [data[i] for i in val_indices]

with open("train.txt", 'w', encoding='utf-8') as file:
    file.write("\n".join(train_data))

with open("valid.txt", 'w', encoding='utf-8') as file:
    file.write("\n".join(val_data))

Total lines:  865


Создаём и настраиванем токенизатор. Форматируем датасет

In [None]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained('gpt2')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

train_texts = open('train.txt').read().splitlines()
val_texts = open('valid.txt').read().splitlines()

train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

train_dataset = train_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)
val_dataset = val_dataset.map(lambda examples: {'labels': examples['input_ids']}, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/433 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

Map:   0%|          | 0/433 [00:00<?, ? examples/s]

Map:   0%|          | 0/432 [00:00<?, ? examples/s]

Настраиваем тренер

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

Запускаем тренировку модели и сохраняем её

In [7]:
trainer.train()

model.save_pretrained('./gpt2_model')
tokenizer.save_pretrained('./gpt2_model')

Epoch,Training Loss,Validation Loss
1,No log,0.661564


Epoch,Training Loss,Validation Loss
1,No log,0.661564
2,No log,0.640784
3,No log,0.634226


('./gpt2_model/tokenizer_config.json',
 './gpt2_model/special_tokens_map.json',
 './gpt2_model/vocab.json',
 './gpt2_model/merges.txt',
 './gpt2_model/added_tokens.json')

Загружаем токенизатор и модель. Генерируем текст

In [12]:
model_path = "./gpt2_model"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to("cpu")

def generate_text(prompt, max_length=100, temperature=0.7, top_k=50, top_p=0.5):
    inputs = tokenizer.encode(prompt, return_tensors="pt").to("cpu")
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

prompt = "Привет"
print(generate_text(prompt))

Привет в призвет в призвет в призвет в призвет в призвет в призвет в призвет в призвет в пр
