<a href="https://colab.research.google.com/github/sportcman/Gpt/blob/main/Colab_maximum_Gpt_TPU_v_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive so the tokenizer, dataset and model under /gdrive/MyDrive are reachable.
from google.colab import drive
drive.mount('/gdrive')
# IPython magic: make the mount point the notebook's working directory.
%cd /gdrive

Mounted at /gdrive
/gdrive


In [None]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])


In [None]:
pip install torch-xla

In [None]:
pip install accelerate -U

In [None]:
pip install torch torchvision torchaudio

In [None]:
pip install tqdm

**`Создание модели (оптимальная конфигурация).`**

In [None]:
from transformers import GPT2Tokenizer, GPT2Config, GPT2LMHeadModel

# Build a randomly initialised GPT-2 model sized for the project tokenizer and save
# both to Drive so the training cell can load them with from_pretrained().
tokenizer = GPT2Tokenizer.from_pretrained('/gdrive/MyDrive/TrenerGpt/tokenizer')

model_config = GPT2Config(
    # len(tokenizer) also counts added tokens, so no embedding resize is needed afterwards.
    vocab_size=len(tokenizer),
    n_layer=22,
    n_head=32,
    # The original passed both n_embd=2048 and hidden_size=1536. In GPT2Config
    # `hidden_size` is an alias of `n_embd` and was applied last, so the model was
    # actually built with width 1536. That effective value is kept; change it here
    # (it must stay divisible by n_head) if 2048 was intended.
    n_embd=1536,
    # `max_position_embeddings` is the GPT2Config alias of `n_positions`.
    n_positions=3072,
    # NOTE: `intermediate_size` is not a GPT-2 option and was ignored (the MLP width
    # defaults to 4 * n_embd). Pass `n_inner=3072` if a narrower MLP was intended.
    # NOTE: `gradient_checkpointing` in the config is deprecated and is not persisted
    # by save_pretrained(); call model.gradient_checkpointing_enable() in the training
    # code instead if it is needed.
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    sep_token_id=tokenizer.sep_token_id,
    use_cache=True,
    layer_norm_epsilon=1e-5,
    initializer_range=0.02,
    output_attentions=False,
    output_hidden_states=False,
    tie_word_embeddings=True,
)
model = GPT2LMHeadModel(config=model_config)

# Model and tokenizer share one directory so a single path restores both.
model.save_pretrained('/gdrive/MyDrive/TrenerGpt/model')
tokenizer.save_pretrained('/gdrive/MyDrive/TrenerGpt/model')


Обучение на TPU v2

In [None]:
import sys
import argparse
import os
import torch
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

class CustomTextDataset(Dataset):
    """Line-per-example text dataset tokenised to fixed-length id sequences.

    Every non-empty, non-whitespace line of ``file_path`` becomes one example.
    Lines are encoded in a single ``tokenizer.batch_encode_plus`` call with
    truncation and ``padding="max_length"`` at ``block_size``.
    """

    def __init__(self, file_path, tokenizer, block_size):
        """Read ``file_path`` (UTF-8) and tokenise its usable lines.

        Args:
            file_path: Path to the text file, one training example per line.
            tokenizer: Object exposing ``batch_encode_plus``.
            block_size: Length every encoded example is truncated/padded to.
        """
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_lines = handle.read().splitlines()

        # Keep only lines that contain something other than whitespace.
        usable = [text for text in raw_lines if text and not text.isspace()]

        encoded = tokenizer.batch_encode_plus(
            usable,
            add_special_tokens=True,
            max_length=block_size,
            truncation=True,
            padding="max_length",
        )
        self.examples = encoded["input_ids"]

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return example ``i`` as a ``torch.long`` tensor of token ids."""
        token_ids = self.examples[i]
        return torch.tensor(token_ids, dtype=torch.long)

def train_model(index, model, dataset_path, tokenizer, device, batch_size, epochs, model_path):
    """Fine-tune ``model`` on a line-per-example text file and save the result.

    Can be called directly (CPU/GPU) or used as the per-process entry point of
    ``xmp.spawn`` (TPU), which passes the process index as the first argument.

    Args:
        index: Process index supplied by ``xmp.spawn``; not used by the body.
        model: Causal language model to train (called with ``labels=``).
        dataset_path: Text file read by ``CustomTextDataset``.
        tokenizer: Tokenizer used to encode the data; saved next to the model.
        device: Target device. If its type is ``"xla"`` the function binds to
            this process's own XLA device instead of using the object as passed.
        batch_size: Per-process batch size.
        epochs: Number of passes over the (per-process) data.
        model_path: Directory the trained model and tokenizer are written to.
    """
    # Local import keeps this block independent of the file-level import list.
    from torch.utils.data.distributed import DistributedSampler

    device_type = device.type if isinstance(device, torch.device) else torch.device(device).type
    use_xla = device_type == "xla"

    if use_xla:
        import torch_xla.runtime as xr

        # Every spawned process must acquire its own core; a device object
        # created in the parent process is not reliable inside a worker.
        device = xm.xla_device()
        world_size = xr.world_size()
        rank = xr.global_ordinal()
        is_master = xm.is_master_ordinal()
    else:
        world_size, rank, is_master = 1, 0, True

    dataset = CustomTextDataset(dataset_path, tokenizer, block_size=128)

    # With several replicas, give each one a disjoint shard; otherwise every
    # process would iterate over the full dataset.
    sampler = None
    if world_size > 1:
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank, shuffle=True)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=(sampler is None), sampler=sampler)

    # Move the model BEFORE building the optimizer: moving a module to an XLA
    # device replaces its Parameter objects, so an optimizer built earlier
    # would keep updating the stale CPU copies.
    model.to(device)
    model.train()

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that class's defaults to keep the same update rule.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=len(data_loader) * epochs
    )

    pad_id = tokenizer.pad_token_id

    for epoch in range(epochs):
        if sampler is not None:
            # Reshuffle the shards differently each epoch.
            sampler.set_epoch(epoch)

        # Only the master process draws a progress bar to avoid interleaved output.
        epoch_progress = tqdm(data_loader, desc=f"Epoch {epoch+1}/{epochs}", disable=not is_master)
        for batch in epoch_progress:
            inputs = batch.to(device)
            if pad_id is not None:
                # Examples are padded to a fixed length: hide padding from
                # attention and exclude it from the loss (-100 is ignored).
                # If pad and EOS share an id, literal EOS tokens are masked too.
                is_pad = inputs == pad_id
                attention_mask = (~is_pad).long()
                labels = inputs.masked_fill(is_pad, -100)
            else:
                attention_mask = None
                labels = inputs

            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            if use_xla:
                # barrier=True is required because no XLA parallel loader is
                # used to issue the step barrier.
                xm.optimizer_step(optimizer, barrier=True)
            else:
                optimizer.step()
            scheduler.step()
            epoch_progress.set_postfix(loss=loss.item())

        if is_master:
            print(f"Epoch {epoch+1}/{epochs} completed.")

    if is_master:
        if use_xla:
            # Save from host memory; serialising XLA-resident weights directly
            # is not assumed to be supported by save_pretrained.
            model.to("cpu")
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

def main():
    """Parse options, load the model and tokenizer, and start training.

    Options:
        --num_epochs: number of training epochs (default 1).
        --dataset_path: path to the line-per-example training text file.
        --tpu: train on TPU through ``xmp.spawn``; otherwise CUDA if available,
            else CPU. The flag defaults to False, so it is only active when
            ``--tpu`` is actually present in ``sys.argv``.

    If the saved model cannot be loaded from ``model_path``, the base ``gpt2``
    checkpoint is used instead.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_epochs', type=int, default=1)  # number of epochs
    parser.add_argument('--dataset_path', type=str, default="/gdrive/MyDrive/TrenerGpt/dataset.txt")
    parser.add_argument('--tpu', action='store_true', default=False)  # use the TPU as the device
    # parse_known_args tolerates the extra arguments a notebook kernel is started with.
    args, _ = parser.parse_known_args()

    dataset_path = args.dataset_path
    model_path = "/gdrive/MyDrive/TrenerGpt/model"
    num_epochs = args.num_epochs
    batch_size = 256  # should account for the number of CPU cores and the available RAM

    if args.tpu:
        # Do not call xm.xla_device() here: touching the XLA runtime in the
        # parent before xmp.spawn is rejected by current PyTorch/XLA releases.
        # A bare "xla" device only names the backend; each worker resolves it
        # to its own core.
        device = torch.device("xla")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    try:
        tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        model = GPT2LMHeadModel.from_pretrained(model_path)
    except Exception as e:
        print(f"Error loading model from {model_path}, loading GPT-2 base model instead. Error: {e}")
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2')

    # The dataset pads every example to a fixed length, which fails without a
    # pad token (the stock "gpt2" tokenizer has none). Reuse EOS in that case.
    if tokenizer.pad_token is None and tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id

    if args.tpu:
        xmp.spawn(train_model, args=(model, dataset_path, tokenizer, device, batch_size, num_epochs, model_path), nprocs=8, start_method='fork')
    else:
        train_model(0, model, dataset_path, tokenizer, device, batch_size, num_epochs, model_path)

if __name__ == '__main__':
    main()


***Скрипт для проверки ответов.***

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

class GPT2Generator:
    """Loads a saved GPT-2 checkpoint and samples continuations from it."""

    def __init__(self, model_path):
        """Load tokenizer and model from ``model_path``; use CUDA when available."""
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.device = torch.device(device_name)
        loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
        self.model = loaded_model.to(self.device)

    def generate_text(self, input_text, temperature_value, length_value, num_results, no_repeat_ngram_size):
        """Sample ``num_results`` continuations of ``input_text``.

        Args:
            input_text: Prompt to continue.
            temperature_value: Sampling temperature passed to ``generate``.
            length_value: Passed to ``generate`` as ``max_length``.
            num_results: Number of sequences to return.
            no_repeat_ngram_size: Passed through to ``generate``.

        Returns:
            One string with every decoded sequence under a numbered heading.
        """
        prompt_ids = self.tokenizer.encode(input_text, return_tensors='pt').to(self.device)

        generation_kwargs = dict(
            input_ids=prompt_ids,
            # The prompt is a single unpadded sequence, so every position is attended to.
            attention_mask=torch.ones(prompt_ids.shape, dtype=torch.long, device=prompt_ids.device),
            max_length=length_value,
            num_return_sequences=num_results,
            no_repeat_ngram_size=no_repeat_ngram_size,
            repetition_penalty=1.5,
            temperature=temperature_value,
            do_sample=True,
        )
        sequences = self.model.generate(**generation_kwargs)

        sections = []
        for number, sequence in enumerate(sequences, start=1):
            decoded = self.tokenizer.decode(sequence, skip_special_tokens=True)
            sections.append(f"Результат {number}:\n{decoded}\n\n")
        return "".join(sections)

# Shared generator instance; the tokenizer and model are loaded from this path on construction.
gpt2_generator = GPT2Generator("/gdrive/MyDrive/TrenerGpt/model")
# Generation settings read by generate_text() and forwarded to GPT2Generator.generate_text.
temperature_value = 0.1  # sampling temperature
length_value = 100  # forwarded as max_length
num_results = 1  # number of sequences returned per prompt
ngram_value = 2  # forwarded as no_repeat_ngram_size

def generate_text():
    """Ask the user for a prompt and print the text generated from it.

    Uses the module-level generator and generation settings.
    """
    prompt = input("Введи затравку: ")
    print(
        gpt2_generator.generate_text(
            prompt, temperature_value, length_value, num_results, ngram_value
        )
    )


if __name__ == "__main__":
    # Simple menu: "1" generates text, "2" leaves the loop, anything else re-prompts.
    choice = None
    while choice != "2":
        choice = input("Выберите действие (1 - сгенерировать текст, 2 - выход): ")
        if choice == "1":
            generate_text()
        elif choice != "2":
            print("Некорректный ввод. Попробуйте снова.")


OSError: Incorrect path_or_model_id: '/gdrive/MyDrive/TrenerGpt/model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.