In [90]:
#!g1.1
from tqdm.auto import tqdm
import json
import os
import numpy as np
from sentencepiece import SentencePieceTrainer, SentencePieceProcessor
import os
import torch
from torch.utils.data import Dataset
from typing import Union, List, Tuple
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
#!g1.1
# input_dir = '/kaggle/input/bdz-jsons/'
# working_dir = '/kaggle/working/'
input_dir = ''
working_dir = ''

# os.listdir('') raises FileNotFoundError, so an empty input_dir is treated as
# the current directory.
source_dir = input_dir or '.'

# Every selected file is parsed with json.load below, so skip anything that is
# not a .json file (e.g. the all_texts.txt written by this very cell when the
# source directory is the working directory). Sorting keeps the corpus order
# reproducible between runs.
file_names = sorted(name for name in os.listdir(source_dir) if name.endswith('.json'))

# One story per line: real newlines and literal "\n" sequences inside a story
# are flattened to spaces so that each line of the output is a single sample.
with open('all_texts.txt', 'w', encoding='utf-8') as out:
    for file_name in tqdm(file_names):
        with open(os.path.join(source_dir, file_name), 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
        for tex in data:
            text = tex['story'].replace('\n', ' ').replace('\\n', ' ')
            if text:
                print(text, file=out, end='\n')

In [91]:
#!g1.1
class TextDataset(Dataset):
    """Line-per-sample text dataset tokenized with a SentencePiece model.

    Each line of ``data_file`` is one sample. Samples are returned as fixed-size
    id tensors of length ``max_length`` wrapped in BOS/EOS and right-padded.
    """

    def __init__(self, data_file: str, train: bool = True, sp_model_prefix: Union[str, None] = None,
                 vocab_size: int = 2000, normalization_rule_name: str = 'nmt_nfkc_cf',
                 model_type: str = 'bpe', max_length: int = 256):
        """
        Arguments:
            data_file: path to a text file with one sample per line.
            train: currently unused; kept for interface compatibility.
            sp_model_prefix: path prefix of the SentencePiece model; the model is
                read from ``sp_model_prefix + '.model'`` and trained first if that
                file does not exist. Required despite the ``None`` default.
            vocab_size: vocabulary size used when training the tokenizer.
            normalization_rule_name: normalization rule used when training.
            model_type: SentencePiece model type used when training.
            max_length: length of every returned id tensor, BOS and EOS included.

        Raises:
            ValueError: if ``sp_model_prefix`` is None or ``max_length`` < 2.
        """
        if sp_model_prefix is None:
            # Previously this surfaced as an opaque TypeError from None + '.model'.
            raise ValueError('sp_model_prefix must be provided (path prefix of the SentencePiece model)')
        if max_length < 2:
            # BOS and EOS alone already need two positions.
            raise ValueError('max_length must be at least 2 to fit the BOS and EOS tokens')

        model_path = sp_model_prefix + '.model'
        if not os.path.isfile(model_path):
            # Train the tokenizer only when no saved model is present; id 3 is
            # reserved for the padding token.
            SentencePieceTrainer.train(
                input=data_file, vocab_size=vocab_size,
                model_type=model_type, model_prefix=sp_model_prefix,
                normalization_rule_name=normalization_rule_name,
                pad_id=3
            )

        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # Explicit encoding so the corpus is read the same way on every platform.
        with open(data_file, encoding='utf-8') as file:
            self.texts = file.readlines()

        self.pad_id = self.sp_model.pad_id()
        self.unk_id = self.sp_model.unk_id()
        self.bos_id = self.sp_model.bos_id()
        self.eos_id = self.sp_model.eos_id()

        self.max_length = max_length
        self.vocab_size = self.sp_model.vocab_size()

    def text2ids(self, texts: Union[str, List[str]]) -> Union[List[int], List[List[int]]]:
        """Encode a string (or a list of strings) with the SentencePiece model."""
        return self.sp_model.encode(texts)

    def ids2text(self, ids: Union[torch.Tensor, List[int], List[List[int]]]) -> Union[str, List[str]]:
        """Decode ids back to text; tensors of rank <= 2 are converted to lists first."""
        if torch.is_tensor(ids):
            assert len(ids.shape) <= 2, 'Expected tensor of shape (length, ) or (batch_size, length)'
            ids = ids.cpu().tolist()

        return self.sp_model.decode(ids)

    def __len__(self):
        """Number of lines read from the data file."""
        return len(self.texts)

    def __getitem__(self, item: int) -> Tuple[torch.Tensor, int]:
        """Return ``(padded_ids, length)`` for the sample at index ``item``.

        ``padded_ids`` is an int64 tensor of shape ``(max_length,)`` holding
        BOS + tokens + EOS followed by padding; ``length`` counts the
        non-padding positions.
        """
        ids = self.text2ids(self.texts[item].strip())
        # Leave room for BOS and EOS; slicing already caps at len(ids).
        encoded = [self.bos_id] + ids[:self.max_length - 2] + [self.eos_id]
        padded = torch.full((self.max_length,), self.pad_id, dtype=torch.int64)
        padded[:len(encoded)] = torch.tensor(encoded)
        return padded, len(encoded)

In [92]:
#!g1.1
# Resolve the compute device once and bind it to both spellings used in this
# notebook; fall back to the CPU when CUDA is unavailable.
DEVICE = device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)
print(device)

cuda
cuda


In [93]:
#!g1.1
import math
import os

import torch
from torch import Tensor, nn
from torch.distributions import Categorical
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.optim.lr_scheduler import _LRScheduler

# Fall back to the CPU when CUDA is unavailable instead of hard-coding "cuda",
# so this cell does not undo the device selection made earlier in the notebook.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Arguments:
            d_model: embedding dimension (even or odd).
            dropout: dropout probability applied after adding the encodings.
            max_len: number of positions for which encodings are precomputed.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns,
        # so trim the angles to the number of odd-indexed slots.
        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer follows the module across devices but is not a trainable parameter.
        self.register_buffer("pe", pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_size, seq_len, embedding_dim]``

        Returns:
            Tensor of the same shape with position encodings added and dropout applied.
        """
        # pe is (1, max_len, d_model): slice along the sequence axis and let
        # the leading dimension broadcast over the batch.
        x = x + self.pe[:, : x.size(1)]
        return self.dropout(x)


class Model(nn.Module):
    """Token-level language model: embedding -> positional encoding -> causally
    masked TransformerEncoder -> linear projection to vocabulary logits."""

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.0):
        """
        Arguments:
            ntoken: vocabulary size (embedding rows and output logits).
            d_model: embedding / hidden dimension.
            nhead: number of attention heads per layer.
            d_hid: feed-forward dimension inside each encoder layer.
            nlayers: number of encoder layers.
            dropout: dropout probability for positional encoding and encoder layers.
        """
        super().__init__()
        self.model_type = "Transformer"
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, activation="gelu", batch_first=True)

        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)
        # Not read anywhere in this class; kept so the parameter list and
        # state_dict keys stay the same as before.
        self.dummy_param = nn.Parameter(torch.empty(0))

        self.init_weights()

    def init_weights(self) -> None:
        """Uniformly initialise embedding and output weights; zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(
            self, src: Tensor, src_mask: Tensor = None, src_key_padding_mask: Tensor = None
    ) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[batch_size, seq_len]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``; when omitted a
                boolean causal mask is built (True = position may not be attended).
            src_key_padding_mask: optional Tensor, shape ``[batch_size, seq_len]``,
                forwarded to the encoder (use a boolean mask, True = padding,
                to match the boolean causal mask).

        Returns:
            output Tensor of shape ``[batch_size, seq_len, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        if src_mask is None:
            # Strict upper triangle = future positions. Built on the input's own
            # device so the module does not depend on a module-level `device`.
            seq_len = src.shape[1]
            src_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=src.device),
                diagonal=1,
            )

        output = self.transformer_encoder(
            src, mask=src_mask, src_key_padding_mask=src_key_padding_mask
        )
        output = self.linear(output)
        return output

In [94]:
#!g1.1
# Re-select the compute device (CUDA when present, CPU otherwise) and keep
# both spellings used in this notebook pointing at the same choice.
DEVICE = device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)
print(device)

cuda
cuda


In [95]:
#!g1.1
# Tokenized corpus: reuses (or trains) the "bpe" SentencePiece model with a
# 5000-entry vocabulary and returns 256-token padded samples.
dataset = TextDataset(
    data_file="all_texts.txt",
    sp_model_prefix="bpe",
    vocab_size=5000,
    max_length=256,
)
# Shuffled mini-batches loaded by five worker processes.
train_loader = DataLoader(
    dataset,
    batch_size=768,
    shuffle=True,
    num_workers=5,
)

In [96]:
#!g1.1
# Transformer language model sized to the tokenizer's vocabulary, moved to the
# selected device (Module.to returns the module itself).
model = Model(
    ntoken=dataset.sp_model.vocab_size(),
    d_model=256,
    nhead=4,
    d_hid=256,
    nlayers=4,
).to(DEVICE)

# Positions holding the padding id are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=dataset.sp_model.pad_id())

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, betas=(0.9, 0.95), weight_decay=0.1)

In [97]:
#!g1.1
num_epochs = 4

# Send batches to wherever the model's weights actually live instead of to a
# separately assigned global, so inputs and model cannot end up on different devices.
model_device = next(model.parameters()).device

for epoch in range(1, num_epochs + 1):
    train_loss = 0.0

    model.train()
    for indices, lengths in tqdm(train_loader):
        # Drop the columns that are padding for every sample in this batch.
        tokens = indices[:, :lengths.max()].to(model_device)
        optimizer.zero_grad()
#         with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
        # Next-token prediction: inputs are tokens[:-1], targets are tokens[1:].
        logits = model(tokens[:, :-1])
        # CrossEntropyLoss expects the class dimension second: (batch, ntoken, seq).
        loss = criterion(logits.transpose(1, 2), tokens[:, 1:])
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure is per-sample.
        train_loss += loss.item() * tokens.shape[0]

    train_loss /= len(train_loader.dataset)

    print('Train loss =', train_loss, 'on ep :', epoch)

  0%|          | 0/6469 [00:00<?, ?it/s]

NameError: name 'loader' is not defined

In [99]:
#!g1.1
# NOTE(review): the training loop already divides train_loss by the dataset
# size at the end of every completed epoch, so this cell presumably exists for
# reporting after an interrupted epoch -- running it after a finished epoch
# divides a second time. Confirm before relying on the printed number.
train_loss = train_loss / len(train_loader.dataset)

print('Train loss =', train_loss, 'on ep :', epoch)

Train loss = 1.8136762476889643 on ep : 1


In [None]:
#!g1.1
# The code below this point was not saved.

In [None]:
#!g1.1
from transformers import pipeline, set_seed

# Reference text generator backed by the 'gpt2-xl' checkpoint.
generator = pipeline(task='text-generation', model='gpt2-xl')
# Seed the RNGs so sampled generations are repeatable.
set_seed(42)

In [None]:
#!g1.1
def arr_(dict_):
    """Return the 'generated_text' value of every entry in ``dict_`` as a list."""
    return [entry['generated_text'] for entry in dict_]

In [None]:
#!g1.1
# Story opening handed to the reference generator as the prompt.
story_prompt = "Once upon a time, in an ancient house, there lived a girl named Lily. She loved to decorate her room with pretty things. One day, she found a big box in the attic. She opened it and saw many shiny decorations. Lily was very happy and decided to use them in her room.As Lily was decorating her room, the sky outside became dark. There was a loud"

# One continuation, capped at 500 tokens; the cell displays just the generated text.
generated = generator(story_prompt, max_length=500, num_return_sequences=1)
arr_(generated)

In [None]:
#!g1.1
def inference(model, tokenizer, max_length=20, rep=3, prefix=''):
    """Sample ``rep`` continuations of ``prefix`` from ``model``, one token at a time.

    Arguments:
        model: callable module mapping an id tensor of shape ``(1, seq_len)`` to
            logits of shape ``(1, seq_len, vocab)``; it is switched to eval mode.
        tokenizer: object providing ``bos_id()``, ``eos_id()``, ``encode(str)``
            and ``decode(list_of_ids)``.
        max_length: maximum number of tokens sampled per continuation.
        rep: number of independent continuations to sample.
        prefix: text each continuation starts from.

    Returns:
        List of ``rep`` decoded sequences (prefix included, EOS excluded).
    """
    model.eval()

    # Build inputs on the device the model's weights live on rather than on a
    # module-level global; parameter-free models default to the CPU.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    prompt_ids = [tokenizer.bos_id()] + tokenizer.encode(prefix)
    eos_id = tokenizer.eos_id()

    anses = []
    # Generation needs no gradients; without this every step keeps its autograd graph.
    with torch.no_grad():
        for _ in range(rep):
            sent_tokens = torch.tensor([prompt_ids], device=model_device)
            for _step in range(max_length):
                logits = model(sent_tokens)
                # Sample the next token from the distribution at the last position.
                token = Categorical(logits=logits[0, -1]).sample()
                if token.item() == eos_id:
                    break
                sent_tokens = torch.cat([sent_tokens, token.view(1, 1)], dim=1)
            # Index the batch dimension instead of squeeze(): squeeze() would turn
            # a one-token sequence into a scalar and decode would receive an int.
            anses.append(tokenizer.decode(sent_tokens[0].tolist()))
    return anses

In [None]:
#!g1.1
# Sample three continuations of at most 15 new tokens each from the trained model.
inference(
    model,
    dataset.sp_model,
    max_length=15,
    rep=3,
    prefix="Boy played with his dog",
)

In [None]:
#!g1.1
