In [1]:
from transformer.config import FetchFromPretrained as ConfigFromPretrained

In [2]:
from transformer.tokenizer import FetchFromPretrained as TokenizerFromPretrained

In [3]:
from datasets import load_dataset

In [4]:
import torch

In [5]:
# Prefer the GPU when PyTorch can see one.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): this unconditional override discards the value detected above,
# so every later cell runs on the CPU. Remove this line to use a GPU.
device = 'cpu'
device

'cpu'

In [6]:
# Load the 'AlekseyKorshuk/books' dataset. Per the recorded output it exposes a
# single 'train' split with one 'text' column and 741 rows.
data = load_dataset('AlekseyKorshuk/books')
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 741
    })
})

In [7]:
# Checkpoint name handed to both the config fetcher and the tokenizer fetcher below.
model_ckpt = 'bert-base-uncased'

In [8]:
# Fetch the configuration for `model_ckpt` through the project's wrapper.
# The recorded output shows a BertConfig; several of its fields are
# overridden in a later cell before the model is built.
config = ConfigFromPretrained(model_ckpt=model_ckpt).fetch()
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
# Shrink the fetched configuration to a small model. These assignments mutate
# `config` in place, so the values printed by the earlier cell no longer apply.
config.hidden_size = 128
config.intermediate_size = 1024 
# Not a field of the BertConfig printed above; presumably read by TextGenerator — verify.
config.compress_layer_size = 128
# Also used by get_batch as the length of every sampled token window.
config.max_position_embeddings = 128
# NOTE(review): with hidden_size = 128 this leaves 2 dimensions per head if the
# model splits hidden_size evenly across heads — confirm this is intended.
config.num_attention_heads = 64
config.num_hidden_layers = 8
# Extra attributes attached to the config; presumably consumed by the
# project's model code — TODO confirm.
config.device = device
config.dtype = torch.float32

In [10]:
# Number of token windows get_batch stacks into one batch.
batch_size = 8

In [11]:
# Fetch the tokenizer for `model_ckpt` through the project's wrapper.
# The recorded output shows a BertTokenizerFast with a 30522-entry vocabulary.
tokenizer = TokenizerFromPretrained(model_ckpt=model_ckpt).fetch()
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
import pandas as pd

In [13]:
import matplotlib.pyplot as plt

In [14]:
import seaborn as sns

In [15]:
import numpy as np

In [16]:
import os

In [17]:
import pickle

In [18]:
import random

In [19]:
import re

In [20]:
# Number of rows (documents) in the only split the dataset provides.
files_count = len(data['train'])
files_count

741

In [21]:
# Row-index boundaries at 80% and 90% of the documents; get_data_from_random_files
# uses them to carve the single 'train' split into train / test / val ranges.
split_index_1, split_index_2 = int(files_count * 0.8), int(files_count * 0.9)
split_index_1, split_index_2 

(592, 666)

In [22]:
def get_data_from_random_files(split):
    """Tokenize one randomly chosen document from the requested split.

    The dataset's single 'train' split is partitioned by row index into
    half-open ranges so that no row belongs to two splits:

    * 'train': [0, split_index_1)
    * 'test':  [split_index_1, split_index_2)
    * 'val':   [split_index_2, files_count)

    Args:
        split: One of 'train', 'test' or 'val'.

    Returns:
        The token ids of the whole document as returned by
        `tokenizer(..., return_tensors='pt').input_ids[0]`, without
        special tokens.

    Raises:
        ValueError: If `split` is not one of the three known names.
    """
    if split == 'train':
        low, high = 0, split_index_1
    elif split == 'test':
        low, high = split_index_1, split_index_2
    elif split == 'val':
        low, high = split_index_2, files_count
    else:
        raise ValueError(f"unknown split {split!r}; expected 'train', 'test' or 'val'")

    # randrange excludes `high`. The previous randint call included it, which
    # let 'val' pick row `files_count` (out of range) and let neighbouring
    # splits share their boundary rows.
    file_id = random.randrange(low, high)

    file = data['train'][file_id]['text']

    input_ids = tokenizer(file, return_tensors='pt', add_special_tokens=False).input_ids[0]
    return input_ids

In [23]:
def get_batch(split, max_attempts=10):
    """Sample a batch of next-token-prediction pairs from one random document.

    A document is drawn with get_data_from_random_files and `batch_size`
    windows of `config.max_position_embeddings` tokens are cut from it at
    random offsets. The targets are the same windows shifted right by one token.

    Args:
        split: Split name forwarded to get_data_from_random_files.
        max_attempts: How many documents to try before giving up when the
            drawn documents are too short to hold one window plus its target.

    Returns:
        A tuple `(x, y)` of int64 tensors on `device`, each of shape
        (batch_size, config.max_position_embeddings).

    Raises:
        ValueError: If no sufficiently long document is found within
            `max_attempts` draws.
    """
    block_size = config.max_position_embeddings

    # The shifted target needs one token beyond the window, so a usable
    # document has at least block_size + 1 tokens. Shorter documents used to
    # make torch.randint fail on a non-positive upper bound; resample instead.
    for _ in range(max_attempts):
        input_ids = get_data_from_random_files(split=split)
        if len(input_ids) > block_size:
            break
    else:
        raise ValueError(
            f'no document with more than {block_size} tokens found in split '
            f'{split!r} after {max_attempts} attempts'
        )

    ix = torch.randint(len(input_ids) - block_size, (batch_size,))
    x = torch.stack([input_ids[i:i + block_size] for i in ix])
    y = torch.stack([input_ids[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(torch.int64).to(device), y.to(torch.int64).to(device)
    return x, y

In [24]:
# Smoke check: tokenize one random training document. The tokenizer warns that
# the sequence exceeds its 512-token model limit; the full sequence is never fed
# to the model, because get_batch cuts windows of config.max_position_embeddings tokens.
get_data_from_random_files('train')

Token indices sequence length is longer than the specified maximum sequence length for this model (1469867 > 512). Running this sequence through the model will result in indexing errors


tensor([1036, 1036, 2748,  ..., 2026, 3566, 1012])

In [25]:
# Smoke check: draw one (inputs, targets) batch; the targets are the inputs
# shifted by one token, as the recorded output shows.
get_batch('train')

(tensor([[ 1029,  1005,  1005,  ...,  1012,  1036,  1036],
         [ 2007,  2033,  2005,  ...,  1012,  2008,  1005],
         [ 1997,  1996,  2300,  ...,  1012,  1996, 18499],
         ...,
         [ 2524, 15113, 14855,  ...,  1025,  1045,  3140],
         [ 1036,  1045,  2097,  ...,  2052,  2963,  1012],
         [ 1036,  1036,  2017,  ...,  3961,  4333,  2055]]),
 tensor([[ 1005,  1005,  7733,  ...,  1036,  1036,  2272],
         [ 2033,  2005,  1037,  ...,  2008,  1005,  1055],
         [ 1996,  2300,  1012,  ...,  1996, 18499,  6873],
         ...,
         [15113, 14855, 23200,  ...,  1045,  3140,  2009],
         [ 1045,  2097,  2156,  ...,  2963,  1012,  2021],
         [ 1036,  2017,  2024,  ...,  4333,  2055,  2009]]))

In [26]:
from transformer.head.text_generator_decoder_only import TextGenerator

In [27]:
# Build the project's decoder-only text generator from the shrunken config.
generator = TextGenerator(config)

In [28]:
# Move the model to `device`; `gen` is the name every later cell uses.
# Presumably the same object as `generator` if TextGenerator is a torch
# nn.Module — TODO confirm.
gen = generator.to(device)

In [29]:
# Number of training steps between evaluations: the training loops call
# estimate_loss whenever the step index is a non-zero multiple of this value.
eval_iters = 100

In [30]:
# Number of batches estimate_loss draws per split on each evaluation.
eval_iter_ticks = 5

In [31]:
# Sanity check: one forward pass on a training batch before any training.
# Passing the tokenizer appears to make the model print decoded sample text
# (see the recorded output) — verify in TextGenerator.
# The recorded loss (~10.44) is close to ln(vocab_size) = ln(30522) ≈ 10.33,
# i.e. roughly what a uniform guess over the vocabulary would score.
X, Y = get_batch('train')
logits, loss = gen(X, Y, tokenizer)
logits, logits.size(), loss

[unused823] alexandreifying receptor nearing prairie urge mathews
re ran. say'texted of talk


(tensor([[-0.8650, -0.5177,  0.1017,  ...,  0.6171,  0.1362,  0.3305],
         [-0.1632, -0.7614,  0.4080,  ...,  0.6095,  0.4628,  0.2499],
         [ 0.6692, -0.2312, -0.4737,  ..., -0.0150, -0.3001, -0.0529],
         ...,
         [-0.1580,  0.4409, -0.2210,  ..., -0.0432, -0.1676, -0.0061],
         [ 0.0568, -0.3153, -0.2447,  ..., -0.5499, -0.4244, -0.5407],
         [ 0.5821,  0.2160, -0.8159,  ...,  1.3224,  0.9266,  0.4906]],
        grad_fn=<SliceBackward0>),
 torch.Size([8, 30522]),
 tensor(10.4422, grad_fn=<NllLossBackward0>))

In [32]:
# When True, the model is loaded from 'text_generator.pkl' further down, and
# training/saving is skipped unless build_pretrained is also True.
pretrained = False

In [33]:
# When True, the training and saving cells run even if `pretrained` is True.
build_pretrained = False

In [34]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    Side effect: when training is enabled (`not pretrained` or
    `build_pretrained`), the current model is first pickled to
    'text_generator.pkl', so every evaluation doubles as a checkpoint.

    The model is switched to eval mode for the measurement and is always put
    back into train mode afterwards, even if a batch raises.

    Returns:
        dict mapping 'train' and 'val' to a 0-d tensor holding the mean loss
        over `eval_iter_ticks` freshly sampled batches.
    """
    out = {}
    if (not pretrained) or build_pretrained:
        with open('text_generator.pkl', 'wb') as handler:
            pickle.dump(gen, handler)
    gen.eval()
    try:
        for split in ['train', 'val']:
            # Size the buffer by the number of batches actually evaluated.
            # It used to have `eval_iters` slots with only `eval_iter_ticks`
            # filled, so the mean was diluted by the untouched zeros.
            losses = torch.zeros(eval_iter_ticks)
            for k in range(eval_iter_ticks):
                X, Y = get_batch(split)
                # Called without the tokenizer, unlike the training loop;
                # presumably this skips the model's sample-text printing — verify.
                logits, loss = gen(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        gen.train()
    return out
    

In [35]:
# Learning rate passed to the AdamW optimizer in the training cell.
learning_rate = 5e-4

In [36]:
# Number of optimisation steps each training cell runs.
iterations = 10000

In [37]:
if (not pretrained) or build_pretrained:
    # Create the optimizer here: re-running this cell restarts its state.
    optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    # The loop variable is `step` rather than `iter`, so the builtin iter()
    # stays usable in the rest of the notebook.
    for step in range(iterations):
        try:
            # Periodic evaluation; skipped at step 0, before any update has happened.
            if (step != 0) and (step % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {step} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # evaluate the loss; call the model the same way the sanity-check
            # cell does instead of invoking .forward directly
            logits, loss = gen(xb, yb, tokenizer)

            if torch.isnan(loss).any():
                # Skip the update for this batch. The former input() pause is
                # gone so that an unattended "Run All" cannot hang here.
                print('NaN loss')
                print(step, loss)
                continue

            print(step, loss)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            # Clip the global gradient norm before the update.
            torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=15.0)
            optimizer.step()
        except Exception as e:
            # Best effort: report the failing step and keep training.
            # KeyboardInterrupt is not an Exception, so the cell stays interruptible.
            print(f'step {step} failed: {e!r}')

stormy nueva entry rex relationship hua [unused10] waterfall
d eleven. have. k, wrong
0 tensor(10.4215, grad_fn=<NllLossBackward0>)
2011 cerebral specify crashed lineman dismissed 130 storm
.'* instructions eyes apparently new.
1 tensor(10.3404, grad_fn=<NllLossBackward0>)
stove amount國 attorney provide blandpd gazing
for of car toning behind. be
2 tensor(10.3400, grad_fn=<NllLossBackward0>)
politician shuffled [unused894]haven preference calcium heated andhra
n either.. it you s -
3 tensor(10.1304, grad_fn=<NllLossBackward0>)
converting sequential mercenary prophet project buckingham curatedibar
on with forbel things'; over
4 tensor(10.0296, grad_fn=<NllLossBackward0>)
cai glitteringjust photos sails food weaken within
i curtain to, entire ` okay do
5 tensor(9.7554, grad_fn=<NllLossBackward0>)
##bbing beard boredomholz plightcards aliens mechanics
don remembering have second a two already cycle
6 tensor(9.7585, grad_fn=<NllLossBackward0>)
theta emptiness manifestation ed berwick 分 wip

In [38]:
if (not pretrained) or build_pretrained:
    # Continue training with the optimizer (and its accumulated state) left by
    # an earlier training cell. Create one only if none exists yet, so this
    # cell also works on a fresh kernel.
    if 'optimizer' not in globals():
        optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    # The loop variable is `step` rather than `iter`, so the builtin iter()
    # stays usable in the rest of the notebook.
    for step in range(iterations):
        try:
            # Periodic evaluation; skipped at step 0.
            if (step != 0) and (step % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {step} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # evaluate the loss; call the model the same way the sanity-check
            # cell does instead of invoking .forward directly
            logits, loss = gen(xb, yb, tokenizer)

            if torch.isnan(loss).any():
                # Skip the update for this batch. The former input() pause is
                # gone so that an unattended "Run All" cannot hang here.
                print('NaN loss')
                print(step, loss)
                continue

            print(step, loss)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            # Clip the global gradient norm before the update.
            torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=15.0)
            optimizer.step()
        except Exception as e:
            # The former bare `except: pass` also swallowed KeyboardInterrupt,
            # which made the loop impossible to stop, and hid every failure.
            # Report the error and keep going.
            print(f'step {step} failed: {e!r}')

by his robyn. of second ',
in you his kissing the narrow '.
0 tensor(4.8115, grad_fn=<NllLossBackward0>)
heard the syam come then that must
to so sy cy and in are
1 tensor(5.2469, grad_fn=<NllLossBackward0>)
distance ` ofgging him on on blood
wonderful when for murmur the and his swift
2 tensor(5.3964, grad_fn=<NllLossBackward0>)
the him took little ` '. fingers
then'come laugh ` '. head
3 tensor(4.4661, grad_fn=<NllLossBackward0>)
unable flying right attached to regardless, did
and after then to to but. said
4 tensor(5.2018, grad_fn=<NllLossBackward0>)
green'oh exactlys she ` who
symbol'but hesitated asked his `s
5 tensor(4.4754, grad_fn=<NllLossBackward0>)
' beside need. but my called lot
but to object.'maybe taken shower
6 tensor(4.0322, grad_fn=<NllLossBackward0>)
. it it time almost room her,
pillars the some rubbed best med you to
7 tensor(5.6203, grad_fn=<NllLossBackward0>)
where the its of now of in and
least the he. already of along,
8 tensor(5.4926, grad_fn=<NllLossBackward0>

In [None]:
# Persist the trained model when training was enabled in this session.
# The whole `gen` object is pickled, so loading it back requires the same
# class definitions to be importable.
if (not pretrained) or build_pretrained:
    with open('text_generator.pkl', 'wb') as handler:
        pickle.dump(gen, handler)

In [None]:
# Replace `gen` with the previously saved model when `pretrained` is set.
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# a 'text_generator.pkl' that this notebook produced itself.
if pretrained:
    with open('text_generator.pkl', 'rb') as handler:
        gen = pickle.load(handler)

In [None]:
# Text the model is asked to continue.
prompt = 'She was going to do it, but'

In [None]:
# Encode the prompt without special tokens, matching how the training
# documents are tokenized in get_data_from_random_files.
prompt_ids = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).input_ids
prompt_ids

In [None]:
# Ask the model for 10 new tokens after the prompt.
# NOTE(review): the model is left in whatever mode the last executed cell set;
# if generate() does not switch to eval mode itself, call gen.eval() first — TODO confirm.
generated_ids = gen.generate(prompt_ids.to(device), max_new_tokens=10)
generated_ids


In [None]:
# Turn the first (only) generated sequence back into text.
tokenizer.decode(generated_ids[0])