In [1]:
from transformer.config import FetchFromPretrained as ConfigFromPretrained

In [2]:
from transformer.tokenizer import FetchFromPretrained as TokenizerFromPretrained

In [3]:
from datasets import load_dataset

In [4]:
import torch

In [5]:
# Flip to False to let the notebook use a GPU when one is available; the run
# recorded in this notebook is pinned to the CPU.
force_cpu = True
device = 'cuda' if (torch.cuda.is_available() and not force_cpu) else 'cpu'
device

'cpu'

In [6]:
data = load_dataset('AlekseyKorshuk/books')
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 741
    })
})

In [7]:
model_ckpt = 'bert-base-uncased'

In [8]:
config = ConfigFromPretrained(model_ckpt=model_ckpt).fetch()
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
# Shrink the pretrained BERT configuration into a much smaller model and
# attach the runtime placement settings. Overrides are applied in the order
# they are listed.
config_overrides = {
    'hidden_size': 128,
    'intermediate_size': 2 * 1024,
    'max_position_embeddings': 128,
    'num_attention_heads': 64,
    'num_hidden_layers': 16,
    'device': device,
    'dtype': torch.float32,
}
for option_name, option_value in config_overrides.items():
    setattr(config, option_name, option_value)

In [10]:
batch_size = 8

In [11]:
tokenizer = TokenizerFromPretrained(model_ckpt=model_ckpt).fetch()
tokenizer

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
import pandas as pd

In [13]:
import matplotlib.pyplot as plt

In [14]:
import seaborn as sns

In [15]:
import numpy as np

In [16]:
import os

In [17]:
import pickle

In [18]:
import random

In [19]:
import re

In [20]:
files_count = len(data['train'])
files_count

741

In [21]:
# File-index boundaries at 80% and 90% of the corpus; they drive the
# train / test / val selection in get_data_from_random_files.
split_index_1 = int(0.8 * files_count)
split_index_2 = int(0.9 * files_count)
split_index_1, split_index_2

(592, 666)

In [22]:
def get_data_from_random_files(split):
    """Tokenize one randomly chosen book from the requested split.

    The books in ``data['train']`` are partitioned by position into disjoint,
    half-open index ranges:

    * ``'train'``: ``[0, split_index_1)``
    * ``'test'``:  ``[split_index_1, split_index_2)``
    * ``'val'``:   ``[split_index_2, files_count)``

    Args:
        split: One of ``'train'``, ``'test'`` or ``'val'``.

    Returns:
        The first row of ``tokenizer(...).input_ids`` for the chosen book,
        encoded without special tokens.

    Raises:
        ValueError: If ``split`` is not a known split name.
    """
    split_ranges = {
        'train': (0, split_index_1),
        'test': (split_index_1, split_index_2),
        'val': (split_index_2, files_count),
    }
    if split not in split_ranges:
        raise ValueError(
            f"Unknown split {split!r}; expected one of {sorted(split_ranges)}"
        )

    low, high = split_ranges[split]
    # randrange excludes `high`. randint would include it, which lets 'val'
    # pick index `files_count` (out of range) and lets neighbouring splits
    # share their boundary file.
    file_id = random.randrange(low, high)

    file = data['train'][file_id]['text']

    input_ids = tokenizer(file, return_tensors='pt', add_special_tokens=False).input_ids[0]
    return input_ids

In [23]:
def get_batch(split, max_attempts=10):
    """Sample a batch of next-token-prediction pairs from one random book.

    A single book is drawn from ``split`` and ``batch_size`` windows of
    ``config.max_position_embeddings`` tokens are cut from it at random
    offsets. The targets are the same windows shifted one token to the right.

    Args:
        split: Split name forwarded to ``get_data_from_random_files``.
        max_attempts: How many books to draw while looking for one that is
            longer than the context window.

    Returns:
        A tuple ``(x, y)`` of int64 tensors on ``device``, each with
        ``batch_size`` rows of ``config.max_position_embeddings`` token ids.

    Raises:
        ValueError: If no sufficiently long book was found in
            ``max_attempts`` draws.
    """
    block_size = config.max_position_embeddings

    # A book needs at least block_size + 1 tokens to yield one (x, y) pair;
    # anything shorter would hand torch.randint a non-positive upper bound.
    for _ in range(max_attempts):
        input_ids = get_data_from_random_files(split=split)
        if len(input_ids) > block_size:
            break
    else:
        raise ValueError(
            f"No file in split {split!r} longer than {block_size} tokens "
            f"after {max_attempts} attempts"
        )

    ix = torch.randint(len(input_ids) - block_size, (batch_size,))
    x = torch.stack([input_ids[i:i + block_size] for i in ix])
    y = torch.stack([input_ids[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(torch.int64).to(device), y.to(torch.int64).to(device)
    return x, y

In [24]:
get_data_from_random_files('train')

Token indices sequence length is longer than the specified maximum sequence length for this model (1354976 > 512). Running this sequence through the model will result in indexing errors


tensor([1036, 1036, 3198,  ..., 2086, 3283, 1029])

In [25]:
get_batch('train')

(tensor([[ 1996,  2540,  3145,  ..., 25933, 10270,  2271],
         [25070,  1051, 18153,  ..., 10047,  9524,  9336],
         [ 1996,  2034,  2000,  ...,  1996,  3492,  2611],
         ...,
         [ 2033,  2046,  7276,  ...,  2449,  2203,  1997],
         [ 1045,  3631,  1037,  ...,  2793,  2003,  4803],
         [ 1996,  2273,  2007,  ...,  2018, 19461, 27586]]),
 tensor([[ 2540,  3145,   999,  ..., 10270,  2271,  3368],
         [ 1051, 18153,  2063,  ...,  9524,  9336,  2102],
         [ 2034,  2000,  2360,  ...,  3492,  2611,  1012],
         ...,
         [ 2046,  7276,  4465,  ...,  2203,  1997,  2026],
         [ 3631,  1037, 19395,  ...,  2003,  4803,   999],
         [ 2273,  2007,  2032,  ..., 19461, 27586,  2389]]))

In [26]:
from transformer.head.text_generator_decoder_only import TextGenerator

In [27]:
generator = TextGenerator(config)

In [28]:
gen = generator.to(device)

In [29]:
# Number of training steps between loss evaluations: the training loop calls
# estimate_loss whenever the step counter is a non-zero multiple of this.
eval_iters = 100

In [30]:
# Number of batches estimate_loss draws per split on each evaluation.
eval_iter_ticks = 5

In [31]:
X, Y = get_batch('train')
logits, loss = gen(X, Y, tokenizer)
logits, logits.size(), loss

##hor insisted hardin toni keeps translitchaft hawthorne
##thi the had.. that deadly could


(tensor([[-0.6201, -0.4167, -0.1482,  ...,  0.5508,  0.2300, -0.2200],
         [-0.3544, -1.0222, -0.8640,  ...,  0.3704,  0.6170, -0.3816],
         [ 0.6972,  0.4416,  0.2256,  ..., -0.3638,  0.5547, -0.8706],
         ...,
         [ 0.0706, -0.2483, -0.4540,  ..., -0.1054,  0.1064, -0.9024],
         [ 1.1845, -0.8941,  0.3462,  ..., -0.6019,  1.3436, -0.7249],
         [ 0.4633, -0.0675,  0.4198,  ...,  0.2016,  1.2000, -0.5350]],
        grad_fn=<SliceBackward0>),
 torch.Size([8, 30522]),
 tensor(10.4464, grad_fn=<NllLossBackward0>))

In [32]:
pretrained = False

In [33]:
build_pretrained = False

In [34]:
@torch.no_grad()
def estimate_loss():
    """Estimate the current loss on the train and val splits.

    For each split, ``eval_iter_ticks`` batches are drawn with ``get_batch``
    and the losses returned by ``gen`` are averaged. The model is put in eval
    mode for the measurement and is always switched back to train mode
    afterwards, even if evaluation raises.

    Side effect: unless a pretrained model is in use (``pretrained`` set and
    ``build_pretrained`` unset), the model is pickled to
    ``text_generator.pkl`` first, so every evaluation doubles as a checkpoint.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to a scalar tensor holding the
        mean loss over the sampled batches.
    """
    out = {}
    if (not pretrained) or build_pretrained:
        with open('text_generator.pkl', 'wb') as handler:
            pickle.dump(gen, handler)
    gen.eval()
    try:
        for split in ['train', 'val']:
            # Size the buffer by the number of batches actually evaluated.
            # A larger zero-filled buffer would drag the mean towards zero.
            losses = torch.zeros(eval_iter_ticks)
            for k in range(eval_iter_ticks):
                X, Y = get_batch(split)
                _, loss = gen(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        gen.train()
    return out
    

In [35]:
learning_rate = 5e-4

In [36]:
iterations = 10000

In [37]:
# Train the generator from scratch (skipped when a pretrained model is used
# and no rebuild was requested).
if (not pretrained) or build_pretrained:
    # create a PyTorch optimizer
    optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    # The counter is called `step` so the builtin iter() is not shadowed for
    # the rest of the notebook.
    for step in range(iterations):
        try:
            # Periodically report the estimated train / val loss.
            if (step != 0) and (step % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {step} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # evaluate the loss (call the module itself rather than .forward
            # so that any registered hooks run)
            logits, loss = gen(xb, yb, tokenizer)

            if torch.isnan(loss).any():
                # Stop instead of blocking on input(): a NaN loss needs
                # inspection, and an interactive prompt stalls or fails under
                # "Run All" / headless execution.
                print('NaN loss')
                print(step, loss)
                break

            print(step, loss.item())
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=15.0)
            optimizer.step()
        except Exception as e:
            # Best effort: keep training after a failed step, but report which
            # step failed and with what exception type.
            print(f'step {step} failed: {e!r}')

lorraine jorge twenty20ospherepg death vishnulogists
, been scent dusty into, for,
0 tensor(10.4765, grad_fn=<NllLossBackward0>)
zenith brokenweekdae midday futures lame raked
inter on he work human n coming brought
1 tensor(10.1642, grad_fn=<NllLossBackward0>)
##heater ɫ cocaine 葉 [unused431]rco vt ruler
exclusive's her a,. -
2 tensor(9.6891, grad_fn=<NllLossBackward0>)
##taking until joanna pacific lunged lame enterprisescey
. floor if good,ion starbucks t
3 tensor(9.7375, grad_fn=<NllLossBackward0>)
##np transportation creaked あ 139 yokohama'prophecy
other the grow that raked minutes'covered
4 tensor(9.8085, grad_fn=<NllLossBackward0>)
biketore goldstein freighterrove bayern tailored shaft
mo you straight handle the for then next
5 tensor(9.8376, grad_fn=<NllLossBackward0>)
46th arteries jonas 2000 piazza theorem 1999.
, family princess to a cavern i alone
6 tensor(9.9625, grad_fn=<NllLossBackward0>)
##wn married underneath davey mocked attire ranchdhi
no assess she addressing was a

: 

In [None]:
# Continue training for another `iterations` steps.
if (not pretrained) or build_pretrained:
    # Reuse the optimizer (and its accumulated state) left by the previous
    # training cell when there is one; otherwise create it so this cell also
    # runs on a fresh kernel.
    if 'optimizer' not in globals():
        optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    # The counter is called `step` so the builtin iter() is not shadowed for
    # the rest of the notebook.
    for step in range(iterations):
        try:
            # Periodically report the estimated train / val loss.
            if (step != 0) and (step % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {step} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # evaluate the loss (call the module itself rather than .forward
            # so that any registered hooks run)
            logits, loss = gen(xb, yb, tokenizer)

            if torch.isnan(loss).any():
                # Stop instead of blocking on input(): a NaN loss needs
                # inspection, and an interactive prompt stalls or fails under
                # "Run All" / headless execution.
                print('NaN loss')
                print(step, loss)
                break

            print(step, loss.item())
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=15.0)
            optimizer.step()
        except Exception as e:
            # A bare `except: pass` hid every failure and also swallowed
            # KeyboardInterrupt, so the loop could not be interrupted. Catch
            # only ordinary exceptions and report them.
            print(f'step {step} failed: {e!r}')

In [None]:
if (not pretrained) or build_pretrained:
    with open('text_generator.pkl', 'wb') as handler:
        pickle.dump(gen, handler)

In [None]:
# Restore a previously saved generator instead of the freshly built one.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load a text_generator.pkl that was written by this notebook.
if pretrained:
    with open('text_generator.pkl', 'rb') as handler:
        gen = pickle.load(handler)

In [None]:
prompt = 'She was going to do it, but'

In [None]:
prompt_ids = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).input_ids
prompt_ids

In [None]:
# Sample in inference mode: eval() turns off dropout (the config sets dropout
# probabilities of 0.1) and no_grad() avoids tracking autograd state for
# tensors that are never back-propagated.
gen.eval()
with torch.no_grad():
    generated_ids = gen.generate(prompt_ids.to(device), max_new_tokens=10)
generated_ids


In [None]:
tokenizer.decode(generated_ids[0])