In [1]:
from transformer.config import FetchFromPretrained as ConfigFromPretrained

In [2]:
from transformer.tokenizer import FetchFromPretrained as TokenizerFromPretrained

In [3]:
from datasets import load_dataset

In [4]:
import torch

In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# device = 'cpu'
device

'cuda'

In [6]:
data = load_dataset('AlekseyKorshuk/books')
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 741
    })
})

In [7]:
model_ckpt = 'google-bert/bert-base-uncased'

In [8]:
config = ConfigFromPretrained(model_ckpt=model_ckpt).fetch()
config

BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
config.hidden_size = 1024
config.intermediate_size = 128
config.compress_layer_size = 128
config.max_position_embeddings = 256
config.num_attention_heads = 8
config.num_hidden_layers = 8
config.device = device
config.dtype = torch.float32

In [10]:
batch_size = 8

In [11]:
tokenizer = TokenizerFromPretrained(model_ckpt=model_ckpt).fetch()
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
import pandas as pd

In [13]:
import matplotlib.pyplot as plt

In [14]:
import seaborn as sns

In [15]:
import numpy as np

In [16]:
import os

In [17]:
import pickle

In [18]:
import random

In [19]:
import re

In [20]:
files_count = len(data['train'])
files_count

741

In [21]:
split_index_1, split_index_2 = int(files_count * 0.8), int(files_count * 0.9)
split_index_1, split_index_2 

(592, 666)

In [22]:
def get_data_from_random_files(split):
    indices = []
    if split == 'train':
        file_id = random.randint(0, split_index_1)
    elif split == 'test':
        file_id = random.randint(split_index_1, split_index_2)
    elif split == 'val':
        file_id = random.randint(split_index_2, files_count)

    file = data['train'][file_id]['text']

    input_ids = tokenizer(file, return_tensors='pt', add_special_tokens=False).input_ids[0]
    if len(input_ids) < config.max_position_embeddings + 2*batch_size:
        return get_data_from_random_files(split)
    return input_ids

In [23]:
def get_batch(split):
    input_ids = get_data_from_random_files(split=split)
    ix = torch.randint(len(input_ids) - config.max_position_embeddings, (batch_size,))
    x = torch.stack([input_ids[i:i + config.max_position_embeddings] for i in ix])
    y = torch.stack([input_ids[i + 1:i + config.max_position_embeddings + 1] for i in ix])
    x, y = x.to(torch.int64).to(device), y.to(torch.int64).to(device)
    return x, y

In [24]:
get_data_from_random_files('train')

Token indices sequence length is longer than the specified maximum sequence length for this model (1337710 > 512). Running this sequence through the model will result in indexing errors


tensor([4656, 2246, 2039,  ..., 2010, 5001, 1012])

In [25]:
get_batch('train')

(tensor([[ 2002,  2921,  2010,  ...,  2071,  4089,  3342],
         [ 1029,  2073,  2079,  ...,  1010,  1998,  6877],
         [ 2320,  2016,  2001,  ...,  1036,  2054,  2038],
         ...,
         [25201,  3507,  2012,  ...,  7753,  3578,  2098],
         [ 7514,  1010,  2079,  ...,  1996,  3096,  1997],
         [ 2010,  2793,  5829,  ..., 15583,  2032,  2005]], device='cuda:0'),
 tensor([[ 2921,  2010,  2132,  ...,  4089,  3342,  2010],
         [ 2073,  2079,  2017,  ...,  1998,  6877,  2071],
         [ 2016,  2001,  2908,  ...,  2054,  2038,  2002],
         ...,
         [ 3507,  2012,  1996,  ...,  3578,  2098,  3174],
         [ 1010,  2079, 15265,  ...,  3096,  1997,  2014],
         [ 2793,  5829,  3435,  ...,  2032,  2005,  7302]], device='cuda:0'))

In [26]:
from transformer.head.text_generator_decoder_only import TextGenerator

In [27]:
generator = TextGenerator(config)

In [28]:
gen = generator.to(device)

In [29]:
eval_iters = 100

In [30]:
eval_iter_ticks = 5

In [31]:
X, Y = get_batch('train')
logits, loss = gen(X, Y, tokenizer)
logits, logits.size(), loss

(tensor([[[-2.9826e-01, -3.3975e-01, -4.1917e-01,  ..., -8.2622e-01,
            9.8238e-01, -6.6574e-02],
          [ 6.7641e-02,  6.0552e-02,  1.7264e-01,  ...,  6.0807e-01,
           -8.3629e-01,  2.7148e-01],
          [-4.9022e-01,  1.3766e+00,  4.4577e-01,  ...,  2.9313e-01,
            8.2492e-01,  1.3530e-01],
          ...,
          [ 7.5198e-01,  6.3796e-02,  1.3477e-01,  ..., -9.1082e-02,
           -2.5998e-01,  2.2136e-02],
          [-1.0625e+00,  1.6012e-01, -4.0552e-01,  ..., -3.6616e-01,
           -7.6066e-01, -3.7680e-01],
          [ 2.5929e-01,  3.8800e-01,  2.4807e-01,  ..., -3.2450e-01,
           -1.6820e-01, -3.9306e-02]],
 
         [[ 2.4795e-02,  5.7552e-01, -2.8587e-01,  ...,  6.7478e-01,
            1.7043e-01,  1.2594e-01],
          [ 6.0724e-02,  5.2643e-01, -2.4513e-01,  ...,  5.3861e-01,
           -3.3589e-01,  6.4324e-01],
          [-1.0408e-01,  1.2433e+00,  2.2106e-01,  ...,  2.8994e-01,
            2.6490e-01, -1.5885e-01],
          ...,
    

In [32]:
pretrained = False

In [33]:
build_pretrained = False

In [34]:
sum(p.numel() for p in gen.parameters())

100893498

In [35]:
def test(prompt='She was going to '):
    prompt_ids = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).input_ids
    generated_ids = gen.generate(prompt_ids.to(device), max_new_tokens=20)
    result = tokenizer.decode(generated_ids[0])
    print(result)

In [36]:
def test_deep(logits, targets):
    # focus only on the last time step
    logits = logits[:, -1, :] # becomes (B, C)
    # apply softmax to obtain probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1).to(config.device) # (B, C)
    index_next = torch.multinomial(probs, num_samples=1) # (B, C)

    print(tokenizer.decode(index_next.squeeze(-1)))
    print(tokenizer.decode(targets[:, -1]))

In [37]:
@torch.no_grad()
def estimate_loss():
    out = {}
    if (not pretrained) or build_pretrained:
        with open('text_generator.pkl', 'wb') as handler:
            pickle.dump(gen, handler)
    gen.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iter_ticks):
            X, Y = get_batch(split)
            logits, loss = gen(X, Y)
            losses[k] = loss.item()
            test_deep(logits, Y)
        out[split] = losses.mean()
    gen.train()
    return out
    

In [38]:
learning_rate = 5e-3

In [39]:
iterations = 1000000

In [40]:
if build_pretrained:
    with open('text_generator.pkl', 'wb') as handler:
        pickle.dump(gen, handler)

In [41]:
if (not pretrained) or build_pretrained:
    # create a PyTorch optimizer
    optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    for iter in range(iterations):
        try:
            
            if (iter != 0) and (iter % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {iter} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # evaluate the loss
            logits, loss = gen.forward(xb, yb, tokenizer)

            if not torch.isnan(loss).any():
                test_deep(logits, yb)
                print(iter, loss)
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                # torch.torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=15.0)
                optimizer.step()
            else:
                print('NaN loss')
                print(iter, loss)
                input()
        except Exception as e:
            print(e)

preface questioned ware scooped tracy shrinking [unused631] [unused12]
, not okay her with human the he
0 tensor(10.4416, device='cuda:0', grad_fn=<NllLossBackward0>)
.. elbow supermarket. anticipation. the
stab, he glasses got few, where
1 tensor(10.7319, device='cuda:0', grad_fn=<NllLossBackward0>)
四. ས 四 palestinian 1729 exemptrelli
and n are his of so on at
2 tensor(22.7131, device='cuda:0', grad_fn=<NllLossBackward0>)
##sb puresbificationsb although 四sb
watching say nightie crush dart his.
3 tensor(101.2283, device='cuda:0', grad_fn=<NllLossBackward0>)
........
##s. stink'none short was his
4 tensor(1418.4895, device='cuda:0', grad_fn=<NllLossBackward0>)
studied walk walk walk walk walk walk walk
##ro blue... a me against
5 tensor(1421.7917, device='cuda:0', grad_fn=<NllLossBackward0>)
burn burn burn burn burn burn burn burn
opened wolf on smiling spirit t then goose
6 tensor(385.3386, device='cuda:0', grad_fn=<NllLossBackward0>)
walk walk walk walk shirt wanted walk walk
the the,

In [None]:
test('Summer season is upon us and you are all set to head to the beach or to an outdoor event. Outfit. Check, Sunglasses. Check. Hat. Check. Sunscreen? Well, check the ingredients before applying it. While sunscreen is essential for protecting your skin')

In [None]:
test('An apple is')

In [None]:
test('This was the greatest outcome for me because')

In [None]:
sum(p.numel() for p in gen.parameters())

In [None]:
prompt = 'They wanted to fight and '
test(prompt)

In [None]:
prompt = 'He was walking on the middle of road when a car started coming at high speed behind him '
test(prompt)

In [None]:
if (not pretrained) or build_pretrained:
    with open('text_generator.pkl', 'wb') as handler:
        pickle.dump(gen, handler)

In [None]:
with open('text_generator.pkl', 'rb') as handler:
    gen = pickle.load(handler)

In [None]:
if (not pretrained) or build_pretrained:
    # create a PyTorch optimizer
    # optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    for iter in range(iterations):
        try:
            
            if (iter != 0) and (iter % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {iter} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # evaluate the loss
            logits, loss = gen.forward(xb, yb, tokenizer)
            if not torch.isnan(loss).any():
                test_deep(logits, yb)
                print(iter, loss)
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                # torch.torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=15.0)
                optimizer.step()
            else:
                print('NaN loss')
                print(iter, loss)
                input()
        except Exception as e:
            print(e)