In [1]:
from transformer.config import FetchFromPretrained as ConfigFromPretrained

In [2]:
from transformer.tokenizer import FetchFromPretrained as TokenizerFromPretrained

In [3]:
from datasets import load_dataset

In [4]:
import torch

In [5]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# To force CPU execution, assign 'cpu' to `device` after this block.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [6]:
# Load the 'AlekseyKorshuk/books' dataset. Per the output shown for this cell it
# has a single 'train' split with one 'text' feature and 741 rows.
data = load_dataset('AlekseyKorshuk/books')
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 741
    })
})

In [7]:
# Checkpoint identifier handed to both the config fetcher and the tokenizer fetcher.
model_ckpt = 'google-bert/bert-base-uncased'

In [8]:
# Fetch the configuration for `model_ckpt` through the project's config wrapper;
# the displayed result is a BertConfig.
config = ConfigFromPretrained(model_ckpt=model_ckpt).fetch()
config

BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
# Override the fetched checkpoint configuration for this experiment. "fetched
# value" refers to what the config printed right after it was fetched.
config.hidden_size = 1024  # fetched value: 768
config.intermediate_size = 64  # fetched value: 3072
config.compress_layer_size = 64  # not in the printed config; presumably read by TextGenerator — TODO confirm
config.max_position_embeddings = 128  # fetched value: 512; also the window length sampled by get_batch
config.num_attention_heads = 8  # fetched value: 12
config.num_hidden_layers = 8  # fetched value: 12
config.device = device  # not in the printed config; read back by test_deep
config.dtype = torch.float32  # not in the printed config; presumably consumed by the model code — TODO confirm

In [10]:
# Number of windows sampled per batch by get_batch; also part of the
# minimum-document-length check in get_data_from_random_files.
batch_size = 8

In [11]:
# Fetch the tokenizer for `model_ckpt` through the project's tokenizer wrapper;
# the displayed result is a BertTokenizerFast with a 30522-entry vocabulary.
tokenizer = TokenizerFromPretrained(model_ckpt=model_ckpt).fetch()
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [12]:
import pandas as pd

In [13]:
import matplotlib.pyplot as plt

In [14]:
import seaborn as sns

In [15]:
import numpy as np

In [16]:
import os

In [17]:
import pickle

In [18]:
import random

In [19]:
import re

In [20]:
# Number of rows in the 'train' split; each row is treated as one file/document.
files_count = len(data['train'])
files_count

741

In [21]:
# Row-index boundaries used by get_data_from_random_files to carve the documents
# into 'train' (first 80%), 'test' (next 10%) and 'val' (last 10%).
split_index_1 = int(files_count * 0.8)
split_index_2 = int(files_count * 0.9)
split_index_1, split_index_2

(592, 666)

In [22]:
def get_data_from_random_files(split, max_attempts=1000):
    """Tokenize one randomly chosen document of the requested split.

    The rows of ``data['train']`` are partitioned by position into three
    non-overlapping, half-open ranges:

    * ``'train'`` -> ``[0, split_index_1)``
    * ``'test'``  -> ``[split_index_1, split_index_2)``
    * ``'val'``   -> ``[split_index_2, files_count)``

    A document is rejected (and another one drawn) when it tokenizes to fewer
    than ``config.max_position_embeddings + 2 * batch_size`` ids.

    Args:
        split: One of ``'train'``, ``'test'`` or ``'val'``.
        max_attempts: Upper bound on the number of documents drawn before
            giving up.

    Returns:
        The first row of the tokenizer's ``input_ids`` for the chosen document
        (tokenized without special tokens).

    Raises:
        ValueError: If ``split`` is not a known split name.
        RuntimeError: If no sufficiently long document was drawn within
            ``max_attempts`` tries.
    """
    if split == 'train':
        low, high = 0, split_index_1
    elif split == 'test':
        low, high = split_index_1, split_index_2
    elif split == 'val':
        low, high = split_index_2, files_count
    else:
        raise ValueError(f"unknown split {split!r}; expected 'train', 'test' or 'val'")

    min_length = config.max_position_embeddings + 2 * batch_size
    for _ in range(max_attempts):
        # randrange excludes `high`. random.randint includes both endpoints,
        # which would let 'val' draw index `files_count` (out of range) and let
        # adjacent splits share their boundary document.
        file_id = random.randrange(low, high)
        file = data['train'][file_id]['text']

        input_ids = tokenizer(file, return_tensors='pt', add_special_tokens=False).input_ids[0]
        if len(input_ids) >= min_length:
            return input_ids

    raise RuntimeError(
        f'no document with at least {min_length} tokens found in split {split!r} '
        f'after {max_attempts} attempts'
    )

In [23]:
def get_batch(split):
    """Sample a batch of next-token-prediction pairs from one random document.

    One document of `split` is tokenized, then `batch_size` start offsets are
    drawn uniformly at random. Every input row is a window of
    `config.max_position_embeddings` consecutive token ids; the matching target
    row is the same window shifted one token to the right.

    Returns:
        (x, y): two int64 tensors on `device`, one row per sampled window.
    """
    tokens = get_data_from_random_files(split=split)
    window = config.max_position_embeddings
    # The exclusive upper bound keeps the shifted target window inside the document.
    starts = torch.randint(len(tokens) - window, (batch_size,))
    inputs = torch.stack([tokens[s:s + window] for s in starts])
    targets = torch.stack([tokens[s + 1:s + window + 1] for s in starts])
    inputs = inputs.to(torch.int64).to(device)
    targets = targets.to(torch.int64).to(device)
    return inputs, targets

In [24]:
# Smoke test: tokenize one randomly chosen training document. The warning in the
# output is emitted because the whole document (1524018 tokens in this run) is
# tokenized in one call; get_batch later slices short windows out of these ids.
get_data_from_random_files('train')

Token indices sequence length is longer than the specified maximum sequence length for this model (1524018 > 512). Running this sequence through the model will result in indexing errors


tensor([1045, 2156, 2054,  ..., 1996, 3409, 1012])

In [25]:
# Smoke test: draw one (inputs, targets) batch. In the output each target row is
# the corresponding input row shifted by one token.
get_batch('train')

(tensor([[ 5138,  2256, 15961,  ...,  2068,  1012,  1996],
         [ 4932,  2077,  2002,  ..., 12707,  1059, 29602],
         [ 2172,  1010,  2339,  ...,  2057,  2089,  2196],
         ...,
         [ 9496,  5267,  1997,  ..., 20682,  2008, 25407],
         [ 4452,  2025,  1012,  ...,  2492,  2000,  2108],
         [13550,  1999,  2026,  ...,  2151,  2204, 18971]], device='cuda:0'),
 tensor([[ 2256, 15961,  1012,  ...,  1012,  1996, 12168],
         [ 2077,  2002,  2018,  ...,  1059, 29602,  2140],
         [ 1010,  2339,  2123,  ...,  2089,  2196,  2113],
         ...,
         [ 5267,  1997,  2335,  ...,  2008, 25407,  2068],
         [ 2025,  1012,  1046,  ...,  2000,  2108,  7463],
         [ 1999,  2026,  4540,  ...,  2204, 18971,  2000]], device='cuda:0'))

In [26]:
from transformer.head.text_generator_decoder_only import TextGenerator

In [27]:
# Build the text generator model from the modified config.
generator = TextGenerator(config)

In [28]:
# Move the model to the selected device; `gen` is the name the remaining cells
# use for the model.
gen = generator.to(device)

In [29]:
# Number of training steps between calls to estimate_loss in the training loops.
eval_iters = 100

In [30]:
# Number of batches evaluated per split inside estimate_loss.
eval_iter_ticks = 5

In [31]:
# Smoke test: one forward pass on a training batch. The model is called with the
# inputs, the targets and the tokenizer, and returns a (logits, loss) pair.
X, Y = get_batch('train')
logits, loss = gen(X, Y, tokenizer)
logits, logits.size(), loss

(tensor([[[ 1.7339e-01, -1.3702e-01,  5.3929e-01,  ..., -2.3304e-01,
            2.0232e-01, -6.9563e-01],
          [ 6.3313e-01,  3.0445e-01, -1.2546e+00,  ...,  5.2929e-01,
            4.2256e-01,  1.9288e-01],
          [ 7.4196e-02, -3.1074e-01,  4.6293e-02,  ...,  4.9908e-02,
            4.1873e-01,  6.5812e-01],
          ...,
          [-8.5314e-01,  1.7837e-01, -4.8513e-01,  ..., -3.3449e-02,
            6.3130e-01, -2.2827e-01],
          [ 4.8797e-02, -2.2342e-02,  9.1830e-01,  ..., -3.7052e-02,
            1.9870e-01, -1.0240e+00],
          [-6.6562e-01, -5.8883e-01, -1.1472e+00,  ...,  1.7549e-02,
           -8.2376e-02,  1.9792e-01]],
 
         [[-1.4724e-01, -9.3736e-01, -1.2256e-02,  ..., -9.7147e-01,
            1.4565e-01, -4.7422e-01],
          [ 6.9703e-01,  7.7893e-02, -4.2811e-01,  ...,  1.0970e+00,
            2.9015e-01, -3.1483e-01],
          [-9.2293e-01,  5.8921e-01, -2.9144e-01,  ...,  7.0853e-01,
            1.8725e-01,  1.0020e+00],
          ...,
    

In [32]:
# When False, the training cells run and the checkpoint cells write
# 'text_generator.pkl' (both are guarded by `(not pretrained) or build_pretrained`).
pretrained = False

In [33]:
# When True, the model is pickled before training starts and the training and
# checkpoint cells run regardless of `pretrained`.
build_pretrained = False

In [34]:
# Total number of elements across all model parameters.
sum(p.numel() for p in gen.parameters())

99515706

In [35]:
def test(prompt='She was going to '):
    prompt_ids = tokenizer(prompt, return_tensors='pt', add_special_tokens=False).input_ids
    generated_ids = gen.generate(prompt_ids.to(device), max_new_tokens=20)
    result = tokenizer.decode(generated_ids[0])
    print(result)

In [36]:
def test_deep(logits, targets):
    # focus only on the last time step
    logits = logits[:, -1, :] # becomes (B, C)
    # apply softmax to obtain probabilities
    probs = torch.nn.functional.softmax(logits, dim=-1).to(config.device) # (B, C)
    index_next = torch.multinomial(probs, num_samples=1) # (B, C)

    print(tokenizer.decode(index_next.squeeze(-1)))
    print(tokenizer.decode(targets[:, -1]))

In [37]:
@torch.no_grad()
def estimate_loss():
    out = {}
    if (not pretrained) or build_pretrained:
        with open('text_generator.pkl', 'wb') as handler:
            pickle.dump(gen, handler)
    gen.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iter_ticks):
            X, Y = get_batch(split)
            logits, loss = gen(X, Y)
            losses[k] = loss.item()
            test_deep(logits, Y)
        out[split] = losses.mean()
    gen.train()
    return out
    

In [38]:
# Learning rate passed to the AdamW optimizer in the training cell.
# NOTE(review): the logged run of the training cell shows the loss rising from
# 10.49 to 188.39 within its first five steps; this rate may be too high — TODO confirm.
learning_rate = 5e-3

In [39]:
# Number of optimization steps each training loop iterates over.
iterations = 1000000

In [40]:
# When build_pretrained is set, write the freshly constructed model to
# 'text_generator.pkl' before any training step has run.
# NOTE(review): this pickles the whole model object rather than only its weights,
# so loading it back requires the same class definitions to be importable.
if build_pretrained:
    with open('text_generator.pkl', 'wb') as handler:
        pickle.dump(gen, handler)

In [41]:
# Main training loop. Runs only when training from scratch or when explicitly
# rebuilding the pretrained checkpoint.
if (not pretrained) or build_pretrained:
    # AdamW over all model parameters, using the notebook-level learning_rate.
    optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(iterations):
        try:
            # Every eval_iters steps (but not at step 0): evaluate, which also
            # checkpoints the model as a side effect of estimate_loss.
            if (step != 0) and (step % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {step} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # Evaluate the loss. The module is called directly (not via
            # .forward) so registered hooks run, matching the smoke-test cell.
            logits, loss = gen(xb, yb, tokenizer)

            if not torch.isnan(loss).any():
                test_deep(logits, yb)
                print(step, loss)
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                # NOTE(review): gradient clipping is disabled. The logged run
                # diverges quickly, so consider
                # torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=...)
                # and/or a lower learning_rate — TODO confirm.
                optimizer.step()
            else:
                # Skip the update and pause for manual inspection; training
                # resumes once input is provided.
                print('NaN loss')
                print(step, loss)
                input()
        except Exception as e:
            # Best effort: keep training, but report where and what failed
            # instead of printing only the bare message.
            print(f'step {step}: {type(e).__name__}: {e}')

##leteuf plaza learned digit missile zeke appellate
yelled her lethal steep who, of.
0 tensor(10.4858, device='cuda:0', grad_fn=<NllLossBackward0>)
'charged the.. the..
brianending open logical well from taken,
1 tensor(13.5196, device='cuda:0', grad_fn=<NllLossBackward0>)
tariff umar, reaper the [unused821] the the
, he is shay a n her help
2 tensor(15.0619, device='cuda:0', grad_fn=<NllLossBackward0>)
,,,,,,,,
woman sea, unfortunately feel'squeezed past
3 tensor(62.4383, device='cuda:0', grad_fn=<NllLossBackward0>)
months arrived arrived arrived arrived arrived arrived months
. class that beyond'screaming be layer
4 tensor(188.3927, device='cuda:0', grad_fn=<NllLossBackward0>)


In [None]:
# Generate a continuation for a multi-sentence prompt.
test('Summer season is upon us and you are all set to head to the beach or to an outdoor event. Outfit. Check, Sunglasses. Check. Hat. Check. Sunscreen? Well, check the ingredients before applying it. While sunscreen is essential for protecting your skin')

In [None]:
# Generate a continuation for a very short prompt.
test('An apple is')

In [None]:
# Generate a continuation for a prompt that ends mid-sentence.
test('This was the greatest outcome for me because')

In [None]:
# Total number of elements across all parameters of the current `gen`
# (same expression as cell In [34]).
sum(p.numel() for p in gen.parameters())

In [None]:
# Generate a continuation for a prompt kept in the notebook-level `prompt` variable.
prompt = 'They wanted to fight and '
test(prompt)

In [None]:
# Rebind `prompt` to a longer narrative prompt and generate a continuation for it.
prompt = 'He was walking on the middle of road when a car started coming at high speed behind him '
test(prompt)

In [None]:
# Write the current model object to 'text_generator.pkl', under the same guard
# as the training loop.
if (not pretrained) or build_pretrained:
    with open('text_generator.pkl', 'wb') as handler:
        pickle.dump(gen, handler)

In [None]:
# Rebind `gen` to the model object stored in 'text_generator.pkl'.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook produced.
# Any optimizer created before this cell still references the parameters of the
# previous `gen` object, not the one loaded here.
with open('text_generator.pkl', 'rb') as handler:
    gen = pickle.load(handler)

In [None]:
# Continue training the model currently bound to `gen`, under the same guard as
# the first training loop.
if (not pretrained) or build_pretrained:
    # Build the optimizer from the *current* `gen`. An optimizer created in an
    # earlier cell holds the parameters of whatever object `gen` referred to at
    # that time. Once `gen` is rebound (e.g. by loading the pickle), stepping
    # that old optimizer would leave this model unchanged.
    optimizer = torch.optim.AdamW(gen.parameters(), lr=learning_rate, weight_decay=0.01, amsgrad=True)

    # `step` rather than `iter`, so the builtin iter() is not shadowed.
    for step in range(iterations):
        try:
            # Every eval_iters steps (but not at step 0): evaluate, which also
            # checkpoints the model as a side effect of estimate_loss.
            if (step != 0) and (step % eval_iters == 0):
                losses = estimate_loss()
                train_loss = losses['train']
                val_loss = losses['val']
                print(f'Loss at step = {step} for train data is {train_loss:.4f} for val it is {val_loss:.4f}')

            # sample a batch of data
            xb, yb = get_batch('train')

            # Evaluate the loss. The module is called directly (not via
            # .forward) so registered hooks run, matching the smoke-test cell.
            logits, loss = gen(xb, yb, tokenizer)
            if not torch.isnan(loss).any():
                test_deep(logits, yb)
                print(step, loss)
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                # NOTE(review): gradient clipping is disabled; consider
                # torch.nn.utils.clip_grad_norm_(gen.parameters(), max_norm=...)
                # if the loss diverges — TODO confirm.
                optimizer.step()
            else:
                # Skip the update and pause for manual inspection; training
                # resumes once input is provided.
                print('NaN loss')
                print(step, loss)
                input()
        except Exception as e:
            # Best effort: keep training, but report where and what failed
            # instead of printing only the bare message.
            print(f'step {step}: {type(e).__name__}: {e}')