In [1]:
from gpt2model.gpt import GPTModel

In [2]:
GPT_CONFIG_124M = {
        "vocab_size": 50257,
        "context_length": 256,
        "emb_dim": 768,
        "n_heads": 12,
        "n_layers": 12,
        "drop_rate": 0.1,
        "qkv_bias": False
        }

In [4]:
import torch
cfg = GPT_CONFIG_124M
model = GPTModel(GPT_CONFIG_124M)
model.eval()

def generate_text_simple(**kwargs):
    model= kwargs.get('model')
    idx = kwargs.get('idx')
    max_new_tokens = kwargs.get('max_new_tokens')
    context_size = kwargs.get('context_size')
    
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [5]:

def text_to_ids(text, tokenizer):
    ids = tokenizer.encode(text, allowed_special={'<endoftext>'})
    encoded_batch = torch.tensor(ids).unsqueeze(0)
    return encoded_batch

def ids_to_text(ids, tokenizer):
    flat = torch.tensor(ids).squeeze(0)
    return tokenizer.decode(flat.tolist())

In [6]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
text = 'I am khan'

generated_ids = generate_text_simple(model=model, 
                     idx = text_to_ids(text, tokenizer), 
                     max_new_tokens=6, 
                    context_size=cfg['context_length'])
print(ids_to_text(generated_ids, tokenizer))

ConnectionError: HTTPSConnectionPool(host='openaipublic.blob.core.windows.net', port=443): Max retries exceeded with url: /gpt-2/encodings/main/vocab.bpe (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fd119e6db40>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [8]:
text1 = 'every effort moves'
text2 = 'I really like'
inputs = torch.tensor([tokenizer.encode(text1), tokenizer.encode(text2)])
inputs.shape

torch.Size([2, 3])

In [9]:
text1 = 'every effort moves'
text2 = 'really like chocolate'
targets = torch.tensor([tokenizer.encode(text1), tokenizer.encode(text2)])
targets

tensor([[16833,  3626,  6100],
        [27485,   588, 11311]])

In [10]:
with torch.no_grad():
    logits = model(inputs)
probas = torch.softmax(logits,dim=-1)

probas.shape

torch.Size([2, 3, 50257])

In [11]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
token_ids

tensor([[[16087],
         [11655],
         [41581]],

        [[ 2034],
         [24980],
         [38014]]])

In [12]:
token_ids[0].flatten()

tensor([16087, 11655, 41581])

In [13]:
print(f'Targets batch1: {ids_to_text(targets[0], tokenizer)}')
print(f'outputs batch 1: {ids_to_text(token_ids[0].flatten(), tokenizer)}')

Targets batch1: every effort moves
outputs batch 1:  inexaneous Persona


  flat = torch.tensor(ids).squeeze(0)


In [14]:
text_idx = 0
target_probas_1 = probas[text_idx, [0,1,2], targets[text_idx]]
target_probas_1

tensor([1.7766e-05, 1.7907e-05, 1.1899e-05])

In [15]:
text_idx = 1
target_probas_1 = probas[text_idx, [0,1,2], targets[text_idx]]
target_probas_1

tensor([1.8352e-05, 3.6146e-05, 1.8436e-05])

In [16]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_1)))
avg_log_probas = torch.mean(log_probas)
avg_log_probas

tensor(-10.6783)

In [17]:
logits_flat = logits.flatten(0,1)
targets_flat = targets.flatten()
# print(targets_flat)
# logits_flat[,targets_flat]
# print(logits_flat.shape)
# logg = torch.log(,targets_flat])
# logg


In [18]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
loss

tensor(10.8737)

In [19]:
perplexity = torch.exp(loss)
print(f'Perplexity measure for loss is : {perplexity}')

Perplexity measure for loss is : 52772.421875


In [20]:
with open('the-verdict.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

In [21]:
print(f'The total words: {len(text_data)}')
print(f'The total tokens {len(tokenizer.encode(text_data))}')

The total words: 20479
The total tokens 5145


In [22]:
train_ratio = 0.9
split_idx = int(train_ratio*len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]
print(f'Train data size : {len(train_data)} and Validation data size {len(val_data)}')

Train data size : 18431 and Validation data size 2048


In [23]:
from gpt2model.data import create_dataloader_v1

In [24]:
torch.manual_seed(123)
train_loader = create_dataloader_v1(
                    train_data,
                    batch_size=2,
                    max_length=GPT_CONFIG_124M["context_length"],
                    stride=GPT_CONFIG_124M["context_length"],
                    drop_last=True,
                    shuffle=True,
                    num_workers=0
                    )
val_loader = create_dataloader_v1(
                    val_data,
                    batch_size=2,
                    max_length=GPT_CONFIG_124M["context_length"],
                    stride=GPT_CONFIG_124M["context_length"],
                    drop_last=False,
                    shuffle=False,
                    num_workers=0
                    )

In [39]:
print('Train Loader: ')
for x,y in train_loader:
    print(x.shape,y.shape)
      
print('Validation Loader: ')
for x,y in val_loader:
    print(x.shape, y.shape)

Train Loader: 
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
Validation Loader: 
torch.Size([2, 256]) torch.Size([2, 256])
