In [4]:
from gpt2model.gpt import GPTModel

In [5]:
GPT_CONFIG_124M = {
        "vocab_size": 50257,
        "context_length": 256,
        "emb_dim": 768,
        "n_heads": 12,
        "n_layers": 12,
        "drop_rate": 0.1,
        "qkv_bias": False
        }

In [38]:
import torch
cfg = GPT_CONFIG_124M
model = GPTModel(GPT_CONFIG_124M)
model.eval()

def generate_text_simple(model, idx, max_new_tokens, context_size):
    # model= kwargs.get('model')
    # idx = kwargs.get('idx')
    # max_new_tokens = kwargs.get('max_new_tokens')
    # context_size = kwargs.get('context_size')
    
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]
        probas = torch.softmax(logits, dim=-1)
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

In [7]:

def text_to_ids(text, tokenizer):
    ids = tokenizer.encode(text, allowed_special={'<endoftext>'})
    encoded_batch = torch.tensor(ids).unsqueeze(0)
    return encoded_batch

def ids_to_text(ids, tokenizer):
    flat = torch.tensor(ids).squeeze(0)
    return tokenizer.decode(flat.tolist())

In [8]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')
text = 'I am khan'

generated_ids = generate_text_simple(model=model, 
                     idx = text_to_ids(text, tokenizer), 
                     max_new_tokens=6, 
                    context_size=cfg['context_length'])
print(ids_to_text(generated_ids, tokenizer))

I am khan Meet Mal crowdfunding pardon Circle outsider


  flat = torch.tensor(ids).squeeze(0)


In [9]:
text1 = 'every effort moves'
text2 = 'I really like'
inputs = torch.tensor([tokenizer.encode(text1), tokenizer.encode(text2)])
inputs.shape

torch.Size([2, 3])

In [10]:
text1 = 'every effort moves'
text2 = 'really like chocolate'
targets = torch.tensor([tokenizer.encode(text1), tokenizer.encode(text2)])
targets

tensor([[16833,  3626,  6100],
        [27485,   588, 11311]])

In [11]:
with torch.no_grad():
    logits = model(inputs)
probas = torch.softmax(logits,dim=-1)

probas.shape

torch.Size([2, 3, 50257])

In [12]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
token_ids

tensor([[[17979],
         [ 8285],
         [10625]],

        [[44973],
         [  853],
         [25086]]])

In [13]:
token_ids[0].flatten()

tensor([17979,  8285, 10625])

In [14]:
print(f'Targets batch1: {ids_to_text(targets[0], tokenizer)}')
print(f'outputs batch 1: {ids_to_text(token_ids[0].flatten(), tokenizer)}')

Targets batch1: every effort moves
outputs batch 1:  shaderegon dreams


  flat = torch.tensor(ids).squeeze(0)


In [15]:
text_idx = 0
target_probas_1 = probas[text_idx, [0,1,2], targets[text_idx]]
target_probas_1

tensor([1.6607e-05, 3.7716e-05, 1.0087e-05])

In [16]:
text_idx = 1
target_probas_1 = probas[text_idx, [0,1,2], targets[text_idx]]
target_probas_1

tensor([2.4989e-05, 1.0154e-05, 3.3944e-05])

In [17]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_1)))
avg_log_probas = torch.mean(log_probas)
avg_log_probas

tensor(-10.7952)

In [18]:
logits_flat = logits.flatten(0,1)
targets_flat = targets.flatten()
# print(targets_flat)
# logits_flat[,targets_flat]
# print(logits_flat.shape)
# logg = torch.log(,targets_flat])
# logg


In [19]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
loss

tensor(10.8468)

In [20]:
perplexity = torch.exp(loss)
print(f'Perplexity measure for loss is : {perplexity}')

Perplexity measure for loss is : 51370.23046875


In [21]:
with open('the-verdict.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

In [22]:
print(f'The total words: {len(text_data)}')
print(f'The total tokens {len(tokenizer.encode(text_data))}')

The total words: 20479
The total tokens 5145


In [23]:
train_ratio = 0.9
split_idx = int(train_ratio*len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]
print(f'Train data size : {len(train_data)} and Validation data size {len(val_data)}')

Train data size : 18431 and Validation data size 2048


In [24]:
from gpt2model.data import create_dataloader_v1

In [25]:
torch.manual_seed(123)
train_loader = create_dataloader_v1(
                    train_data,
                    batch_size=2,
                    max_length=GPT_CONFIG_124M["context_length"],
                    stride=GPT_CONFIG_124M["context_length"],
                    drop_last=True,
                    shuffle=True,
                    num_workers=0
                    )
val_loader = create_dataloader_v1(
                    val_data,
                    batch_size=2,
                    max_length=GPT_CONFIG_124M["context_length"],
                    stride=GPT_CONFIG_124M["context_length"],
                    drop_last=False,
                    shuffle=False,
                    num_workers=0
                    )

In [26]:
print('Train Loader: ')
for x,y in train_loader:
    print(x.shape,y.shape)
      
print('Validation Loader: ')
for x,y in val_loader:
    print(x.shape, y.shape)

Train Loader: 
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
Validation Loader: 
torch.Size([2, 256]) torch.Size([2, 256])


In [27]:
# calculating the batch loss
def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch = input_batch.to(device)
    target_batch = target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(
    logits.flatten(0,1), target_batch.flatten())
    return loss

In [28]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0
    if len(data_loader)==0:
        return float('nan')
    elif num_batches is None:
        num_batches = len(data_loader)
    
    else:
        num_batches = min(num_batches, len(data_loader))
        
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i< num_batches:
            loss = calc_loss_batch(
            input_batch, target_batch, model, device)
            total_loss +=loss.item()
        
        else:
            break
            
    return total_loss / num_batches

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPTModel(
  (dropout): Dropout(p=0.1, inplace=False)
  (final_norm): LayerNorm()
  (final_linear_layer): Linear(in_features=768, out_features=50257, bias=False)
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (layer_norm1): LayerNorm()
      (layer_norm2): LayerNorm()
      (mmhatt): MaskMultiHeadAttention(
        (W_Q): Linear(in_features=768, out_features=768, bias=False)
        (W_K): Linear(in_features=768, out_features=768, bias=False)
        (W_V): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (projection): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (gelu): GELU()
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
      )
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): Transform

In [30]:
with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model ,device)
    val_loss = calc_loss_loader(val_loader, model, device)

    print('Training loss:', train_loss)    
    print('Validation loss:', val_loss)

Training loss: 10.98483975728353
Validation loss: 11.023316383361816


In [31]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(
            train_loader, model, device, num_batches=eval_iter
        )
        
        val_loss = calc_loss_loader(
            val_loader, model, device, num_batches=eval_iter
        )
    model.train()
    return train_loss, val_loss

In [39]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model, idx=encoded, context_size=context_size
        )
        decoded_text = ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace('\n', ' '))
        model.train()

In [40]:
def train_model_simple(model, train_loader, val_loader,
                      optimizer, device, num_epochs,
                      eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1
    
    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(
                input_batch, target_batch, model, device
            )
            loss.backward()
            optimizer.step()
            tokens_seen +=input_batch.numel()
            global_step+=1
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                model, train_loader, val_loader, device, eval_iter
                )
                
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                
                print(f"Ep {epoch+1} (step {global_step:06d}): "
                     f"Train loss {train_loss:.3f}, "
                     f"Val_loss {val_loss: .3f}")
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
    return train_losses, val_losses, track_tokens_seen
            

In [41]:
torch.manual_seed(123)
GPT_CONFIG_124M = {
        "vocab_size": 50257,
        "context_length": 256,
        "emb_dim": 768,
        "n_heads": 2,
        "n_layers": 12,
        "drop_rate": 0.1,
        "qkv_bias": False
        }

model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004, weight_decay=0.1
    )
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer
    )

Ep 1 (step 000000): Train loss 9.731, Val_loss  9.915
Ep 1 (step 000005): Train loss 7.968, Val_loss  8.395


TypeError: generate_text_simple() takes 0 positional arguments but 1 was given