In [39]:
import torch
import polars as pl

In [40]:
vocab = {
    "closer": 0,
    "every": 1,
    "effort": 2,
    "forward": 3,
    "inches": 4,
    "moves": 5,
    "pizza": 6,
    "toward": 7,
    "you": 8,
}

inverse_vocab = {v: k for k, v in vocab.items()}

next_token_logits = torch.tensor(
    [4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)
probas = torch.softmax(next_token_logits, dim=0)
print(probas)
print(torch.argmax(probas).item())
print(inverse_vocab[3])

tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
        1.0120e-04, 3.5758e-01, 4.0122e-03])
3
forward


In [41]:
torch.manual_seed(123)
sample = [torch.multinomial(probas, num_samples=1).item() for i in range(1000)]

In [42]:
print(len(sample))

1000


In [43]:
sampled_ids = torch.bincount(torch.tensor(sample))

In [44]:
print(sampled_ids)

tensor([ 73,   0,   0, 582,   2,   0,   0, 343])


In [45]:
for i , freq in enumerate(sampled_ids):
    print(f"{freq} x {inverse_vocab[i]}")

73 x closer
0 x every
0 x effort
582 x forward
2 x inches
0 x moves
0 x pizza
343 x toward


In [46]:
def softmax_with_temperature(logits, temperature):
    probas = torch.softmax(logits/temperature,dim=0)
    return probas
    

In [47]:
temperatures =[1,0.1,5]
scaled_probas= [softmax_with_temperature(next_token_logits, t) for t in temperatures]

In [48]:
scaled_probas

[tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
         1.0120e-04, 3.5758e-01, 4.0122e-03]),
 tensor([1.8530e-10, 3.5189e-26, 2.6890e-38, 9.9099e-01, 5.7569e-23, 4.4220e-37,
         2.9718e-38, 9.0133e-03, 2.8514e-22]),
 tensor([0.1546, 0.0750, 0.0429, 0.2421, 0.0869, 0.0454, 0.0430, 0.2203, 0.0898])]

In [49]:
torch.topk(next_token_logits,k=3)

torch.return_types.topk(
values=tensor([6.7500, 6.2800, 4.5100]),
indices=tensor([3, 7, 0]))

In [50]:
import sys
from pathlib import Path
cur_dir= Path().cwd().parents[0]
#print(cur_dir)
sys.path.append(str(cur_dir))
from src.model.transformer_block import CONFIG, GPTModel_v2

In [51]:
GPT2_small_config = {
    "vocab_size": 50257,  # Size of the vocabulary used by the model
    "context_length": 1024,  # Maximum length of input sequences
    "emb_dim": 256,  # Dimensionality of the model's embeddings (d_model)
    "n_heads": 16,  # Number of attention heads in the multi-head attention mechanism
    "n_layers": 24,  # Number of transformer layers in the model
    "drop_rate": 0.1,  # Dropout rate for regularization
    "qkv_bias": False,  # Whether to include bias terms in the query, key, and value projections
}

model = GPTModel_v2(GPT2_small_config)

In [52]:
torch.save(model.state_dict(), "model.pth")

In [53]:
optimizer= torch.optim.AdamW(model.parameters(), lr= 0.0004, weight_decay=0.1)

In [54]:
torch.save({"model":model.state_dict(), "optimizer":optimizer.state_dict()},
           "model_and_optimizer.pth")