In [5]:
import tiktoken
import torch

from data_loader import create_dataloders
from generate import generate_text
from model import GPTModel
from train import train_model

In [2]:
# Read the whole training corpus into memory as one string.
# The encoding is given explicitly so the read does not depend on the
# platform's locale default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open('sample_text_data.txt', 'r', encoding='utf-8') as f:
    text_data = f.read()

# Preview the first 100 characters as a quick sanity check.
text_data[:100]

"Japan's ageing workforce: built to last\n\nIn his twenties he battled tuberculosis for eight years, th"

## DataLoader

In [3]:
# Build the training and validation loaders from the raw text.
train_loader, val_loader = create_dataloders(
    text_data=text_data,
    batch_size=32,
    max_length=1024,
    stride=64,
    train_ratio=0.9,
    requires_val=True,
)

In [4]:
# Pull one batch from the training loader and unpack it into inputs and labels.
inputs, labels = batch = next(iter(train_loader))

# Report both shapes and the input dtype, one per line.
print(inputs.shape, labels.shape, inputs.dtype, sep="\n")

torch.Size([32, 1024])
torch.Size([32, 1024])
torch.int64


## Instantiate model

In [None]:
# Configuration dictionary handed to GPTModel when the model is instantiated.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=1024,
    embed_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

In [6]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed torch's global RNG before the model is constructed.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# NOTE(review): the recorded output below shows pos_embed as Embedding(2, 768)
# while the config sets context_length to 1024 — confirm the output was
# produced with this config.
# Last expression of the cell, so the value returned by .to() is displayed.
model.to(device)

GPTModel(
  (tok_embed): Embedding(50257, 768)
  (pos_embed): Embedding(2, 768)
  (drop_embed): Dropout(p=0.1, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiheadAttention(
        (w_q): Linear(in_features=768, out_features=768, bias=False)
        (w_k): Linear(in_features=768, out_features=768, bias=False)
        (w_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiheadAttention(
        (w_q): Linear(in_features=768, out_f

In [7]:
# Add up the element count of every parameter the model exposes.
total_params = 0
for param in model.parameters():
    total_params += param.numel()

print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 162,224,640


## Train

In [27]:
# Train for a single epoch with AdamW.
num_epochs = 1
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004,
    weight_decay=0.1,
)

# train_model returns two values, kept here as the training and validation losses.
train_loss, val_loss = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    optimizer=optimizer,
    device=device,
)

Epoch 1/1 | train=6.1191 | val=9.4243


## Generate text

In [29]:
# Load the 'gpt2' encoding from tiktoken; the tokenizer is passed to
# generate_text in the next cell. tiktoken is imported in the notebook's
# top import cell rather than here.
tokenizer = tiktoken.get_encoding('gpt2')

In [30]:
# Generate a continuation of the prompt with max_new_tokens set to 10.
output_text = generate_text(
    model=model,
    tokenizer=tokenizer,
    device=device,
    start_context="Once upon a time",
    max_new_tokens=10,
)

In [31]:
# Bare expression so the notebook displays the generated text.
output_text

'Once upon a time And,,,,,, to to to'