In [1]:
with open("./data/the-verdict.txt", encoding='utf-8') as f:
    raw_text = f.read()
print(len(raw_text))
print(raw_text[:99])

20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text) 
preprocessed = [item.strip() for item in preprocessed if item.strip()] 
print(len(preprocessed))

4690


In [3]:
all_words = sorted(set(preprocessed)) 
vocab_size = len(all_words) 
print(vocab_size)

1130


In [4]:
# Creating Vocabulary
vocab = {token:integer for integer,token in enumerate(all_words)} 
for i, item in enumerate(vocab.items()): 
    print(item) 
    if i >= 50: 
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [5]:
class SimpleTokenizerV1: 
    def __init__(self, vocab): 
        self.str_to_int = vocab 
        self.int_to_str = {i:s for s,i in vocab.items()}  
    
    def encode(self, text): 
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text) 
        preprocessed = [ item.strip() for item in preprocessed if item.strip() ]  
        ids = [self.str_to_int[s] for s in preprocessed] 
        return ids  
    
    def decode(self, ids): 
        text = " ".join([self.int_to_str[i] for i in ids])  
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) 
        return text

In [6]:
all_tokens = sorted(list(set(preprocessed))) 
all_tokens.extend(["<|endoftext|>", "<|unk|>"]) 
vocab = {token:integer for integer,token in enumerate(all_tokens)}  
print(len(vocab.items()))

1132


In [7]:
print(vocab['do'])

355


In [8]:
class SimpleTokenizerV2: 
    def __init__(self, vocab): 
        self.str_to_int = vocab 
        self.int_to_str = {i:s for s,i in vocab.items()}  
    
    def encode(self, text): 
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text) 
        preprocessed = [ item.strip() for item in preprocessed if item.strip() ]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        ids = [self.str_to_int[s] for s in preprocessed] 
        return ids  
    
    def decode(self, ids): 
        text = " ".join([self.int_to_str[i] for i in ids])  
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) 
        return text

In [9]:
text1 = "Hello,do you like tea?" 
text2 = "In the sunlit terraces of the palace." 
text = " <|endoftext|> ".join((text1, text2)) 
print(text)

Hello,do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [10]:
tokenizer = SimpleTokenizerV2(vocab) 
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [11]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [12]:
import tiktoken
tiktoken.__version__

'0.8.0'

In [13]:
sample_text = "Akwirw ier"
tokenizer = tiktoken.get_encoding("gpt2")

In [14]:
tokenizer.encode(sample_text)

[33901, 86, 343, 86, 220, 959]

In [15]:
tokenizer.decode(tokenizer.encode(sample_text))

'Akwirw ier'

In [16]:
tokenizer.decode([343])

'ir'

In [17]:
import torch 
from torch.utils.data import DataLoader, Dataset

class GPTDatasetV1(Dataset): 
    def __init__(self,txt, tokenizer, max_length, stride):
        super().__init__()
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride): 
            input_chunk = token_ids[i:i + max_length] 
            target_chunk = token_ids[i + 1: i + max_length + 1] 
            self.input_ids.append(torch.tensor(input_chunk)) 
            self.target_ids.append(torch.tensor(target_chunk))
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]

In [18]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0): 
    tokenizer = tiktoken.get_encoding("gpt2") 
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride) 
    dataloader = DataLoader( dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers )  
    return dataloader

In [19]:
with open("./data/the-verdict.txt", encoding='utf-8') as f:
    raw_text = f.read()

dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)

In [20]:
first_batch

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]

In [21]:
second_batch = next(data_iter)
second_batch

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]

In [22]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=8, stride=2, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)

print(first_batch, second_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])] [tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257]])]


In [23]:
# Coding Final DataLoader and Embedding Layer
max_length = 4 
dataloader = create_dataloader_v1( raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)

data_iter = iter(dataloader) 
inputs, targets = next(data_iter)

vocab_size = 50257 
output_dim = 256 
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embeddings = token_embedding_layer(inputs)

print(token_embeddings.shape)

context_length = max_length 
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim) 
pos_embeddings = pos_embedding_layer(torch.arange(context_length)) 
print(pos_embeddings.shape)

input_embeddings = token_embeddings + pos_embeddings 
print(input_embeddings.shape)


torch.Size([8, 4, 256])
torch.Size([4, 256])
torch.Size([8, 4, 256])


In [24]:
# Casual Attention
import torch.nn as nn
class CausalAttention(nn.Module): 
    def __init__(self, d_in, d_out, context_length, dropout, qkv_bias=False): 
        super().__init__() 
        self.d_out = d_out 
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) 
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) 
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) 
        self.dropout = nn.Dropout(dropout) 
        self.register_buffer( 'mask', torch.triu(torch.ones(context_length, context_length), diagonal=1) )  
    def forward(self, x): 
        b, num_tokens, d_in = x.shape 
        keys = self.W_key(x) 
        queries = self.W_query(x) 
        values = self.W_value(x)  
        attn_scores = queries @ keys.transpose(1, 2) 
        attn_scores.masked_fill_( self.mask.bool()[:num_tokens, :num_tokens], -torch.inf) 
        attn_weights = torch.softmax( attn_scores / keys.shape[-1]**0.5, dim=-1 )  
        attn_weights = self.dropout(attn_weights)  
        context_vec = attn_weights @ values 
        return context_vec

In [25]:
d_in = 4
d_out = 2
inputs, targets = next(data_iter)
batch = torch.stack((inputs, inputs), dim=0)
batch = batch.to(torch.float32)
#print(batch.shape)
torch.manual_seed(123) 
context_length = batch.shape[1] 
ca = CausalAttention(d_in, d_out, context_length, 0.0) 
context_vecs = ca(batch) 
print("context_vecs.shape:", context_vecs.shape)

context_vecs.shape: torch.Size([2, 8, 2])


In [26]:
# Multihead Attention

class MultiHeadAttention(nn.Module): 
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False): 
        super().__init__() 
        assert (d_out % num_heads == 0),"d_out must be divisible by num_heads"  
        self.d_out = d_out 
        self.num_heads = num_heads 
        self.head_dim = d_out // num_heads 
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias) 
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias) 
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias) 
        self.out_proj = nn.Linear(d_out, d_out) 
        self.dropout = nn.Dropout(dropout) 
        self.register_buffer( "mask", torch.triu(torch.ones(context_length, context_length), diagonal=1) )  
    def forward(self, x): 
        b, num_tokens, d_in = x.shape 
        keys = self.W_key(x) 
        queries = self.W_query(x) 
        values = self.W_value(x)  
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) 
        values = values.view(b, num_tokens, self.num_heads, self.head_dim) 
        queries = queries.view( b, num_tokens, self.num_heads, self.head_dim )  
        keys = keys.transpose(1, 2) 
        queries = queries.transpose(1, 2) 
        values = values.transpose(1, 2)  
        attn_scores = queries @ keys.transpose(2, 3) 
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]  
        attn_scores.masked_fill_(mask_bool, -torch.inf)  
        attn_weights = torch.softmax( attn_scores / keys.shape[-1]**0.5, dim=-1) 
        attn_weights = self.dropout(attn_weights)  
        context_vec = (attn_weights @ values).transpose(1, 2)  
        context_vec = context_vec.contiguous().view( b, num_tokens, self.d_out )  
        context_vec = self.out_proj(context_vec) 
        return context_vec

In [27]:
batch.shape

torch.Size([2, 8, 4])

In [28]:
torch.manual_seed(123) 
batch_size, context_length, d_in = batch.shape 
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0.0, num_heads=2) 
context_vecs = mha(batch) 
print(context_vecs) 
print("context_vecs.shape:", context_vecs.shape)

tensor([[[  462.0290,   762.9630],
         [-1123.9601,  -801.5990],
         [ -772.2999,  -312.3695],
         [ -884.3900,  -565.2656],
         [ 5737.0591,  9067.5664],
         [ 1557.8021,  4944.7734],
         [  894.6220,  4290.5532],
         [ 9075.3066, 14684.4844]],

        [[  462.0290,   762.9630],
         [-1123.9601,  -801.5990],
         [ -772.2999,  -312.3695],
         [ -884.3900,  -565.2656],
         [ 5737.0591,  9067.5664],
         [ 1557.8021,  4944.7734],
         [  894.6220,  4290.5532],
         [ 9075.3066, 14684.4844]]], grad_fn=<ViewBackward0>)
context_vecs.shape: torch.Size([2, 8, 2])


In [29]:
GPT_CONFIG_124M = { "vocab_size": 50257, # Vocabulary size 
                   "context_length": 1024, # Context length 
                   "emb_dim": 768, # Embedding dimension 
                   "n_heads": 12, # Number of attention heads 
                   "n_layers": 12, # Number of layers 
                   "drop_rate": 0.1, # Dropout rate 
                   "qkv_bias": False, # Query-Key-Value bias
                   }

In [30]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean = x.mean(dim=-1, keepdim=True) 
        var = x.var(dim=-1, keepdim=True, unbiased=False) 
        norm_x = (x - mean) / torch.sqrt(var + self.eps) 
        return self.scale * norm_x + self.shift

In [31]:
class GELU(nn.Module): 
    def __init__(self): 
        super().__init__()  
    def forward(self, x): 
        return 0.5 * x * (1 + torch.tanh( torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module): 
    def __init__(self, cfg): 
        super().__init__() 
        self.layers = nn.Sequential( nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]), 
                                    GELU(), 
                                    nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]), )  
    def forward(self, x): 
        return self.layers(x)

In [32]:
class TransformerBlock(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=config["emb_dim"],
            d_out=config["emb_dim"],
            context_length=config['context_length'],
            dropout=config["drop_rate"], 
            num_heads=config["n_heads"],
            qkv_bias=config["qkv_bias"]
        )
        self.feed_forward = FeedForward(config)
        self.norm1 = LayerNorm(config["emb_dim"])
        self.norm2 = LayerNorm(config["emb_dim"])
        self.drop_shortcut = nn.Dropout(config["drop_rate"])
    def forward(self,x):
        shortcut = x 
        x = self.norm1(x) 
        x = self.att(x) 
        x = self.drop_shortcut(x) 
        x = x + shortcut  
        shortcut = x 
        x = self.norm2(x) 
        x = self.feed_forward(x) 
        x = self.drop_shortcut(x) 
        x = x + shortcut 
        return x



In [33]:
class GPTModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embd = nn.Embedding(config['vocab_size'], config['emb_dim'])
        self.pos_embd = nn.Embedding(config['context_length'], config['emb_dim'])
        self.drop_embd = nn.Dropout(config['drop_rate'])
        self.trf_blocks = nn.Sequential(*[TransformerBlock(config) for _ in range(config['n_layers'])])
        self.final_norm = LayerNorm(config["emb_dim"])
        self.out_head = nn.Linear( config["emb_dim"], config["vocab_size"], bias=False )
    def forward(self,inp):
        batch_size, seq_len = inp.shape
        tok_embeds = self.token_embd(inp)
        pos_embeds = self.pos_embd( torch.arange(seq_len, device= inp.device) )
        x = tok_embeds + pos_embeds
        x = self.drop_embd(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        x = self.out_head(x)
        return x

In [34]:
import tiktoken  
tokenizer = tiktoken.get_encoding("gpt2") 
batch = [] 
txt1 = "Every effort moves you" 
txt2 = "Every day holds a"  
batch.append(torch.tensor(tokenizer.encode(txt1))) 
batch.append(torch.tensor(tokenizer.encode(txt2))) 
batch = torch.stack(batch, dim=0) 
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [35]:
torch.manual_seed(123) 
model = GPTModel(GPT_CONFIG_124M)  
out = model(batch) 
print("Input batch:\n", batch) 
print("\nOutput shape:", out.shape) 
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.1381,  0.0077, -0.1963,  ..., -0.0222, -0.1060,  0.1717],
         [ 0.3865, -0.8408, -0.6564,  ..., -0.5163,  0.2369, -0.3357],
         [ 0.6989, -0.1829, -0.1631,  ...,  0.1472, -0.6504, -0.0056],
         [-0.4290,  0.1669, -0.1258,  ...,  1.1579,  0.5303, -0.5549]],

        [[ 0.1094, -0.2894, -0.1467,  ..., -0.0557,  0.2911, -0.2824],
         [ 0.0882, -0.3552, -0.3527,  ...,  1.2930,  0.0053,  0.1898],
         [ 0.6091,  0.4702, -0.4094,  ...,  0.7688,  0.3787, -0.1974],
         [-0.0612, -0.0737,  0.4751,  ...,  1.2463, -0.3834,  0.0609]]],
       grad_fn=<UnsafeViewBackward0>)


In [36]:
total_params = sum(p.numel() for p in model.parameters()) 
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


In [37]:
total_params_gpt2 = ( total_params - sum(p.numel() for p in model.out_head.parameters()) )  
print(f"Number of trainable parameters " f"considering weight tying: {total_params_gpt2:,}" )

Number of trainable parameters considering weight tying: 124,412,160


In [38]:
total_multihead_params = sum([p.numel() for p in model.trf_blocks.parameters()])
print(f"Number of trainable parameters in multihead is {total_multihead_params} ")


Number of trainable parameters in multihead is 85026816 


In [39]:
trns = TransformerBlock(config=GPT_CONFIG_124M)
params = 0
for x,y in trns.named_modules():
    if 'feed_forward' in x:
         print(x)
         for p in y.parameters():
              params = params + p.numel()

print(f"Total number of Feed Forward Parameters are {params}")

feed_forward
feed_forward.layers
feed_forward.layers.0
feed_forward.layers.1
feed_forward.layers.2
Total number of Feed Forward Parameters are 14167296


In [40]:
def generate_text_simple(model, idx, max_new_tokens, context_size): 
    for _ in range(max_new_tokens): 
        idx_cond = idx[:, -context_size:] 
        with torch.no_grad(): 
            logits = model(idx_cond)  
            logits = logits[:, -1, :] 
            probas = torch.softmax(logits, dim=-1) 
            idx_next = torch.argmax(probas, dim=-1, keepdim=True) 
            idx = torch.cat((idx, idx_next), dim=1)  
            return idx

In [41]:
GPT_CONFIG_124M = { "vocab_size": 50257, "context_length": 256, "emb_dim": 768, "n_heads": 12, "n_layers": 12, "drop_rate": 0.1, "qkv_bias": False }  
torch.manual_seed(123) 
model = GPTModel(GPT_CONFIG_124M) 
model.eval()

GPTModel(
  (token_embd): Embedding(50257, 768)
  (pos_embd): Embedding(256, 768)
  (drop_embd): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Lin

In [42]:
import tiktoken
def text_to_token_ids(text,tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids,tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you" 
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple( model=model, idx=text_to_token_ids(start_context, tokenizer), max_new_tokens=10, context_size=GPT_CONFIG_124M["context_length"] )
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you renting


# Training of GPT Model

In [45]:
file_path = "./data/the-verdict.txt" 
with open(file_path, "r", encoding="utf-8") as file: 
    text_data = file.read()

total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters) 
print("Tokens:", total_tokens)

Characters: 20479
Tokens: 5145


In [48]:
train_ratio = 0.90 
split_idx = int(train_ratio * len(text_data)) 
train_data = text_data[:split_idx] 
val_data = text_data[split_idx:]

In [None]:
train_loader = create_dataloader_v1( train_data, batch_size=2, max_length=GPT_CONFIG_124M["context_length"], stride=GPT_CONFIG_124M["context_length"], drop_last=True, shuffle=True, num_workers=0 )
val_loader = create_dataloader_v1( val_data, batch_size=2, max_length=GPT_CONFIG_124M["context_length"], stride=GPT_CONFIG_124M["context_length"], drop_last=True, shuffle=False, num_workers=0 )

In [51]:
def calc_loss_batch(input_batch, target_batch, model, device): 
    input_batch = input_batch.to(device) 
    target_batch = target_batch.to(device) 
    logits = model(input_batch) 
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())  
    return loss

In [52]:
def calc_loss_loader(data_loader, model, device, num_batches=None): 
    total_loss = 0. 
    if len(data_loader) == 0: 
        return float("nan") 
    elif num_batches is None: 
        num_batches = len(data_loader) 
    else: 
        num_batches = min(num_batches, len(data_loader)) 
        for i, (input_batch, target_batch) in enumerate(data_loader): 
            if i < num_batches: 
                loss = calc_loss_batch( input_batch, target_batch, model, device )  
                total_loss += loss.item() 
            else: 
                break 
        return total_loss / num_batches

In [62]:
def generate_and_print_sample(model, tokenizer, device, start_context): 
    model.eval() 
    context_size = model.pos_embd.weight.shape[0] 
    encoded = text_to_token_ids(start_context, tokenizer).to(device) 
    with torch.no_grad(): 
        token_ids = generate_text_simple( model=model, idx=encoded, max_new_tokens=50, context_size=context_size )  
        decoded_text = token_ids_to_text(token_ids, tokenizer) 
        print(decoded_text.replace("\n", " ")) 
        model.train()

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch( input_batch, target_batch, model, device )
            loss.backward() 
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0: 
                train_loss, val_loss = evaluate_model( model, train_loader, val_loader, device, eval_iter) 
                train_losses.append(train_loss) 
                val_losses.append(val_loss) 
                track_tokens_seen.append(tokens_seen) 
                print(f"Ep {epoch+1} (Step {global_step:06d}): " f"Train loss {train_loss:.3f}, " f"Val loss {val_loss:.3f}" )  
                generate_and_print_sample( model, tokenizer, device, start_context )  
    return train_losses, val_losses, track_tokens_seen

def evaluate_model(model, train_loader, val_loader, device, eval_iter): 
    model.eval() 
    with torch.no_grad(): 
        train_loss = calc_loss_loader( train_loader, model, device, num_batches=eval_iter )  
        val_loss = calc_loss_loader( val_loader, model, device, num_batches=eval_iter )  
        model.train() 
    return train_loss, val_loss



In [63]:
torch.manual_seed(123) 
model = GPTModel(GPT_CONFIG_124M) 
device = 'cuda'
model.to(device) 
optimizer = torch.optim.AdamW( model.parameters(), lr=0.0004, weight_decay=0.1 )


num_epochs = 10 
train_losses, val_losses, tokens_seen = train_model_simple( model, train_loader, val_loader, optimizer, device, num_epochs=num_epochs, eval_freq=5, eval_iter=5, start_context="Every effort moves you", tokenizer=tokenizer )

Ep 1 (Step 000000): Train loss 9.819, Val loss 9.924
Every effort moves you,
Ep 1 (Step 000005): Train loss 8.069, Val loss 8.340
Every effort moves you,
Ep 2 (Step 000010): Train loss 6.623, Val loss 7.051
Every effort moves you,
Ep 2 (Step 000015): Train loss 6.045, Val loss 6.601
Every effort moves you,
Ep 3 (Step 000020): Train loss 5.518, Val loss 6.519
Every effort moves you,
Ep 3 (Step 000025): Train loss 5.361, Val loss 6.382
Every effort moves you,
Ep 4 (Step 000030): Train loss 4.764, Val loss 6.242
Every effort moves you.
Ep 4 (Step 000035): Train loss 4.750, Val loss 6.362
Every effort moves you of
Ep 5 (Step 000040): Train loss 3.893, Val loss 6.149
Every effort moves you know
Ep 6 (Step 000045): Train loss 3.577, Val loss 6.189
Every effort moves you know
Ep 6 (Step 000050): Train loss 3.044, Val loss 6.139
Every effort moves you know
Ep 7 (Step 000055): Train loss 2.915, Val loss 6.132
Every effort moves you know
Ep 7 (Step 000060): Train loss 2.176, Val loss 6.130
Every