<a href="https://colab.research.google.com/github/prapti2024/LLM_from_scratch/blob/main/gpt_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## TOKENIZATION


Import tiktoken


In [1]:
import torch

In [2]:
!pip3 install tiktoken



In [3]:
import importlib
import tiktoken
print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [4]:
tokenizer = tiktoken.get_encoding("gpt2")

Implementing a DataLoader

In [5]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [6]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

Creating Token Embeddings

In [7]:
vocab_size = 50257
output_dim = 256
context_length = 4

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [8]:
print(token_embedding_layer.weight)


Parameter containing:
tensor([[ 0.3338, -1.8135, -0.1289,  ..., -0.0517,  0.8661,  1.5792],
        [-1.9769,  0.2740,  0.5075,  ...,  1.0264, -0.4150, -1.7623],
        [-0.0171, -1.8730, -0.4840,  ...,  0.5212, -0.2117,  0.0134],
        ...,
        [ 1.6442, -1.1395, -0.1527,  ...,  0.6767, -0.0858, -0.5469],
        [ 1.5911,  2.2329,  2.0597,  ..., -0.3386, -0.1779,  0.4726],
        [ 1.2682,  1.4984, -1.1810,  ...,  0.2742, -0.7217,  0.4535]],
       requires_grad=True)


In [9]:
token_embeddings = token_embedding_layer(torch.arange(context_length))
print(token_embeddings.shape)

torch.Size([4, 256])


Positional embeddings


In [10]:
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [11]:
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [12]:
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [13]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([4, 256])


Creating an instance of data loader.

# GPT-2 BASED TRANSFORMER BLOCK

In [14]:
GPT_CONFIG_124M = {
    "vocab_size" : 50257,
    "context_length" : 1024,
    "emb_dim" : 768,
    "n_heads": 12, #no of attention heads
    "n_layers" : 12, #no_of transformers
    "drop_rate" : 0.1, #10% of neurons are set to 0
    "qkv_bias" :  False #not required bias term rn
}

**Layer Normalization class**

In [15]:
import torch
from torch import nn

In [16]:
class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.eps = 1e-5
    self.scale = nn.Parameter(torch.ones(emb_dim))
    self.shift = nn.Parameter(torch.zeros(emb_dim))

  def forward(self,x):
    mean = x.mean(dim = -1,keepdim = True )
    var = x.var(dim = -1,keepdim = True, unbiased = False)
    norm_x = (x - mean)/torch.sqrt(var + self.eps)
    return self.scale * norm_x + self.shift




**GELU ACTIVATION FUNCTION**

In [17]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self,x):
     return 0.5 * x *(1 + torch.tanh(torch.sqrt(torch.tensor(2/torch.pi)) * (x + 0.044715 * torch.pow(x,3))))

**Feed Forward Layer**

In [18]:
class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
        GELU(),
        nn.Linear(4*cfg["emb_dim"],cfg["emb_dim"]),
    )

  def forward(self,x):
    return self.layers(x)


Multihead Attention Class

In [19]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec


**TRANSFORMER BLOCK**

In [20]:
class TransformerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.att = MultiHeadAttention(
        d_in = cfg["emb_dim"],
        d_out = cfg["emb_dim"],
        context_length = cfg["context_length"],
        dropout = cfg["drop_rate"],
        num_heads = cfg["n_heads"],
        qkv_bias=cfg["qkv_bias"]
    )
    self.ff = FeedForward(cfg)
    self.norm1 = LayerNorm(cfg["emb_dim"])
    self.norm2 = LayerNorm(cfg["emb_dim"])
    self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

  def forward(self,x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x


In [21]:
torch.manual_seed(123)
x = torch.rand(2, 4, 768)
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)
print("Input size: \n",x.shape)
print("Output size: \n",output.shape)

Input size: 
 torch.Size([2, 4, 768])
Output size: 
 torch.Size([2, 4, 768])


# CODE ENTIRE GPT-2 MODEL

In [22]:
class GPTModel(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.drop_emb = nn.Dropout(cfg["drop_rate"])

    self.trf_blocks = nn.Sequential(
        *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )

    self.final_norm = LayerNorm(cfg["emb_dim"])
    self.out_head = nn.Linear(cfg["emb_dim"],cfg["vocab_size"],bias = False)

  def forward(self,in_idx):
      batch_size,seq_len = in_idx.shape
      tok_embeddings = self.tok_emb(in_idx)
      pos_embeddings = self.pos_emb(torch.arange(seq_len,device = in_idx.device))
      x = tok_embeddings + pos_embeddings
      x = self.drop_emb(x)
      x = self.trf_blocks(x)
      x = self.final_norm(x)
      logits = self.out_head(x)
      return logits


In [23]:
batch = torch.tensor([[6109,3626,6100,345],
                      [6109,1110,6622,257]])

In [24]:
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
out = model(batch)
print("Input: ",batch)
print("Output Shape : ",out.shape)
print(out)

Input:  tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output Shape :  torch.Size([2, 4, 50257])
tensor([[[-0.0658,  0.2695,  0.2692,  ...,  1.3852, -0.0546, -0.5503],
         [ 0.3035, -0.1901, -0.7108,  ...,  0.3340,  0.5741, -0.2450],
         [-0.1009,  0.3720, -0.3529,  ..., -0.2624, -0.1344,  0.6451],
         [ 0.2338,  0.9179,  0.4180,  ...,  0.6700, -1.1723, -0.1790]],

        [[-0.1588,  0.3306,  0.4211,  ...,  1.3506,  0.4324, -0.5578],
         [ 0.5375,  0.0084, -0.0601,  ...,  0.7958,  0.5095, -0.1663],
         [ 0.0115, -0.1516, -0.1547,  ..., -0.0059, -1.0677,  0.2764],
         [ 0.3840,  0.5324,  0.4590,  ...,  0.8971, -0.8554, -0.2125]]],
       grad_fn=<UnsafeViewBackward0>)


In [25]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

Total number of parameters: 200820480


In [26]:
#Space taken by model
total_size_bytes = 4 * total_params
total_size_mb = total_size_bytes/(1024**2)
print(f"Total size of the model: {total_size_mb:.2f} MB")

Total size of the model: 766.07 MB


## Predict the next token from logits

In [27]:
#Every vocabulary and its corresponding id are seen to predict new word
def generate_text_simple(model,idx,max_new_tokens,context_size):
  for _ in range(max_new_tokens):
    idx_cond = idx[:, -context_size:] #Only use last 5 tokens

    with torch.no_grad():
      logits = model(idx_cond)
      logits = logits[:, -1, :] #Take last row from each batch
      probas = torch.softmax(logits,dim = -1)
      idx_next = torch.argmax(probas,dim = -1,keepdim = True)
      idx = torch.cat((idx,idx_next),dim = 1)
  return idx



In [28]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
print(f"Encoded Tensor: {encoded}")
print(f"Encoded Tensor Shape: {encoded_tensor.shape}")

#Output is input batch

Encoded Tensor: [15496, 11, 314, 716]
Encoded Tensor Shape: torch.Size([1, 4])


In [29]:
model.eval()
out = generate_text_simple(
    model = model,
    idx = encoded_tensor,
    max_new_tokens = 6,
    context_size = GPT_CONFIG_124M['context_length']
)
print("Output:",out)
print("Output Length:",len(out[0]))
#Output length is 4 input token + 6 new token

Output: tensor([[15496,    11,   314,   716, 15851,  4029, 45838, 37045, 44119,  4933]])
Output Length: 10


In [30]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)
#The result is random because we havent trained it yet, we will train it later

Hello, I am eternalml PLAATIVE premieredUp


Measuring the loss function

In [31]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)


In [32]:
inputs = torch.tensor([[16833, 3626, 6100],
                       [40,1107,588]])

targets = torch.tensor([[3626, 6100, 345],
                        [1107,588,11311]])

In [33]:
with torch.no_grad():
  logits = model(batch)

  probas = torch.softmax(logits,dim = -1)
  print(probas.shape)

torch.Size([2, 4, 50257])


In [34]:
#Argmax returns indices of maximun values in tenosor
token_ids = torch.argmax(probas,dim = -1, keepdim = True)
print("Token IDs: ", token_ids)

Token IDs:  tensor([[[ 4125],
         [13071],
         [46228],
         [31069]],

        [[ 4125],
         [42457],
         [20787],
         [14817]]])


In [35]:
print(f"The target is: {token_ids_to_text(token_ids[0].flatten(),tokenizer)}")
print(f"The output is :{token_ids_to_text(targets[0].flatten(),tokenizer)}")

The target is: spe Hans Extend Hearth
The output is : effort moves you


In [36]:
text_idx0 = 0
target_probas_1 = probas[text_idx0,[0,1,2],targets[text_idx0]]
print("Text1: ",target_probas_1)

text_idx1 = 1
target_probas_2 = probas[text_idx1,[0,1,2],targets[text_idx1]]
print("Text2: ",target_probas_2)

Text1:  tensor([2.4666e-05, 4.8920e-05, 9.9761e-06])
Text2:  tensor([6.2990e-05, 2.8812e-05, 1.4677e-05])


In [37]:
log_probas= torch.log(torch.cat((target_probas_1,target_probas_2)))
print(log_probas)

tensor([-10.6101,  -9.9253, -11.5153,  -9.6725, -10.4547, -11.1292])


In [38]:
avg_log_probas = torch.mean(log_probas)
print(avg_log_probas)

tensor(-10.5512)


In [39]:
neg_avg_log_probas = -1 * avg_log_probas
print(neg_avg_log_probas)

tensor(10.5512)


Perplexity


In [40]:
perplexity = torch.exp(neg_avg_log_probas)
print(perplexity)

tensor(38223.0938)


##Evaluation of LLM on real world dataset

In [41]:
#90% of data is for training, 10% for testing
#LLMs are auto regressive, meaning we dont label anything prehand

In [42]:
import requests
import os

file_path = "the-verdict.txt"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt"

if not os.path.exists(file_path):
    resp = requests.get(url)
    text_data = resp.text  # Already a string
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(text_data)
else:
    with open(file_path, "r", encoding="utf-8") as file:
        text_data = file.read()


In [43]:
print(text_data[:99])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [44]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:",total_tokens)

Characters: 20479
Tokens: 5145


##Implementing the DataLoader

In [45]:
GPT_CONFIG_124M = {
    "vocab_size" : 50257,
    "context_length" : 256,
    "emb_dim" : 768,
    "n_heads": 12, #no of attention heads
    "n_layers" : 12, #no_of transformers
    "drop_rate" : 0.1, #10% of neurons are set to 0
    "qkv_bias" :  False #not required bias term rn
}

In [46]:
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [47]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

Train/Validation ratio

In [48]:
train_ratio = 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

torch.manual_seed(123)

train_loader = create_dataloader_v1(
    train_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M['context_length'],
    stride=GPT_CONFIG_124M['context_length'],
    shuffle=True,
    drop_last=True,
    num_workers=0 #no parallel processing
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=2,
    max_length=GPT_CONFIG_124M['context_length'],
    stride=GPT_CONFIG_124M['context_length'],
    shuffle=True,
    drop_last=True,
    num_workers = 0 #no parallel processing
)



In [49]:
#Sanity check
if int(total_tokens * (train_ratio)) < GPT_CONFIG_124M["context_length"]:
  print("Not enough tokens for training loader")

if int(total_tokens * (1- train_ratio)) < GPT_CONFIG_124M["context_length"]:
  print("Not enough token for validation loader")

In [50]:
print("Train Loader:")
for x,y in train_loader:
  print(x.shape,y.shape)

Train Loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [51]:
print("Validation Loader")
for x,y in val_loader:
  print(x.shape,y.shape)

print(len(train_loader))
print(len(val_loader))

Validation Loader
torch.Size([2, 256]) torch.Size([2, 256])
9
1


In [52]:
#IN training data, 9 batches are created
#each batch has 2 samples
#One batch in validation data is created

#LLM Loss Function Implementation

In [53]:
def calc_loss_batch(input_batch, target_batch, model, device):
  input_batch,target_batch = input_batch.to(device),target_batch.to(device)
  logits = model(input_batch)
  loss = torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
  return loss

def calc_loss_loader(data_loader,model,device,num_batches = None):
  total_loss = 0
  if len(data_loader) == 0:
    return float("nan")
  elif num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches,len(data_loader))
  for i,(input_batch,target_batch) in enumerate(data_loader):
    if i >= num_batches:
      break
    loss = calc_loss_batch(input_batch,target_batch,model,device)
    total_loss += loss.item()

  return total_loss / num_batches


In [54]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
torch.manual_seed(123)
with torch.no_grad():
  train_loss = calc_loss_loader(train_loader,model,device)
  val_loss = calc_loss_loader(val_loader,model,device)

print(train_loss)
print(val_loss)

11.000095155504015
11.001733779907227


##LLM Pretraining Loop

In [55]:
def evaluate_model(model,device,train_loader,val_loader,eval_iter):
  model.eval()
  with torch.no_grad():
    train_loss = calc_loss_loader(train_loader,model,device,num_batches = eval_iter)
    val_loss =   calc_loss_loader(val_loader,model,device,num_batches = eval_iter)
    model.train()
    return train_loss,val_loss

In [56]:
def generate_and_print_sample(model, tokenizer, device, start_context):
  model.eval()
  context_size = model.pos_emb.weight.shape[0]
  encoded = text_to_token_ids(start_context, tokenizer).to(device)
  with torch.no_grad():
   token_ids = generate_text_simple(
   model=model, idx=encoded,
   max_new_tokens=50, context_size=context_size
  )
  decoded_text = token_ids_to_text(token_ids, tokenizer)
  print(decoded_text.replace("\n", "")) # Compact print format
  model.train()

Let's train GPT model for 10 epochs using Adam W optimizer

In [72]:
#Initialize a list to track losses and tokens seen
def train_model_simple(model,train_loader,val_loader,optimizer,device,num_epochs,eval_iter,start_context,tokenizer,eval_freq):
    train_losses = []
    val_losses = []
    track_tokens_seen = []
    tokens_seen = 0
    global_step = -1
    for epoch in range(num_epochs):
      model.train() #Set the model to training mode
    for input_batch, target_batch in train_loader:
        optimizer.zero_grad()
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        loss.backward() #Calculate loss gradient
        optimizer.step() #Update model weights using loss gradients
        tokens_seen += input_batch.numel() #returns total number of elements
        global_step += 1

    #Optional Evaluation step
    if global_step % eval_freq == 0:
      train_loss,val_loss = evaluate_model(model,device,train_loader,val_loader,eval_iter)
      train_losses.append(train_loss)
      val_losses.append(val_loss)
      track_tokens_seen.append(tokens_seen)
      print(f"Epoch: {epoch+1}(Step{global_step:06d}):" f"Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}")

generate_and_print_sample(model,tokenizer,device,start_context)





Every effort moves you Hearth cu Montanaovic McMaster convict mayorennett Mog considerableIdDallas bulb FallACE MBRest Thank mitigation splendid overflowopoly ArmenOrderグ DavidDesign COPH Shotrand Weiss mirrored Sor lineback bureau discounted instruct prefrontal Economy registration s proofDevice stagnShot simplicityChapter meg


In [None]:
import time
start_time = time.time()

torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr = 0.004, weight_decay = 0.1)

num_epochs = 10
train_losses,val_losses,tokens_seen = train_model_simple( model,train_loader,val_loader,optimizer,device, num_epochs = num_epochs,eval_freq = 5,eval_iter = 5,start_context = "Every effort moves you",tokenizer = tokenizer)

end_time = time.time()
executin_time_minutes = (end_time - start_time) / 60
print(f"Total execution time: {executin_time_minutes:.2f} minutes")