In [4]:
import torch.nn as nn
import torch
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
      super().__init__()
      assert (d_out % num_heads == 0), \
          "d_out must be divisible by num_heads"
      self.d_out = d_out
      self.num_heads = num_heads
      self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim
      self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
      self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
      self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
      self.out_proj = nn.Linear(d_out,d_out)
      self.dropout = nn.Dropout(dropout)
      self.register_buffer(
          "mask",
          torch.triu(torch.ones(context_length,context_length),diagonal=1)
      )
    def forward(self,x):
      b , num_tokens, d_in = x.shape
      print(x.shape)
      queries = self.W_query(x).view(b,num_tokens,self.num_heads,self.head_dim)
      keys = self.W_key(x).view(b,num_tokens,self.num_heads,self.head_dim)
      values = self.W_value(x).view(b,num_tokens,self.num_heads,self.head_dim)
      # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
      keys = keys.transpose(1,2)
      values = values.transpose(1,2)
      queries = queries.transpose(1,2)


      attention_scores = queries @ keys.transpose(-1,-2)
      # Original mask truncated to the number of tokens and converted to boolean
      mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
      attention_scores.masked_fill_(mask_bool, -torch.inf)
      attn_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5, dim=-1)

      attn_weights = self.dropout(attn_weights)

      # Shape: (b, num_tokens, num_heads, head_dim)
      context_vec = (attn_weights @ values).transpose(1, 2)

      # Combine heads, where self.d_out = self.num_heads * self.head_dim
      context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
      context_vec = self.out_proj(context_vec) # optional projection

      return context_vec


In [5]:
d_in,d_out =6,6
attention = MultiHeadAttention(d_in,d_out,6,0.25,2)

In [6]:

class DummyGPTModel(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.token_emb = nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb = nn.Embedding(cfg["context_length"],cfg["emb_dim"])
    self.drop_emb = nn.Dropout(cfg["drop_rate"])
    self.trans_blocks = nn.Sequential(
        *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
    )
    self.final_norm = DummyLayerNorm(cfg["emb_dim"])
    self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )
  def forward(self,in_idx):
      batch_size , seq_len = in_idx.shape
      token_emb = self.token_emb(in_idx)
      pos_emb = self.pos_emb(torch.arange(seq_len,device=in_idx.device))
      x = token_emb + pos_emb
      x = self.drop_emb(x)
      x = self.trans_blocks(x)
      x = self.final_norm(x)
      logits = self.out_head(x)
      return logits
class DummyTransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        # A simple placeholder

    def forward(self, x):
        # This block does nothing and just returns its input.
        return x


class DummyLayerNorm(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5):
        super().__init__()
        # The parameters here are just to mimic the LayerNorm interface.

    def forward(self, x):
        # This layer does nothing and just returns its input.
        return x

In [7]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
batch = []
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [8]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)


**Layer Norm**

In [9]:
torch.manual_seed(123)
inputs = torch.randn(2,5)
layers = nn.Sequential(nn.Linear(5,6),nn.ReLU())
outputs = layers(inputs)
outputs

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)

In [10]:
mean  = outputs.mean(dim=-1,keepdim=True)
var = outputs.var(dim=-1,keepdim=True)

In [11]:
out_norm  = (outputs - mean)/torch.sqrt(var)
mean  = out_norm.mean(dim=-1,keepdim=True)
var = out_norm.var(dim=-1,keepdim=True)
var

tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)

In [12]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    def forward(self,x):
        means = x.mean(dim=-1,keepdim=True)
        vars = x.var(dim=-1,keepdim=True)
        norm_vals = (x - means)/torch.sqrt(vars+self.eps)
        return self.scale*norm_vals + self.shift*norm_vals

In [13]:
norm = LayerNorm(emb_dim=5)
inputs = torch.randn(2,5,5)
norm.forward(inputs)

tensor([[[ 0.2282, -1.3700, -0.3207,  0.0610,  1.4016],
         [-0.8846,  0.3039,  0.2648,  1.3837, -1.0678],
         [ 0.9278,  0.3431,  0.3016, -1.7066,  0.1341],
         [ 0.2733,  1.2591,  0.4963, -1.1476, -0.8812],
         [-0.1139, -0.4737,  0.0476,  1.6144, -1.0743]],

        [[-0.3018, -1.2715,  0.1537,  1.5038, -0.0842],
         [ 0.6037, -1.0267,  1.1283,  0.3752, -1.0805],
         [-1.2706,  1.2019, -0.4173, -0.3266,  0.8126],
         [-0.8886, -0.3835,  0.8068, -0.8407,  1.3060],
         [-0.0051, -1.1286,  0.0712,  1.5697, -0.5071]]],
       grad_fn=<AddBackward0>)

***Feed Forward Layer***

In [14]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.feed_forward = nn.Sequential(nn.Linear(cfg["emb_dim"],cfg["emb_dim"]*4),nn.GELU(),nn.Linear(cfg["emb_dim"]*4,cfg["emb_dim"]))
    def forward(self,x):
        return self.feed_forward(x)

In [15]:
feed = FeedForward(cfg=GPT_CONFIG_124M)
feed(torch.randn(2,3,768)).shape

torch.Size([2, 3, 768])

**Skip/Shortcut Connections**

In [16]:
class ExDeepNeuralNet(nn.Module):
    def __init__(self, sizes,skip):
        super().__init__()
        self.skip = skip
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(sizes[0],sizes[1]),nn.GELU()),
nn.Sequential(nn.Linear(sizes[1],sizes[2]),nn.GELU()),
nn.Sequential(nn.Linear(sizes[2],sizes[3]),nn.GELU()),
nn.Sequential(nn.Linear(sizes[3],sizes[4]),nn.GELU()),
nn.Sequential(nn.Linear(sizes[4],sizes[5]),nn.GELU()),
        ])
    def forward(self,x):
        for layer in self.layers:
            outputs=layer(x)
            if self.skip and x.shape == outputs.shape:
                x = x + outputs
            else:
                x = outputs
        return x


In [17]:
sizes = [3,3,3,3,3,1]
sample_in = torch.tensor([1.,0.,-1.])
torch.manual_seed(123)
net_without_shortcut = ExDeepNeuralNet(
    sizes=sizes,skip=False
)

In [18]:
def print_grad(model,x):
    outputs = model(x)
    truth = torch.tensor([0.])

    loss = nn.MSELoss()
    loss = loss(outputs,truth)

    loss.backward()

    for name,param in model.named_parameters():
        if 'weight' in name:
            print(f"{name} has gradient mean of {param.grad.abs().mean().item()}")

In [19]:
print_grad(net_without_shortcut,sample_in)

layers.0.0.weight has gradient mean of 0.00020174124801997095
layers.1.0.weight has gradient mean of 0.00012011774379061535
layers.2.0.weight has gradient mean of 0.0007152438047342002
layers.3.0.weight has gradient mean of 0.0013988513965159655
layers.4.0.weight has gradient mean of 0.005049603525549173


In [20]:
net_with_shortcut = ExDeepNeuralNet(
    sizes=sizes,skip=True
)
print_grad(net_with_shortcut,sample_in)

layers.0.0.weight has gradient mean of 0.0014336216263473034
layers.1.0.weight has gradient mean of 0.0048200166784226894
layers.2.0.weight has gradient mean of 0.004116077441722155
layers.3.0.weight has gradient mean of 0.005884398240596056
layers.4.0.weight has gradient mean of 0.032491881400346756


In [21]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        self.eps = 1e-5
    def forward(self,x):
        means = x.mean(dim=-1,keepdim=True)
        vars = x.var(dim=-1,keepdim=True)
        norm_vals = (x - means)/torch.sqrt(vars+self.eps)
        return self.scale*norm_vals + self.shift*norm_vals
class FeedForward(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.feed_forward = nn.Sequential(nn.Linear(cfg["emb_dim"],cfg["emb_dim"]*4),nn.GELU(),nn.Linear(cfg["emb_dim"]*4,cfg["emb_dim"]))
    def forward(self,x):
        return self.feed_forward(x)
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
      super().__init__()
      assert (d_out % num_heads == 0), \
          "d_out must be divisible by num_heads"
      self.d_out = d_out
      self.num_heads = num_heads
      self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim
      self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
      self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
      self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
      self.out_proj = nn.Linear(d_out,d_out)
      self.dropout = nn.Dropout(dropout)
      self.register_buffer(
          "mask",
          torch.triu(torch.ones(context_length,context_length),diagonal=1)
      )
    def forward(self,x):
      b , num_tokens, d_in = x.shape

      queries = self.W_query(x).view(b,num_tokens,self.num_heads,self.head_dim)
      keys = self.W_key(x).view(b,num_tokens,self.num_heads,self.head_dim)
      values = self.W_value(x).view(b,num_tokens,self.num_heads,self.head_dim)
      # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
      keys = keys.transpose(1,2)
      values = values.transpose(1,2)
      queries = queries.transpose(1,2)


      attention_scores = queries @ keys.transpose(-1,-2)
      # Original mask truncated to the number of tokens and converted to boolean
      mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
      attention_scores.masked_fill_(mask_bool, -torch.inf)
      attn_weights = torch.softmax(attention_scores / keys.shape[-1]**0.5, dim=-1)

      attn_weights = self.dropout(attn_weights)

      # Shape: (b, num_tokens, num_heads, head_dim)
      context_vec = (attn_weights @ values).transpose(1, 2)

      # Combine heads, where self.d_out = self.num_heads * self.head_dim
      context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
      context_vec = self.out_proj(context_vec) # optional projection

      return context_vec


In [22]:
class Transformer(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.ff = FeedForward(cfg=cfg)
        self.norm1 = LayerNorm(emb_dim=cfg["emb_dim"])
        self.norm2 = LayerNorm(emb_dim=cfg["emb_dim"])
        self.drop = nn.Dropout(cfg["drop_rate"])
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"], 
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"])
    def forward(self,x):
        #shortcut for attenttoin block
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop(x)
        x = x + shortcut

        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop(x)
        x = x + shortcut
        return x


In [23]:
transformer = Transformer(GPT_CONFIG_124M)

transformer(torch.randn(3,1024,768))

tensor([[[ 0.3040,  0.5495,  1.2885,  ..., -1.8107,  2.3743, -0.8244],
         [-1.8463, -0.1477, -1.0222,  ..., -0.3995,  0.9893,  1.4344],
         [-0.8399,  0.0234, -0.2272,  ..., -0.7366, -0.1974,  1.0353],
         ...,
         [ 0.5701, -0.1126,  0.9068,  ...,  0.5838,  0.0046, -1.4481],
         [ 0.9650, -0.3691,  0.7122,  ..., -0.6536, -0.0895, -0.1681],
         [ 0.1567, -0.4485,  0.9260,  ...,  0.4075, -0.8095,  1.7327]],

        [[-0.5228, -1.1788, -0.1235,  ...,  0.5532,  0.0343, -0.2192],
         [-1.2015,  2.1145,  1.8952,  ...,  0.0552,  0.8150,  0.9406],
         [ 0.4222,  1.6837,  0.1903,  ...,  0.4998,  0.4750, -0.4381],
         ...,
         [ 0.8550, -0.5851, -0.4771,  ...,  0.9347, -0.8972,  0.1981],
         [-0.8054,  2.1730, -2.7927,  ...,  1.1397,  1.0180,  0.6697],
         [-2.1625,  0.7646, -0.0386,  ..., -1.1390,  0.2489,  0.2810]],

        [[-0.2117, -0.8995,  0.6468,  ...,  1.3401, -1.7545, -1.9538],
         [ 1.0441,  0.8597,  0.1974,  ...,  0

In [24]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-Key-Value bias
}
class GPT2(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.token_emb = nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"],cfg["emb_dim"])

        self.drop = nn.Dropout(cfg["drop_rate"])
        self.transformers = nn.ModuleList(Transformer(cfg=cfg) for _ in range(cfg["n_layers"]))
        self.norm = LayerNorm(cfg["emb_dim"])
        self.out = nn.Linear(cfg["emb_dim"],cfg["vocab_size"])
    def forward(self,x):
        _ , seq_len = x.shape
        x = self.pos_emb(torch.arange(seq_len)) + self.token_emb(x)
        x = self.drop(x)
        for layer in self.transformers:
            x = layer(x)
        x = self.norm(x)
        return self.out(x)

In [25]:
import torch
gpt2 = GPT2(GPT_CONFIG_124M)
inputs = torch.randint(low=0, high=50257, size=(2, 4))  # integers between 0 and 9
gpt2(inputs).shape

torch.Size([2, 4, 50257])

In [26]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    # idx is (batch, n_tokens) array of indices in the current context

    ###Input batch:
 ###tensor([[6109, 3626, 6100,  345],
        ##[6109, 1110, 6622,  257]])
    
    for _ in range(max_new_tokens):
        
        # Crop current context if it exceeds the supported context size
        # E.g., if LLM supports only 5 tokens, and the context size is 10
        # then only the last 5 tokens are used as context
        idx_cond = idx[:, -context_size:]
        
        # Get the predictions
        with torch.no_grad():
            logits = model(idx_cond) ### batch, n_tokens, vocab_size
        
        # Focus only on the last time step
        # (batch, n_tokens, vocab_size) becomes (batch, vocab_size)
        logits = logits[:, -1, :]  

        # Apply softmax to get probabilities
        probas = torch.softmax(logits, dim=-1)  # (batch, vocab_size)

        # Get the idx of the vocab entry with the highest probability value
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # (batch, 1)

        # Append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)

    return idx

In [27]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)
encoded_tensor = torch.tensor(encoded).unsqueeze(0) #A
print("encoded_tensor.shape:", encoded_tensor.shape)
model.eval() #A
#model = GPTModel(GPT_CONFIG_124M)
out = generate_text_simple(
model=gpt2,
idx=encoded_tensor,
max_new_tokens=6,
context_size=GPT_CONFIG_124M["context_length"]
)
print("Output:", out)
print("Output length:", len(out[0]))

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])
Output: tensor([[15496,    11,   314,   716,   407, 14495, 30457, 27415, 26328, 48690]])
Output length: 10


In [28]:
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
print(decoded_text)

Hello, I am not peersBanARS Wolver fracturing


In [29]:
import torch

GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}

torch.manual_seed(123)
model = GPT2(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

**Loss Calculation**

In [30]:
import tiktoken

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) # add batch dimension
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0) # remove batch dimension
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves youahl supporter minersEnabledUm felateral Narr tested evil


In [31]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]

In [32]:
with torch.no_grad():
    logits = model(inputs)
probs = torch.softmax(logits,dim=-1)
print(probs.shape)


torch.Size([2, 3, 50257])


In [33]:
token_ids = torch.argmax(probs,dim=-1,keepdim=True)
token_ids

tensor([[[43308],
         [44213],
         [ 6957]],

        [[38913],
         [48204],
         [15442]]])

In [34]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  NXTNaturally 95


In [35]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"]

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107,  588, 11311]]) #  " really like chocolate"]
text_idx = 0
target_probs_1 = probs[text_idx,[0,1,2],targets[text_idx]]

text_idx = 1
target_probs_2 = probs[text_idx,[0,1,2],targets[text_idx]] # 0,1,2 is because the context len we are seeing here is 3

In [36]:
log_probs = torch.log(torch.cat((target_probs_1,target_probs_2)))
negative_log_likellihood = torch.mean(log_probs)*-1
negative_log_likellihood

tensor(10.8587)

In [37]:
logits = logits.flatten(0,1)
targets = targets.flatten()

In [38]:
nn.functional.cross_entropy(logits,targets)

tensor(10.8587)

In [39]:
with open("/Users/joyboy/Downloads/LLM'S_from_Scratch/the-verdict.txt","r") as f:
    text = f.read()

In [40]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [41]:
from torch.utils.data import Dataset,DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self,text,tokenizer,max_len,stride):
        super().__init__()
        self.input_ids = []
        self.target_ids = []

        self.token_ids = tokenizer.encode(text,allowed_special={"<|endoftext|>"})

        for i in range(0,len(self.token_ids)-max_len,stride):
            input_chunk = self.token_ids[i:i+max_len]
            target_chunk = self.token_ids[i+1:i+max_len+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, index):
        return self.input_ids[index],self.target_ids[index]

In [42]:
def create_dataloader(txt,batch_size = 4,max_len = 256,stride = 128,shuffle = True,drop_last = True,num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt,tokenizer,max_len,stride)

    return  DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

In [43]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}
# Train/validation ratio
train_ratio = 0.90
split_idx = int(train_ratio * len(text))
train_data = text[:split_idx]
val_data = text[split_idx:]

train_loader = create_dataloader(
    train_data,2,GPT_CONFIG_124M["context_length"]
    ,stride=GPT_CONFIG_124M["context_length"]
    ,drop_last=False,shuffle=False,num_workers=0
)
valid_loader = create_dataloader(
    val_data,2,GPT_CONFIG_124M["context_length"]
    ,stride=GPT_CONFIG_124M["context_length"]
    ,drop_last=False,shuffle=False,num_workers=0
)

In [44]:
for x,y in train_loader:
    print(x.shape,y.shape)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [45]:
torch.manual_seed(123)
gpt2 = GPT2(GPT_CONFIG_124M)
model.eval()

GPT2(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (transformers): ModuleList(
    (0-11): 12 x Transformer(
      (ff): FeedForward(
        (feed_forward): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop): Dropout(p=0.1, inplace=False)
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm()
  (out): Linear(in_features=768, out_features=50257, bias

In [46]:

def calc_loss_batch(input_batch, target_batch, model, device):
    # input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss


def calc_loss_loader(data_loader, model, device, num_batches=None):
    total_loss = 0.
    if len(data_loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(data_loader)
    else:
        # Reduce the number of batches to match the total number of batches in the data loader
        # if num_batches exceeds the number of batches in the data loader
        num_batches = min(num_batches, len(data_loader))
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i < num_batches:
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            total_loss += loss.item()
        else:
            break
    return total_loss / num_batches

In [47]:

calc_loss_loader(train_loader,model,device="mps")

11.005967881944445

In [48]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss
def generate_and_print_sample(model, tokenizer, device, start_context):
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer)
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
    decoded_text = token_ids_to_text(token_ids, tokenizer)
    print(decoded_text.replace("\n", " "))  # Compact print format
    model.train()
def train_model(model,train_loader,valid_loader,optimizer,device,num_epochs,eval_freq,eval_iter,start_context,tokenizer):
    train_loses,valid_loses,track_tokens = [],[],[]
    tokens_seen , global_step =0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch,target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(model=model,input_batch=input_batch,target_batch=target_batch,device=device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step+=1

            if global_step % eval_freq==0:
                train_loss, val_loss = evaluate_model(model,train_loader,valid_loader,device,eval_iter)
                train_loses.append(train_loss)
                valid_loses.append(val_loss)
                track_tokens.append(tokens_seen)
                print(f"Epoch {epoch}: Train loss: {train_loss} Valid Loss: {val_loss}")
             # Print a sample text after each epoch
        generate_and_print_sample(
            model, tokenizer, device, start_context
        )
    return train_loses,valid_loses

In [None]:
torch.manual_seed(123)
model = GPT2(GPT_CONFIG_124M)
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay=0.1)
num_epochs = 10
train_model(model,train_loader,valid_loader,optimizer,"mps",num_epochs,5,5,"Every effort moves you",tokenizer)

In [50]:
model.eval()
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids("Every effort moves you",tokenizer),
    max_new_tokens=25,
    context_size = GPT_CONFIG_124M["context_length"]
)
out = token_ids_to_text(token_ids,tokenizer)
out

'Every effort moves youahl supporter minersEnabledUm felateral Narr tested evilconom silicone Mavericks EquityKill Rath cuts RBI slipperyhuge ZambublishedendmentCharles aerobic'

In [51]:
vocab = { 
    "closer": 0,
    "every": 1, 
    "effort": 2, 
    "forward": 3,
    "inches": 4,
    "moves": 5, 
    "pizza": 6,
    "toward": 7,
    "you": 8,
} 

inverse_vocab = {v: k for k, v in vocab.items()}
next_token_logits = torch.tensor(
[4.51, 0.89, -1.90, 6.75, 1.63, -1.62, -1.89, 6.28, 1.79]
)

**This is what we are doing untill now taking argmax**

In [52]:
probs = torch.softmax(next_token_logits,dim=-1)
next_tokon = torch.argmax(probs)
print(probs)
print(next_tokon.item())
print(inverse_vocab[next_tokon.item()])

tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
        1.0120e-04, 3.5758e-01, 4.0122e-03])
3
forward


In [53]:
probs = torch.softmax(next_token_logits,dim=-1)
next_tokon = torch.multinomial(probs,num_samples=1)
print(probs)
print(next_tokon.item())
print(inverse_vocab[next_tokon.item()])

tensor([6.0907e-02, 1.6313e-03, 1.0019e-04, 5.7212e-01, 3.4190e-03, 1.3257e-04,
        1.0120e-04, 3.5758e-01, 4.0122e-03])
3
forward


In [54]:
top_k_logits , top_k_tokens = torch.topk(next_token_logits,k=3)

In [55]:
top_k_logits

tensor([6.7500, 6.2800, 4.5100])

In [56]:
top_k_logits = torch.where(
    condition=next_token_logits< top_k_logits[-1],
    input=torch.tensor(float("-inf")),
    other=next_token_logits
)

In [57]:
probs = torch.softmax(top_k_logits,dim=-1)
probs

tensor([0.0615, 0.0000, 0.0000, 0.5775, 0.0000, 0.0000, 0.0000, 0.3610, 0.0000])

In [58]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx


In [59]:
model = model.eval()
token_ids_to_text(generate(
    model=model,
    idx= text_to_token_ids("Every effort moves you",tokenizer)
    ,max_new_tokens=15,
    context_size=GPT_CONFIG_124M["context_length"],
    top_k=25,
    temperature=1.4
),tokenizer=tokenizer)

'Every effort moves you continuous caramel NAFTA Michaelarget Alienテ gladly finalized Districtutf demandedpotion PattyNice'

In [60]:
generate_and_print_sample(
            model, tokenizer, "mps", "Every effort moves you"
        )

Every effort moves youahl supporter minersEnabledUm felateral Narr tested evilconom silicone Mavericks EquityKill Rath cuts RBI slipperyhuge ZambublishedendmentCharles aerobic ," benefited ConversER Revis misleading cookingZI adapting Music uniquely Hezbollah nineteendedusionaldk bug fence BE geneticsDefaultisec worldongyang628


In [None]:
torch.save({
    "model_state":model.state_dict(),
    "optimizer_state":optimizer.state_dict(),},
    "model_and_optimizer.pth"
)

In [None]:
checkpoint = torch.load("model_and_optimizer.pth",weights_only=False)
gpt2 = GPT2(GPT_CONFIG_124M)
gpt2.load_state_dict(checkpoint["model_state"])
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state"])
gpt2.train()

In [None]:
num_epochs = 5
train_model(gpt2,train_loader,valid_loader,optimizer,"mps",num_epochs,5,5,"Every effort moves you",tokenizer)