In [3]:

# Used for counting tokens 
!pip install tiktoken 

Collecting tiktoken
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires openai, which is not installed.[0m[31m
[0mSuccessfully installed tiktoken-0.5.2


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

import tiktoken

In [7]:
BATCH_SIZE=32
BLOCK_SIZE=8

encoder = tiktoken.get_encoding("gpt2")


dataset=[]
with open('/content/drive/MyDrive/interstellar_script.txt', "r", encoding='utf-8') as file:
    dataset=file.read()

# Creating the encoded ids using the encoder
encoded_ids=encoder.encode(dataset)
encoded_ids=torch.tensor(encoded_ids)


def get_batch():
    '''
    Returns a batch (x, y) from the dataset
    '''
    ind=torch.randint(len(encoded_ids)-BLOCK_SIZE, (BATCH_SIZE, ))

    x=torch.stack([encoded_ids[i:i+BLOCK_SIZE] for i in ind])
    y=torch.stack([encoded_ids[i+1:i+BLOCK_SIZE+1] for i in ind])

    x=torch.tensor(x, dtype=torch.long)
    y=torch.tensor(y, dtype=torch.long)



    return x, y

xb, yb = get_batch()
xb.shape, yb.shape

  x=torch.tensor(x, dtype=torch.long)
  y=torch.tensor(y, dtype=torch.long)


(torch.Size([32, 8]), torch.Size([32, 8]))

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Creating the Transformer

In [8]:
from dataclasses import dataclass

@dataclass
class Config:
    block_size: int = BLOCK_SIZE
    emb_dim: int = 256
    head_size: int = 32
    num_heads: int = 8
    num_layers: int = 2
    vocab_size: int = encoder.n_vocab # vocab size of the tokenizer

# We like to have emb_dim == head_size * num_heads
config = Config()
assert config.emb_dim == config.head_size * config.num_heads, "Embedding dimension must be divisible by number of heads"

### Multi-Head Self-Attention

This component is the core of the Transformer. This is where the model learns to attend to different parts of the input sequence, and is the reason why Transformers are so powerful.

For simplicity, assume we have a single Head:

1. The input has three parts extracted from it: the query $Q$, the key $K$, and the value $V$ (via projections or `Linear` layers).

2. The query and key are multiplied together to get a score. This score is then scaled by the square root of the embedding dimension, $\sqrt{d_k}$, then passed through a softmax to get the attention weights (*after* a masking operation is applied).

3. The attention weights are then multiplied with the value to get the final output.

When we extend this to *multiple heads*, we simply repeat this process for each head in parallel, and then concatenate the outputs of each head together.


In [9]:
import torch
import torch.nn as nn

class MHSA(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.emb_dim = cfg.emb_dim
        self.head_size = cfg.head_size
        self.block_size = cfg.block_size
        self.num_heads = cfg.num_heads

        self.projection = nn.Linear(self.emb_dim, 3 * self.emb_dim)

        self.out_projection = nn.Linear(self.emb_dim, self.emb_dim)

        self.scale = 1.0 / (self.head_size ** 0.5)

        self.register_buffer("mask", torch.tril(torch.ones(self.block_size, self.block_size)))

    def forward(self, x):
      B, T, C = x.shape
      H = self.num_heads

      qkv = self.projection(x)

      # print(qkv.shape)
      query, key, value = torch.split(qkv, self.emb_dim, dim=-1)

      query = query.view(B,T,H, self.head_size).transpose(1,2)
      # print("The query shape is: ", query.shape)
      key = key.view(B,T,H, self.head_size).transpose(1,2)
      # print("The key shape is: ", key.shape)
      value = value.view(B,T,H, self.head_size).transpose(1,2)
      # print("The val shape is: ", value.shape)

      attn_scores = torch.matmul(query, key.transpose(-1, -2)) * self.scale
      # print("The shape of attn score is: ", attn_scores.shape)
      attn_scores=attn_scores.masked_fill(self.mask[:T, :T]==0, float("-inf"))
      attn_probs = torch.softmax(attn_scores, dim=-1)

      weighted_sum = torch.matmul(attn_probs, value)

      weighted_sum = weighted_sum.transpose(1,2).reshape(B,T,-1)

      # Apply the final projection
      out = self.out_projection(weighted_sum)

      return out




x = torch.randn(32, config.block_size, config.emb_dim)
print(x.shape)
csa = MHSA(config)
out = csa(x)
print(out.shape)


torch.Size([32, 8, 256])
torch.Size([32, 8, 256])


### Feedforward Network 

In [10]:
class Feedforward(nn.Module):
    def __init__(self, config):
        super().__init__()
        emb_dim = config.emb_dim

        self.linear1 = nn.Linear(emb_dim, emb_dim * 2)
        self.linear2 = nn.Linear(emb_dim * 2, emb_dim * 2)
        self.linear3 = nn.Linear(emb_dim * 2, emb_dim)
        self.dropout = nn.Dropout(p=0.1)
        self.gelu = nn.GELU()

    def forward(self, x):

        x = self.linear1(x)
        x = self.gelu(x)
        x = self.linear2(x)
        x = self.gelu(x)
        x = self.linear3(x)
        x = self.dropout(x)
        return x

### Blocks with Skip Connections 


In [11]:
class Block(nn.Module):
    def __init__(self, config):
        super().__init__()

        self.mhsa = MHSA(config)
        self.feedforward = Feedforward(config)
        self.layer_norm1 = nn.LayerNorm(config.emb_dim)
        self.layer_norm2 = nn.LayerNorm(config.emb_dim)

    def forward(self, x):

        residual1 = x
        x=self.layer_norm1(x)

        x = self.mhsa(x)
        x += residual1

        residual2 = x

        x=  self.layer_norm2(x)
        x = self.feedforward(x)
        x += residual2

        return x

x = torch.randn(8, config.block_size, config.emb_dim)
print(x.shape)
block = Block(config)
out = block(x)
print(out.shape)

torch.Size([8, 8, 256])
torch.Size([8, 8, 256])


### Putting it all together 

In [12]:
class SastaGPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.emb_dim = config.emb_dim
        self.block_size = config.block_size
        self.num_layers = config.num_layers
        self.vocab_size = config.vocab_size

        # Word and position embeddings
        self.word_embeddings = nn.Embedding(self.vocab_size, self.emb_dim)
        self.position_embeddings = nn.Embedding(self.block_size, self.emb_dim)

        # Sequence of Blocks
        self.blocks = nn.ModuleList([Block(config) for _ in range(self.num_layers)])

        # Final LayerNorm
        self.final_layernorm = nn.LayerNorm(self.emb_dim)

        # Final linear layer (to get logits)
        self.final_linear = nn.Linear(self.emb_dim, self.vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear) or isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()

    def forward(self, idxs):

        positions = torch.arange(idxs.size(1), device=idxs.device).expand(idxs.size()).long()
        word_embeds = self.word_embeddings(idxs)
        position_embeds = self.position_embeddings(positions)

        embeddings = word_embeds + position_embeds

        for block in self.blocks:
            embeddings = block(embeddings)

        # Apply the final LayerNorm
        embeddings = self.final_layernorm(embeddings)

        # Apply the final linear layer to get the logits
        logits = self.final_linear(embeddings)

        return logits

    @torch.no_grad()
    def generate(self, idxs, max_new_tokens=20):
        '''
        Takes in a sequence of indices (the tokenized sentence) and generates new tokens
        Note that the input indices should not be longer than the block size
        Returns the input sequence with the generated tokens appended (these should be decoded using the Tokenizer)

        Params
        ------
        idxs: torch.Tensor
            (B, T) tensor of token indices
        max_new_tokens: int
            Maximum number of new tokens to generate
        '''

        for _ in range(max_new_tokens):
            idxs_trimmed = idxs[:, -self.block_size:]  # trim to block size

            logits = self(idxs_trimmed)  # (B, T, V)

            logits = logits[:, -1, :]  # (B, V)

            probs = F.softmax(logits, dim=-1)  # (B, V)

            next_idx = torch.multinomial(probs, num_samples=1)  # (B, 1)

            idxs = torch.cat((idxs, next_idx), dim=1)  # (B, T+1)

        return idxs

cfg = Config()
model = SastaGPT(cfg)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.3f}M")


Number of parameters: 27.364M


In [13]:
xb, yb = get_batch()
print(xb.shape)

logits = model(xb)
print(logits.shape) 

torch.Size([32, 8])
torch.Size([32, 8, 50257])


  x=torch.tensor(x, dtype=torch.long)
  y=torch.tensor(y, dtype=torch.long)


In [14]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()

model.train()


model = model.to(device)

num_epochs = 2000

total_iterations = 0
for epoch in range(num_epochs):

    xb, yb = get_batch()

    xb = xb.to(device)
    yb = yb.to(device)

    optimizer.zero_grad()

    preds = model(xb)

    loss = criterion(preds.view(-1, preds.size(-1)), yb.view(-1))

    loss.backward()

    optimizer.step()

    total_iterations += 1

    if total_iterations % 100 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}] , Loss: {loss.item()}")

cuda


  x=torch.tensor(x, dtype=torch.long)
  y=torch.tensor(y, dtype=torch.long)


Epoch [100/2000] , Loss: 1.9231761693954468
Epoch [200/2000] , Loss: 1.8101130723953247
Epoch [300/2000] , Loss: 2.201275110244751
Epoch [400/2000] , Loss: 1.6508311033248901
Epoch [500/2000] , Loss: 1.965787649154663
Epoch [600/2000] , Loss: 1.9189720153808594
Epoch [700/2000] , Loss: 2.2070119380950928
Epoch [800/2000] , Loss: 1.8348371982574463
Epoch [900/2000] , Loss: 1.4779247045516968
Epoch [1000/2000] , Loss: 1.6749982833862305
Epoch [1100/2000] , Loss: 1.4857405424118042
Epoch [1200/2000] , Loss: 1.2660691738128662
Epoch [1300/2000] , Loss: 1.3845367431640625
Epoch [1400/2000] , Loss: 1.1707236766815186
Epoch [1500/2000] , Loss: 1.431365966796875
Epoch [1600/2000] , Loss: 0.5711421370506287
Epoch [1700/2000] , Loss: 1.318962812423706
Epoch [1800/2000] , Loss: 1.8411682844161987
Epoch [1900/2000] , Loss: 1.1083104610443115
Epoch [2000/2000] , Loss: 1.108543872833252


In [16]:
sentence = "Outer space"
idxs = torch.tensor(encoder.encode(sentence)).unsqueeze(0).to(device)

model.eval()
generated = model.generate(idxs, max_new_tokens=1000)
res = encoder.decode(generated[0].cpu().numpy())
print(res)


Outer space suddenly on the black.
                    BRAND
      Brand looks over.
                          CASE (CONT'D) (CONT'D)
            Doyle looks at the bed, weapons
            count them, with ourtech fires the inner doors of the and
          'no gave with stars.
         DOYLE
           Cooper walks Pantagruel to happened to five farmer.
                                  
                               The rocket is no never important then pulls down of putting the
            Sure it. EXT. olderAY, CHINESEOUTH on
                    wormhole a military stretch of the bunker,. very iniously apart.
                         
                                  
                      
                   the window. PLAT'S FATHER
                Cooper can arms. He
                      INT. TRUCK -- DAY
                   But you'd care.
         
                          Tars may
                    tiny holes, then time. Don't
                     
                     


# Fin.