<a href="https://colab.research.google.com/github/sdeshmukh99/Generative-AI-Showcase/blob/main/Showcase_04/GPT_Decoder_for_Text_Generation_with_SST2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Package Installation and Imports

In [1]:
%%capture
# Install the Hugging Face libraries used below (%%capture hides pip's console output).
!pip -q install transformers datasets

### Importing packages

In [2]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import numpy as np
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader

from datetime import datetime

# 1: Causal Self Attention and Transformer Components

## 1.1: Causal Self Attention Class

In [3]:
class CausalSelfAttention(nn.Module):
  """Multi-head scaled dot-product self-attention with a causal mask.

  Each position may attend to itself and to earlier positions only.

  Args:
    d_k: size of each head's query/key/value vectors (d_v is assumed equal to d_k).
    d_model: size of the input and output feature vectors.
    n_heads: number of attention heads.
    max_len: longest sequence length the pre-computed causal mask supports.
  """

  def __init__(self, d_k, d_model, n_heads, max_len):
    super().__init__()

    # Assume d_v = d_k
    self.d_k = d_k
    self.n_heads = n_heads

    self.key = nn.Linear(d_model, d_k * n_heads)
    self.query = nn.Linear(d_model, d_k * n_heads)
    self.value = nn.Linear(d_model, d_k * n_heads)

    # final linear layer
    self.fc = nn.Linear(d_k * n_heads, d_model)

    # Causal mask: lower-triangular matrix of ones, diagonal included, so
    # position t can see positions 0..t. Because a token sees itself, the
    # training targets still have to be shifted by one position.
    max_len = int(max_len)
    cm = torch.tril(torch.ones(max_len, max_len))
    # Registered as a buffer so it follows the module across devices and is
    # saved in the state_dict without being a trainable parameter.
    self.register_buffer("causal_mask", cm.view(1, 1, max_len, max_len))

  def forward(self, q, k, v, pad_mask=None):
    """Apply causal multi-head attention.

    Args:
      q, k, v: tensors of shape N x T x d_model (the same length T is assumed
        for all three, i.e. self-attention).
      pad_mask: optional N x T tensor; key positions where it equals 0 are
        excluded from attention.

    Returns:
      Tensor of shape N x T x d_model.

    Raises:
      ValueError: if T is larger than the max_len given at construction.

    NOTE: if every key visible to a query is masked out (e.g. left-padded
    input), the softmax over all -inf scores yields NaN for that row.
    """
    max_len = self.causal_mask.size(-1)
    if q.shape[1] > max_len:
      # Without this check the failure is an opaque broadcasting error
      # raised from masked_fill further down.
      raise ValueError(
          f"sequence length {q.shape[1]} exceeds max_len {max_len} of the causal mask"
      )

    q = self.query(q)  # N x T x (h*d_k)
    k = self.key(k)    # N x T x (h*d_k)
    v = self.value(v)  # N x T x (h*d_k)

    N = q.shape[0]
    T = q.shape[1]

    # change the shape to:
    # (N, T, h, d_k) --> (N, h, T, d_k)
    # in order for matrix multiply to work properly
    q = q.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
    k = k.view(N, T, self.n_heads, self.d_k).transpose(1, 2)
    v = v.view(N, T, self.n_heads, self.d_k).transpose(1, 2)

    # Compute attention scores
    # (N, h, T, d_k) x (N, h, d_k, T) --> (N, h, T, T)
    attn_scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)  # scaled dot product; @ --> torch.matmul

    if pad_mask is not None:
      # Broadcast (N, T) -> (N, 1, 1, T): hide padded key positions from every query.
      attn_scores = attn_scores.masked_fill(pad_mask[:, None, None, :] == 0, float('-inf'))

    # Hide future positions (strict upper triangle).
    attn_scores = attn_scores.masked_fill(self.causal_mask[:, :, :T, :T] == 0, float('-inf'))

    attn_weights = F.softmax(attn_scores, dim=-1)

    # Compute attention-weighted values
    # (N, h, T, T) x (N, h, T, d_k) --> (N, h, T, d_k)
    A = attn_weights @ v

    # reshape it back before final linear layer
    A = A.transpose(1, 2)  # (N, T, h, d_k)
    A = A.contiguous().view(N, T, self.d_k * self.n_heads)  # (N, T, h*d_k)

    # projection
    return self.fc(A)

## 1.2: Transformer Block Class

In [4]:
class TransformerBlock(nn.Module):
  """One decoder layer: causal self-attention followed by a position-wise MLP.

  Both sub-layers use a residual connection followed by LayerNorm, and
  dropout is applied to the block's final output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob = 0.1):
    super().__init__()

    hidden_dim = d_model * 4  # width of the MLP's inner layer

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalSelfAttention(d_k, d_model, n_heads, max_len)

    feed_forward_layers = [
        nn.Linear(d_model, hidden_dim),
        nn.GELU(),
        nn.Linear(hidden_dim, d_model),
        nn.Dropout(dropout_prob),
    ]
    self.ann = nn.Sequential(*feed_forward_layers)

    self.dropout = nn.Dropout(p=dropout_prob)

  def forward (self, x, pad_mask=None):
    # Self-attention: the same tensor serves as query, key and value.
    attended = self.mha(x, x, x, pad_mask)
    x = self.ln1(x + attended)

    # Position-wise feed-forward network with its own residual connection.
    transformed = self.ann(x)
    x = self.ln2(x + transformed)

    return self.dropout(x)

## 1.3: Positional Encoding Class

In [5]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position information to token embeddings.

  Even feature indices hold sin(pos / 10000^(2i/d_model)) and odd indices
  hold the matching cosine. Both even and odd d_model are supported.

  Args:
    d_model: embedding size.
    max_len: number of positions to pre-compute.
    dropout_prob: dropout probability applied after adding the encoding.
  """

  def __init__(self, d_model, max_len =2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p = dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)   # (max_len, 1)
    even_dims = torch.arange(0, d_model, 2)         # 0, 2, 4, ... (the "2i" of the formula)
    angles = position / (10000)**(even_dims/d_model)  # (max_len, ceil(d_model/2))

    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns,
    # so drop the surplus angle column for the cosine half.
    pe[0, :, 1::2] = torch.cos(angles[:, :d_model//2])

    # Buffers are saved in the state_dict and moved with the module, but are
    # not returned by model.parameters(), so the optimizer never updates them.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return dropout(x + positional encoding) for x of shape N x T x D.

    Raises:
      ValueError: if T exceeds the number of pre-computed positions.
    """
    seq_len = x.size(1)
    if seq_len > self.pe.size(1):
      raise ValueError(
          f"sequence length {seq_len} exceeds max_len {self.pe.size(1)} of the positional encoding"
      )
    x = x + self.pe[:, :seq_len, :]
    return self.dropout(x)

# 2: Decoder-Only Transformer (GPT) Architecture

## 2.1: Decoder Class

In [6]:
class Decoder(nn.Module):
  """Decoder-only (GPT-style) transformer mapping token ids to vocabulary logits.

  Pipeline: token embedding -> positional encoding -> n_layers transformer
  blocks -> LayerNorm -> linear projection onto the vocabulary.
  """

  def __init__(self, vocab_size, max_len, d_k, d_model, n_heads, n_layers, dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)

    # Build the stack of identical layers one at a time.
    layers = []
    for _ in range(n_layers):
      layers.append(TransformerBlock(d_k, d_model, n_heads, max_len, dropout_prob))
    self.transformer_blocks = nn.Sequential(*layers)

    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, x, pad_mask=None):
    """Return one logit vector per input position (many-to-many)."""
    hidden = self.pos_encoding(self.embedding(x))

    # Iterate explicitly instead of calling the Sequential, because every
    # block also needs the padding mask.
    for layer in self.transformer_blocks:
      hidden = layer(hidden, pad_mask)

    return self.fc(self.ln(hidden))

## 2.2: Testing the Decoder with Dummy Data

In [7]:
# Use the GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Small throw-away decoder used to smoke-test tensor shapes with dummy data.
model = Decoder(
    vocab_size=20_000,
    max_len=1024,
    d_k=16,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout_prob=0.1,
)
model.to(device)

cuda:0


Decoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [8]:
# Random token ids in [0, 20_000): a batch of eight sequences, 512 tokens each.
x = np.random.randint(low=0, high=20_000, size=(8, 512))
x_t = torch.tensor(x, device=device)

In [9]:
# Forward pass without a padding mask; expect one logit vector per position.
y = model(x_t) # without padding
y.shape

torch.Size([8, 512, 20000])

In [10]:
# Padding mask: the first 256 positions are real tokens (1), the rest padding (0).
mask = np.zeros((8, 512))
mask[:, :256] = 1
mask_t = torch.tensor(mask, device=device)

In [11]:
# Same forward pass with the padding mask; the output shape must not change.
y=model(x_t,mask_t) #with mask
y.shape

torch.Size([8, 512, 20000])

# 3: Training the Decoder as Causal Language Model

## 3.1 Load and Tokenize the Data

### 3.1.1: Load the Data

In [12]:
# SST-2 (from GLUE) is a sentiment dataset; only its sentences are used here for
# language modelling, and the label column is dropped in a later cell.
raw_datasets = load_dataset("glue", "sst2") # sst2 is DataSet for sentiment analysis but ignore the label

README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

### 3.1.2: Define the Tokenizer

In [13]:
# Only the tokenizer of this checkpoint is loaded; the decoder itself is trained from scratch.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



In [14]:
def tokenize_fn(batch):
  """Tokenize the 'sentence' field of a batch with the notebook-level tokenizer.

  Truncation is enabled; padding is left to the data collator.
  """
  sentences = batch['sentence']
  return tokenizer(sentences, truncation = True)

### 3.1.3: Tokenize the Data

In [15]:
# Tokenize every split; batched=True hands tokenize_fn many rows per call.
tokenized_datasets = raw_datasets.map(tokenize_fn, batched = True)
# Collator used by the DataLoaders to pad each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [16]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [17]:
# Keep only input_ids and attention_mask; the text, index and label columns are not used for language modelling.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx", "label"])

In [18]:
# Shuffled training batches of 32 examples, collated (padded) by data_collator.
train_loader =  DataLoader(tokenized_datasets["train"], shuffle = True, batch_size =32, collate_fn = data_collator)

### 3.1.4: Check How it Works

In [19]:
# Peek at a single batch to confirm the collated tensor shapes, then stop.
for batch in train_loader:
  for k,v in batch.items():
    print("k:", k, "v.shape:", v.shape)
  break

k: input_ids v.shape: torch.Size([32, 51])
k: attention_mask v.shape: torch.Size([32, 51])


In [20]:
tokenizer.pad_token_id

0

In [21]:
tokenizer.model_max_length

512

In [22]:
tokenizer.vocab_size

28996

## 3.2: Training Process

### 3.2.1: Create the Model

In [23]:
# Build the model that is actually trained; this rebinds `model`, replacing the dummy one above.
# Vocabulary size and maximum sequence length are taken from the tokenizer.
model = Decoder (
    vocab_size = tokenizer.vocab_size,
    max_len=tokenizer.model_max_length,
    d_k=16,
    d_model = 64,
    n_heads = 4,
    n_layers =2,
    dropout_prob =0.1
    )
model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05,

In [24]:
# Loss and optimizer
# ignore_index makes target positions equal to the pad token id contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
# Adam with its default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

### 3.2.2: Train The Model

In [25]:
def train(model, criterion, optimzer, train_loader, epochs):
  """Train `model` as a causal (next-token) language model.

  Relies on the notebook-level `device` (where batches are moved) and
  `tokenizer` (for the pad token id).

  Args:
    model: called as model(input_ids, attention_mask); returns N x T x V logits.
    criterion: loss called with (N x V x T logits, N x T targets).
    optimzer: optimizer updating the model's parameters. The misspelled name
      is kept so existing callers are unaffected.
    train_loader: iterable of dict batches with 'input_ids' and 'attention_mask'.
    epochs: number of passes over train_loader.

  Returns:
    NumPy array of length `epochs` with the mean training loss per epoch.
  """
  # Bind the argument to a correctly spelled local. Previously the body used
  # `optimizer`, which silently resolved to the notebook-level global and
  # ignored the optimizer passed in by the caller.
  optimizer = optimzer

  train_losses = np.zeros(epochs)

  for it in range(epochs):
    model.train()
    t0 = datetime.now()
    train_loss = []
    for batch in train_loader:
      # move data to GPU
      batch = {k: v.to(device) for k, v in batch.items()}

      # zero the parameter gradients
      optimizer.zero_grad()

      # Targets are the inputs shifted one step to the left, so position t
      # is trained to predict token t+1.
      targets = batch['input_ids'].clone().detach()
      targets = torch.roll(targets, shifts=-1, dims=1)
      # roll wraps the first token round to the end; overwrite it with the
      # pad id so the last position has no real target.
      targets[:, -1] = tokenizer.pad_token_id

      # forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      # CrossEntropyLoss expects the class dimension second: (N, T, V) -> (N, V, T)
      loss = criterion(outputs.transpose(2, 1), targets)

      # Backward and optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    # mean training loss for this epoch
    train_loss = np.mean(train_loss)

    # save losses
    train_losses[it] = train_loss

    dt = datetime.now() - t0
    print(f'Epoch: {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Duration: {dt}')

  return train_losses

In [26]:
# Train for 4 epochs and keep the mean loss of each epoch.
train_losses = train(model, criterion, optimizer, train_loader, epochs = 4)

Epoch: 1/4, Train Loss: 5.9727, Duration: 0:01:01.420382
Epoch: 2/4, Train Loss: 5.0215, Duration: 0:00:59.562988
Epoch: 3/4, Train Loss: 4.6857, Duration: 0:00:58.649764
Epoch: 4/4, Train Loss: 4.4972, Duration: 0:00:59.017219


# 4: Validation and Text Generation

## 4.1: Validation Process

In [27]:
# One validation example per batch, in dataset order (no shuffling), so single sentences can be inspected.
valid_loader = DataLoader(tokenized_datasets["validation"], batch_size = 1, collate_fn = data_collator)

In [28]:
# Run the first validation example through the model.
model.eval()  # disable dropout for inference
# No gradients are needed for inspection; no_grad avoids building the autograd graph.
with torch.no_grad():
  for batch in valid_loader:
    # move data to GPU
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(batch['input_ids'], batch['attention_mask'])
    break  # only the first example is inspected below

## 4.2: Sample Validation Output

In [29]:
batch['input_ids']

tensor([[  101,  1122,   112,   188,   170, 14186,  1105,  1510, 12759,  5012,
           119,   102]], device='cuda:0')

In [30]:
tokenizer.decode(batch['input_ids'][0])

"[CLS] it's a charming and often affecting journey. [SEP]"

In [31]:
outputs.shape

torch.Size([1, 12, 28996])

In [32]:
torch.argmax(outputs,axis=-1)

tensor([[ 170,  112,  188,  170, 1363, 1105,  170, 6276,  102,  102,  102,  102]],
       device='cuda:0')

In [33]:
prediction_ids = torch.argmax(outputs, axis=-1)

In [34]:
prediction_ids

tensor([[ 170,  112,  188,  170, 1363, 1105,  170, 6276,  102,  102,  102,  102]],
       device='cuda:0')

In [35]:
tokenizer.decode(prediction_ids[0])

"a's a good and a funny [SEP] [SEP] [SEP] [SEP]"

## 4.3: Extracting the Next Word Prediction from the Output

1. The output helps us understand how likely each token is to occur next.
2. Extract the token id with the highest score.
3. Decode the extracted token id to interpret the token.

In [36]:
outputs[:,-1,:].shape

torch.Size([1, 28996])

In [37]:
outputs[:,-1,:]

tensor([[-14.3070, -13.8440, -13.8463,  ..., -13.4751, -14.5149, -13.9966]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [38]:
torch.argmax(outputs[:, -1, :], axis=-1)

tensor([102], device='cuda:0')

In [39]:
tokenizer.decode(torch.argmax(outputs[:,-1,:], axis = -1))

'[SEP]'

# 5: Generating Text Using the Model

In [40]:
prompt ="it's a"
tokenized_prompt = tokenizer(prompt, return_tensors='pt')
# prepare inputs + get rid of SEP token at the end
input_ids = tokenized_prompt['input_ids'][:,:-1].to(device)
mask = tokenized_prompt['attention_mask'][:,:-1].to(device)

# Greedy decoding: repeatedly append the most likely next token.
model.eval()  # make sure dropout is off even if this cell is run on its own
with torch.no_grad():  # inference only; avoids building an autograd graph at every step
  for _ in range(20):  # generate at most 20 new tokens
    outputs = model(input_ids, mask)
    # Logits at the last position score the next token.
    prediction_id = torch.argmax(outputs[:,-1,:], axis = -1)

    input_ids = torch.hstack((input_ids, prediction_id.view(1,1)))
    # Every token in the grown sequence is real, so the mask is all ones.
    mask = torch.ones_like(input_ids)

    # Stop as soon as the model emits the end-of-sentence marker.
    if prediction_id == tokenizer.sep_token_id:
      break

In [41]:
tokenizer.decode(input_ids[0])

"[CLS] it's a good deal of the film [SEP]"