# Custom Transformer-Based LLM for Text Summarization

## Step 1: Core Components of the Transformer

### 1.1. Positional Encoding

In [110]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0)

    def forward(self, x):
        return x + self.encoding[:, :x.size(1)].to(x.device)


### 1.2. Multi-Head Attention Mechanism

In [111]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # Linear transformations and reshape
        query = self.q_linear(query).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        key = self.k_linear(key).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        value = self.v_linear(value).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

        # Scaled dot-product attention
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = self.softmax(scores)
        out = torch.matmul(attention, value).transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_k)

        return self.fc_out(out)

### 1.3. Feed-Forward Networks

In [112]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


### 1.4. Encoder Layer

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Reshaping mask for broadcasting (if necessary)
        mask = mask.unsqueeze(1).unsqueeze(2)  # Add dimensions for broadcasting (batch_size, 1, 1, seq_len)
        
        # Multi-head attention
        attn_out = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))

        # Feed-forward network
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_out))

        return x


### 1.5. Decoder Layer

In [113]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, tgt_mask):
        # Self-attention
        self_attn_out = self.self_attention(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_out))

        # Encoder-decoder attention
        enc_dec_attn_out = self.enc_dec_attention(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(enc_dec_attn_out))

        # Feed-forward network
        ffn_out = self.ffn(x)
        x = self.norm3(x + self.dropout(ffn_out))
        return x


## Custom Modifications

### 1. Hierarchical Attention

In [114]:
class HierarchicalAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(HierarchicalAttention, self).__init__()
        self.token_attention = MultiHeadAttention(d_model, num_heads)
        self.sentence_attention = MultiHeadAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, token_representations, sentence_representations, token_mask=None, sentence_mask=None):
        # Token-level attention
        token_context = self.token_attention(
            token_representations, token_representations, token_representations, token_mask
        )
        token_context = self.norm1(token_representations + token_context)

        # Sentence-level attention
        sentence_context = self.sentence_attention(
            sentence_representations, sentence_representations, sentence_representations, sentence_mask
        )
        sentence_context = self.norm2(sentence_representations + sentence_context)

        return token_context, sentence_context


### 2. Gated Linear Units (GLU) in Feed-Forward Networks

In [115]:
class GatedFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(GatedFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_model, d_ff)
        self.linear3 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        gate = torch.sigmoid(self.linear2(x))
        return self.linear3(self.dropout(self.linear1(x) * gate))


### 3. Sparse Attention

In [116]:
class SparseAttention(nn.Module):
    def __init__(self, d_model, num_heads, window_size):
        super(SparseAttention, self).__init__()
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        self.window_size = window_size

    def forward(self, query, key, value, mask=None):
        batch_size, seq_len, _ = query.size()

        # Split into windows
        windows = seq_len // self.window_size
        sparse_mask = mask[:, :, :windows * self.window_size]
        sparse_query = query[:, :windows * self.window_size, :]
        sparse_key = key[:, :windows * self.window_size, :]
        sparse_value = value[:, :windows * self.window_size, :]

        # Apply attention only to windows
        return self.multi_head_attention(sparse_query, sparse_key, sparse_value, sparse_mask)


## Step 2: Full Transformer Architecture

### Initialize the model with the custom transformer architecture

In [117]:
class CustomEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(CustomEncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sentence_representations, token_mask, sentence_mask):
        # Self-attention
        attn_out = self.attention(x, x, x, token_mask)
        x = self.norm1(x + self.dropout(attn_out))

        # Feed-forward network
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout(ffn_out))

        return x, sentence_representations

### Combine the encoder and decoder layers

In [None]:
class CustomTransformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_len, dropout=0.1):
        super(CustomTransformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        # Use CustomEncoderLayer instead of EncoderLayer
        self.encoder = nn.ModuleList([CustomEncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder = nn.ModuleList([CustomEncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, tgt, src_mask, tgt_mask):
        # Ensure masks have the correct shape
        src_mask = src_mask.unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)
        tgt_mask = tgt_mask.unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_len)

        # Encoder
        src = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        sentence_representations = src.mean(dim=1, keepdim=True)  # Example of hierarchical representation (e.g., averaging tokens per sentence)

        # Pass through custom encoder layers
        for layer in self.encoder:
            src = layer(src, src_mask)

        # Decoder
        tgt = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))
        for layer in self.decoder:
            tgt = layer(tgt, src_mask)  # Only passing source mask to decoder

        return self.fc_out(tgt)


### Initialize the model with the custom transformer architecture

In [118]:
model = CustomTransformer(
    src_vocab_size=5000,  # Example vocab size
    tgt_vocab_size=5000,  # Target vocab size (can be same as source)
    d_model=512,          # Embedding dimension
    num_heads=8,          # Number of attention heads
    num_layers=6,         # Number of layers in encoder/decoder
    d_ff=2048,            # Feed-forward dimension
    max_len=512,          # Maximum sequence length
    dropout=0.1           # Dropout rate
)

# Example to check if everything is working
print(model)


CustomTransformer(
  (encoder_embedding): Embedding(5000, 512)
  (decoder_embedding): Embedding(5000, 512)
  (positional_encoding): PositionalEncoding()
  (encoder): ModuleList(
    (0-5): 6 x CustomEncoderLayer(
      (attention): MultiHeadAttention(
        (q_linear): Linear(in_features=512, out_features=512, bias=True)
        (k_linear): Linear(in_features=512, out_features=512, bias=True)
        (v_linear): Linear(in_features=512, out_features=512, bias=True)
        (fc_out): Linear(in_features=512, out_features=512, bias=True)
        (softmax): Softmax(dim=-1)
      )
      (ffn): FeedForward(
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(

In [119]:
# Example input tensors for testing
batch_size = 32
seq_len = 100  # Sequence length
src_vocab_size = 5000  # Example vocab size
tgt_vocab_size = 5000  # Example vocab size

# Create random input tensors to simulate a batch
src = torch.randint(0, src_vocab_size, (batch_size, seq_len))  # Source sequence
tgt = torch.randint(0, tgt_vocab_size, (batch_size, seq_len))  # Target sequence

# Create attention masks (usually 1 for valid tokens, 0 for padding)
src_mask = torch.ones((batch_size, seq_len))  # Source mask (all valid tokens)
tgt_mask = torch.ones((batch_size, seq_len))  # Target mask (all valid tokens)

# Instantiate the model (CustomTransformer)
model = CustomTransformer(
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    d_model=512,
    num_heads=8,
    num_layers=6,
    d_ff=2048,
    max_len=512,
    dropout=0.1
)

# Perform a forward pass to test the model
output = model(src, tgt, src_mask, tgt_mask)

# Print the output shape to verify the dimensions
print("Output shape:", output.shape)
# Expected output shape: (batch_size, tgt_seq_len, tgt_vocab_size)


AttributeError: 'tuple' object has no attribute 'size'