# Positional Representation Variants for LLMs

## Overview

Positional encodings are crucial for transformers to understand sequence order. This notebook covers:

- **RoPE (Rotary Position Embedding)**: Rotation-based position encoding
- **ALiBi (Attention with Linear Biases)**: Bias-based positional information
- **Relative Position Encodings**: Distance-based position representations
- **Performance Comparison**: Evaluating different positional schemes

Let's implement and compare various positional encoding methods.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import math
from typing import Optional, Tuple

# Sanity check: reaching this line means every import above succeeded.
print("Libraries imported successfully!")

## 1. RoPE (Rotary Position Embedding)

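RoPE encodes position by rotating each pair of query/key dimensions by an angle proportional to the token's position, so the query–key dot product depends only on the relative offset between tokens. The cell below implements RoPE attention and, for comparison, an ALiBi attention module: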

In [None]:
class RoPEAttention(nn.Module):
    """Multi-head self-attention with Rotary Position Embedding (RoPE).

    Queries and keys are split into (even, odd) feature pairs and each pair
    is rotated by an angle ``position * inv_freq`` before the dot product.
    No attention mask is applied: every position attends to every other
    position.

    Args:
        d_model: Width of the input/output features. Must be divisible by
            ``n_heads``.
        n_heads: Number of attention heads. ``d_model // n_heads`` must be
            even because RoPE rotates features in pairs.
        max_seq_len: Number of positions precomputed in the cos/sin cache.
            Longer inputs trigger a rebuild of the cache in ``forward``.

    Raises:
        ValueError: If ``n_heads`` is not positive, ``d_model`` is not
            divisible by ``n_heads``, or the per-head width is odd.
    """

    def __init__(self, d_model: int, n_heads: int, max_seq_len: int = 2048):
        super().__init__()
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        if (d_model // n_heads) % 2 != 0:
            # With an odd head width the even/odd slices have different
            # sizes and would silently broadcast into a wrong result.
            raise ValueError(
                f"RoPE needs an even head dimension, got {d_model // n_heads}"
            )

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.max_seq_len = max_seq_len

        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        # Register the cache slots first so the tensors assigned in
        # _build_rope_cache are tracked as buffers (and follow .to()/.cuda()).
        self.register_buffer('cos_cached', None)
        self.register_buffer('sin_cached', None)
        self._build_rope_cache(max_seq_len)

    def _build_rope_cache(
        self,
        seq_len: int,
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> None:
        """Build the cos/sin tables for positions ``0 .. seq_len - 1``.

        Args:
            seq_len: Number of positions to precompute.
            device: Target device. Defaults to the device of the existing
                cache, or the PyTorch default device on first build.
            dtype: Target dtype. Defaults to the dtype of the existing
                cache, or float32 on first build.

        The results are stored in ``self.cos_cached`` / ``self.sin_cached``,
        each of shape ``[seq_len, d_head // 2]``.
        """
        # A rebuild must land where the module currently lives; otherwise a
        # model moved to GPU (or cast to another dtype) breaks on the first
        # sequence longer than the original cache.
        existing = getattr(self, 'cos_cached', None)
        if existing is not None:
            if device is None:
                device = existing.device
            if dtype is None:
                dtype = existing.dtype

        # One inverse frequency per rotated feature pair.
        pair_index = torch.arange(0, self.d_head, 2, device=device).float()
        inv_freq = 1.0 / (10000 ** (pair_index / self.d_head))

        # Angle for every (position, pair) combination.
        positions = torch.arange(seq_len, device=device).type_as(inv_freq)
        freqs = positions[:, None] * inv_freq[None, :]

        cos = freqs.cos()
        sin = freqs.sin()
        if dtype is not None:
            cos = cos.to(dtype)
            sin = sin.to(dtype)

        self.cos_cached = cos
        self.sin_cached = sin

    def apply_rope(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        """Rotate the feature pairs of ``x`` by their position-dependent angles.

        Args:
            x: Tensor of shape ``[batch, heads, seq_len, head_dim]``.
            cos: Cosine table of shape ``[>= seq_len, head_dim // 2]``.
            sin: Sine table with the same shape as ``cos``.

        Returns:
            Tensor with the same shape as ``x`` where each (even, odd)
            feature pair has been rotated.
        """
        seq_len = x.shape[2]

        # Trim the tables to the current length and add batch/head axes:
        # [1, 1, seq_len, head_dim // 2]
        cos = cos[:seq_len, :].unsqueeze(0).unsqueeze(0)
        sin = sin[:seq_len, :].unsqueeze(0).unsqueeze(0)

        # Each (even, odd) feature pair is treated as a 2-D vector.
        x_even = x[..., ::2]
        x_odd = x[..., 1::2]

        # Standard 2-D rotation by the cached angle.
        rot_even = x_even * cos - x_odd * sin
        rot_odd = x_even * sin + x_odd * cos

        # Re-interleave so feature 2k / 2k+1 end up back in their slots.
        return torch.stack([rot_even, rot_odd], dim=-1).flatten(-2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply RoPE self-attention.

        Args:
            x: Input of shape ``[batch, seq_len, d_model]``.

        Returns:
            Tensor of shape ``[batch, seq_len, d_model]``.
        """
        B, T, C = x.shape

        # Grow the cache on demand; it keeps the device/dtype of the old one.
        if T > self.cos_cached.shape[0]:
            self._build_rope_cache(T)

        # Joint projection, then split into Q, K, V.
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)

        # [B, T, C] -> [B, n_heads, T, d_head]
        q = q.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        k = k.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(B, T, self.n_heads, self.d_head).transpose(1, 2)

        # Position is injected into queries and keys only; values stay as-is.
        q = self.apply_rope(q, self.cos_cached, self.sin_cached)
        k = self.apply_rope(k, self.cos_cached, self.sin_cached)

        # Scaled dot-product attention (no mask).
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)
        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)

        # Merge heads back and project.
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(out)

class ALiBiAttention(nn.Module):
    """Multi-head self-attention with Attention with Linear Biases (ALiBi).

    Instead of modifying the inputs, a per-head linear penalty proportional
    to the query-key distance is added to the attention scores. No attention
    mask is applied here, so the penalty is symmetric: a key ``d`` positions
    away receives ``-slope * d`` whether it lies before or after the query.

    Args:
        d_model: Width of the input/output features. Must be divisible by
            ``n_heads``.
        n_heads: Number of attention heads.

    Raises:
        ValueError: If ``n_heads`` is not positive or ``d_model`` is not
            divisible by ``n_heads``.
    """

    def __init__(self, d_model: int, n_heads: int):
        super().__init__()
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )

        self.d_model = d_model
        self.n_heads = n_heads
        self.d_head = d_model // n_heads

        self.qkv_proj = nn.Linear(d_model, 3 * d_model)
        self.out_proj = nn.Linear(d_model, d_model)

        # Slopes are fixed (not learned) but must follow the module's device.
        self.register_buffer('slopes', self._get_alibi_slopes(n_heads))

    def _get_alibi_slopes(self, n_heads: int) -> torch.Tensor:
        """Return one ALiBi slope per head as a float tensor of shape ``[n_heads]``.

        For a power-of-two head count ``n`` the slopes are the geometric
        sequence ``2^(-8/n), 2^(-16/n), ...``. Otherwise the slopes of the
        closest lower power of two are used and topped up with every other
        slope from the next power of two.
        """
        def get_slopes_power_of_2(n):
            # 2^(-8/n) is both the first slope and the common ratio.
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            return [start * start ** i for i in range(n)]

        if math.log2(n_heads).is_integer():
            slopes = get_slopes_power_of_2(n_heads)
        else:
            closest_power_of_2 = 2 ** math.floor(math.log2(n_heads))
            slopes = get_slopes_power_of_2(closest_power_of_2)
            extra = get_slopes_power_of_2(2 * closest_power_of_2)[0::2]
            slopes.extend(extra[:n_heads - closest_power_of_2])

        return torch.tensor(slopes).float()

    def _get_alibi_bias(self, seq_len: int) -> torch.Tensor:
        """Build the additive attention bias of shape ``[n_heads, seq_len, seq_len]``.

        Entry ``[h, i, j]`` is ``-slopes[h] * |i - j|``. Using the absolute
        distance keeps the bias non-positive everywhere; a signed distance
        would reward far-away future keys because no causal mask is applied
        in ``forward``.
        """
        # Create positions on the same device as the slopes so the product
        # below does not mix CPU and accelerator tensors.
        positions = torch.arange(seq_len, device=self.slopes.device)
        distance = (positions[None, :] - positions[:, None]).abs()
        distance = distance.to(self.slopes.dtype)

        return -distance[None, :, :] * self.slopes[:, None, None]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ALiBi self-attention.

        Args:
            x: Input of shape ``[batch, seq_len, d_model]``.

        Returns:
            Tensor of shape ``[batch, seq_len, d_model]``.
        """
        B, T, C = x.shape

        # Joint projection, then split into Q, K, V.
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)

        # [B, T, C] -> [B, n_heads, T, d_head]
        q = q.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        k = k.view(B, T, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(B, T, self.n_heads, self.d_head).transpose(1, 2)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)

        # [n_heads, T, T] broadcasts over the batch dimension of scores.
        alibi_bias = self._get_alibi_bias(T).to(scores.device)
        scores = scores + alibi_bias

        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, v)

        # Merge heads back and project.
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.out_proj(out)

# Signal that the class definitions in this cell executed without error.
print("Positional encoding modules implemented!")