In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import numpy

In [8]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to a batch of embeddings.

  A table for ``max_len`` positions is computed once at construction and
  stored in the buffer ``pe`` with shape ``(1, max_len, d_model)``. Even
  feature columns hold sines and odd feature columns hold cosines of
  ``position * exp(-ln(10000) * 2i / d_model)``.

  Args:
    d_model: Size of the feature (last) dimension. Odd values are supported.
    max_len: Number of positions to precompute.
  """

  def __init__(self, d_model, max_len=5000):
    super().__init__()
    pe = torch.zeros(max_len, d_model)
    posi = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = posi * div_term  # (max_len, ceil(d_model / 2))
    pe[:, 0::2] = torch.sin(angles)
    # When d_model is odd there is one fewer odd column than even columns,
    # so trim the angle table to match; for even d_model this is a no-op.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    # Leading singleton dimension lets the table broadcast over the batch.
    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

    Args:
      x: Tensor whose dimension 1 indexes positions and whose last dimension
        is ``d_model``. ``x.size(1)`` must not exceed ``max_len``.

    Returns:
      Tensor of the same shape as ``x``.
    """
    return x + self.pe[:, :x.size(1)]


class MultiHeadSelfAttention(nn.Module):
  """Causal multi-head scaled dot-product self-attention.

  Queries, keys and values are linear projections of the same input. Each
  position may attend only to itself and to earlier positions; later
  positions are masked out before the softmax.

  Args:
    d_model: Feature size of the input and output. Must be divisible by
      ``num_heads``.
    num_heads: Number of attention heads.
    dropout: Dropout probability applied to the attention weights.
  """

  def __init__(self, d_model, num_heads, dropout=0.1):
    super().__init__()
    assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
    self.num_heads = num_heads
    self.d_k = d_model // num_heads

    self.query = nn.Linear(d_model, d_model)
    self.key   = nn.Linear(d_model, d_model)
    self.value = nn.Linear(d_model, d_model)
    self.out   = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Apply causal self-attention.

    Args:
      x: Tensor of shape ``(batch_size, seq_length, d_model)``.

    Returns:
      Tensor of shape ``(batch_size, seq_length, d_model)``.
    """
    batch_size, seq_length, d_model = x.size()

    def split_heads(t):
      # (batch, seq, d_model) -> (batch, heads, seq, d_k)
      return t.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    Q = split_heads(self.query(x))
    K = split_heads(self.key(x))
    V = split_heads(self.value(x))

    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

    # Build the causal mask directly on the input's device as a bool tensor:
    # entry (i, j) is True when key position j lies after query position i.
    # This avoids allocating a float matrix on CPU and copying it across
    # devices on every call.
    positions = torch.arange(seq_length, device=x.device)
    mask = positions.unsqueeze(0) > positions.unsqueeze(1)
    scores = scores.masked_fill(mask, float('-inf'))

    attn = F.softmax(scores, dim=-1)
    attn = self.dropout(attn)
    context = torch.matmul(attn, V)
    # Merge the heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model)
    context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, d_model)
    return self.out(context)

class FeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    The pipeline is Linear(d_model -> d_ff), GELU, Dropout,
    Linear(d_ff -> d_model), Dropout, held in ``self.net``.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size between the two linear layers.
        dropout: Probability used by both dropout layers.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        expand = nn.Linear(d_model, d_ff)
        activation = nn.GELU()
        hidden_drop = nn.Dropout(dropout)
        project = nn.Linear(d_ff, d_model)
        output_drop = nn.Dropout(dropout)
        # Positional registration keeps the sub-module indices 0..4.
        self.net = nn.Sequential(expand, activation, hidden_drop, project, output_drop)

    def forward(self, x):
        """Run ``x`` through the stored sequential pipeline and return the result."""
        transformed = self.net(x)
        return transformed


class transformerblock(nn.Module):
  """One transformer layer: self-attention then feed-forward.

  Each sub-layer's output goes through dropout, is added back to its input,
  and the sum is layer-normalized (normalization happens after the residual
  addition). The same dropout module is used for both residual branches.

  Args:
    d_model: Feature size of the input and output.
    num_heads: Number of heads passed to the attention sub-layer.
    d_ff: Hidden size passed to the feed-forward sub-layer.
    dropout: Dropout probability shared by the sub-layers and the residuals.
  """

  def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
    super().__init__()
    self.attn = MultiHeadSelfAttention(d_model, num_heads, dropout)
    self.ln1 = nn.LayerNorm(d_model)
    self.ff = FeedForward(d_model, d_ff, dropout)
    self.ln2 = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Return the block output; same shape as ``x``."""
    # Attention sub-layer with residual connection, then normalization.
    after_attention = self.ln1(x + self.dropout(self.attn(x)))
    # Feed-forward sub-layer with residual connection, then normalization.
    return self.ln2(after_attention + self.dropout(self.ff(after_attention)))


In [11]:
class GPT(nn.Module):
    """Decoder-style language model built from a stack of transformer blocks.

    Token ids are embedded, combined with positional encodings, passed through
    ``num_layers`` blocks, layer-normalized, and projected to vocabulary
    logits.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output logits.
        d_model: Embedding / hidden feature size.
        num_layers: Number of transformer blocks.
        num_heads: Attention heads per block.
        d_ff: Hidden size of each block's feed-forward network.
        max_len: Number of positions handed to the positional encoding.
        dropout: Dropout probability forwarded to every block.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, max_len, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

        layers = []
        for _ in range(num_layers):
            layers.append(transformerblock(d_model, num_heads, d_ff, dropout))
        self.blocks = nn.ModuleList(layers)

        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)
        self._init_weights()

    def _init_weights(self):
        """Re-initialize every Linear and Embedding sub-module in place.

        Linear weights get Xavier-uniform values and zero biases; Embedding
        weights are drawn from a normal distribution with std 0.02.
        """
        for module in self.modules():
            if isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0, std=0.02)
                continue
            if not isinstance(module, nn.Linear):
                continue
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)

    def forward(self, x):
        """Map token ids ``x`` to logits with a trailing ``vocab_size`` dimension."""
        hidden = self.positional_encoding(self.token_embedding(x))
        for block in self.blocks:
            hidden = block(hidden)
        return self.head(self.ln_f(hidden))

In [12]:
if __name__ == '__main__':
    # --- Model configuration ---
    vocab_size = 10000   # number of entries in the token vocabulary
    max_len = 128        # longest sequence the positional table covers
    d_model = 128        # width of embeddings and hidden states
    d_ff = 512           # hidden width of each feed-forward network
    num_heads = 4        # attention heads per block
    num_layers = 4       # how many transformer blocks are stacked
    dropout = 0.1        # dropout probability

    # --- Build the model and show its module tree ---
    model = GPT(
        vocab_size=vocab_size,
        d_model=d_model,
        num_layers=num_layers,
        num_heads=num_heads,
        d_ff=d_ff,
        max_len=max_len,
        dropout=dropout,
    )
    print(model)

GPT(
  (token_embedding): Embedding(10000, 128)
  (positional_encoding): PositionalEncoding()
  (blocks): ModuleList(
    (0-3): 4 x transformerblock(
      (attn): MultiHeadSelfAttention(
        (query): Linear(in_features=128, out_features=128, bias=True)
        (key): Linear(in_features=128, out_features=128, bias=True)
        (value): Linear(in_features=128, out_features=128, bias=True)
        (out): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=512, out_features=128, bias=True)
          (4): Dropout(p=0.1, inplace=False)
        )
      )
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (d