In [None]:
import torch
from torch import nn
import math
from torch.nn import functional as F

In [None]:
class ImputEmbeddings(nn.Module):
  """Token-embedding lookup whose output is scaled by sqrt(d_model).

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector.
  """
  def __init__(self, vocab_size: int, d_model: int) -> None:
    super().__init__()
    self.d_model = d_model
    self.embed = nn.Embedding(vocab_size, d_model)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Look up the embeddings for the ids in ``x`` and multiply them by sqrt(d_model)."""
    scale = math.sqrt(self.d_model)
    vectors = self.embed(x)
    return vectors * scale

In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position information to a batch of embeddings.

  For position ``pos`` and even column ``2i`` the table holds
  ``sin(pos * 10000^(-2i/d_model))``; the following odd column holds the
  cosine of the same angle.

  Args:
    d_model: width of each embedding vector (odd values are supported).
    seq_len: number of positions pre-computed in the table.
    dropout: dropout probability applied after the addition.
  """
  def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
    super().__init__()
    self.dropout = nn.Dropout(dropout)

    # (seq_len, 1) column of position indices.
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    # Inverse frequencies 10000^(-2i/d_model), one per sin/cos column pair.
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    # div_term is already the reciprocal of the wavelength term, so the angle
    # is a product; dividing by it would invert every frequency.
    angles = position * div_term

    pe = torch.zeros(seq_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # For an odd d_model there is one fewer odd column than even columns.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    # The input is (batch_size, seq_len, d_model) while the table is
    # (seq_len, d_model), so a leading batch axis is added for broadcasting.
    # Buffers follow .to()/.cuda() with the module; persistent=False keeps
    # them out of state_dict so its keys are unchanged.
    self.register_buffer("sentences", position, persistent=False)
    self.register_buffer("div_term", div_term, persistent=False)
    self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Add the first ``x.size(1)`` rows of the table to ``x``, then apply dropout."""
    # pe is a constant buffer, not a learned parameter, so no gradient flows into it.
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [None]:
class FeedForward(nn.Module):
  """Two-layer position-wise network: Linear -> ReLU -> Dropout -> Linear.

  Args:
    d_model: input and output width.
    d_ff: width of the hidden layer.
    dropout: dropout probability applied to the hidden activations.
  """
  def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
    super().__init__()
    self.fc1 = nn.Linear(d_model, d_ff)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(d_ff, d_model)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Expand to d_ff, apply ReLU and dropout, then project back to d_model."""
    hidden = self.fc1(x)
    hidden = self.relu(hidden)
    hidden = self.dropout(hidden)
    return self.fc2(hidden)

In [None]:
class AddNormLayer(nn.Module):
  """Normalises the last dimension with a learnable scalar scale and shift.

  Despite the name, this layer only normalises; any residual addition has
  to be done by the caller before invoking it.

  Args:
    eps: small constant added to the standard deviation to avoid division
      by zero.
  """
  def __init__(self, eps: float = 10**-6) -> None:
    super().__init__()
    # Scale starts at 1 and shift at 0 so that a freshly built layer returns
    # the plain normalised input (zero mean along the last dimension).
    self.alpha = nn.Parameter(torch.tensor(1.0))
    self.beta = nn.Parameter(torch.tensor(0.0))
    self.eps = eps

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Return ``alpha * (x - mean) / (std + eps) + beta`` over the last dimension."""
    mean = x.mean(-1, keepdim=True)
    # torch.std defaults to the unbiased (n - 1) estimator.
    std = x.std(-1, keepdim=True)
    out = self.alpha * (x - mean) / (std + self.eps) + self.beta
    return out

In [None]:
class ResidualConnection(nn.Module):
  """Skip connection: ``x + dropout(norm(sublayer_output))``.

  Args:
    dropout: dropout probability applied to the normalised sub-layer output.
  """
  def __init__(self, dropout: float) -> None:
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.norm = AddNormLayer()

  def forward(self, x, sublayer):
    """Combine ``x`` with a sub-layer result.

    Args:
      x: input tensor carried over the skip connection.
      sublayer: either the already-computed sub-layer output (a tensor), or
        a callable that is invoked as ``sublayer(x)`` to produce it.
    """
    # Accept a callable as well as a tensor; previously a callable crashed
    # inside the norm because it was treated as a tensor.
    sublayer_out = sublayer(x) if callable(sublayer) else sublayer
    return x + self.dropout(self.norm(sublayer_out))

In [None]:
class MultiHeadAttention(nn.Module):
  """Scaled dot-product attention split across ``h`` heads.

  Args:
    d_model: width of the input and output vectors; must be divisible by ``h``.
    h: number of attention heads.
    dropout: dropout probability applied to the attention weights.

  Note:
    The output projection is ``out_linear``. An earlier revision also built
    an identical but never-used ``w_o`` layer; it has been dropped, so load
    checkpoints saved from that revision with ``strict=False``.
  """
  def __init__(self, d_model: int, h: int, dropout: float) -> None:
    super().__init__()
    # Validate before deriving d_k so a bad configuration fails immediately.
    assert d_model % h == 0, "d_model must be divisible by h"
    self.d_model = d_model
    self.h = h
    self.d_k = d_model // h
    self.w_k = nn.Linear(d_model, d_model)
    self.w_q = nn.Linear(d_model, d_model)
    self.w_v = nn.Linear(d_model, d_model)
    self.out_linear = nn.Linear(d_model, d_model)
    self.dropout = nn.Dropout(dropout)

  def _split_heads(self, x: torch.Tensor) -> torch.Tensor:
    """(batch_size, seq_len, d_model) -> (batch_size, h, seq_len, d_k)."""
    return x.view(x.shape[0], x.shape[1], self.h, self.d_k).transpose(1, 2)

  def forward(self, q, k, v, mask) -> torch.Tensor:
    """Attend from ``q`` over ``k``/``v``.

    Args:
      q, k, v: tensors of shape (batch_size, seq_len, d_model); ``k`` and
        ``v`` must share their sequence length.
      mask: ``None``, or a tensor broadcastable to
        (batch_size, h, q_len, k_len) in which 0 marks positions that must
        not be attended to.

    Returns:
      Tensor of shape (batch_size, q_len, d_model).
    """
    query = self._split_heads(self.w_q(q))
    key = self._split_heads(self.w_k(k))
    value = self._split_heads(self.w_v(v))

    # (batch_size, h, q_len, d_k) @ (batch_size, h, d_k, k_len) -> (batch_size, h, q_len, k_len)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
    if mask is not None:
      # The most negative finite value is used instead of -inf so that a
      # fully masked row yields uniform weights rather than NaN.
      scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    # Normalise over the key axis. Without an explicit dim, a 4-D input is
    # normalised over dim 1 (the heads), which is not attention.
    attention_weights = F.softmax(scores, dim=-1)
    attention_weights = self.dropout(attention_weights)

    # (batch_size, h, q_len, k_len) @ (batch_size, h, k_len, d_k) -> (batch_size, h, q_len, d_k)
    attended = torch.matmul(attention_weights, value)
    # (batch_size, h, q_len, d_k) -> (batch_size, q_len, h, d_k) -> (batch_size, q_len, d_model)
    attended = attended.transpose(1, 2).contiguous().view(attended.shape[0], -1, self.d_model)
    return self.out_linear(attended)

In [None]:
class Linear(nn.Module):
  """Projects d_model-wide vectors to vocab_size-wide vectors with one nn.Linear.

  Args:
    d_model: width of the incoming vectors.
    vocab_size: width of the projected output.
  """
  def __init__(self, d_model: int, vocab_size: int) -> None:
    super().__init__()
    self.linear = nn.Linear(in_features=d_model, out_features=vocab_size)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Apply the projection to the last dimension of ``x``."""
    projected = self.linear(x)
    return projected

In [None]:
class Softmax(nn.Module):
  """Softmax over the last dimension.

  Args:
    d_model: accepted but not used by this layer.
    vocab_size: accepted but not used by this layer.
  """
  def __init__(self, d_model: int, vocab_size: int) -> None:
    super().__init__()
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Return ``x`` normalised so that the last dimension sums to 1."""
    probabilities = self.softmax(x)
    return probabilities

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward, each wrapped as
  ``norm(x + dropout(sublayer(x)))``.

  Args:
    d_model: width of the token vectors.
    h: number of attention heads.
    d_ff: hidden width of the feed-forward network.
    dropout: dropout probability, used inside the sub-layers and on each
      sub-layer output before it is added back to its input.
  """
  def __init__(self, d_model: int, h: int, d_ff: int, dropout: float) -> None:
    super().__init__()
    self.attention = MultiHeadAttention(d_model, h, dropout)
    self.add_norm = AddNormLayer()
    self.feed_forward = FeedForward(d_model, d_ff, dropout)
    self.add_norm2 = AddNormLayer()
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, mask) -> torch.Tensor:
    """Run the block on ``x``; ``mask`` is passed through to self-attention."""
    # The residual additions keep the input on the path; without them each
    # sub-layer output would simply replace x.
    x = self.add_norm(x + self.dropout(self.attention(x, x, x, mask)))
    x = self.add_norm2(x + self.dropout(self.feed_forward(x)))
    return x

In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: masked self-attention, cross-attention over the
  encoder output, then feed-forward, each wrapped as
  ``norm(x + dropout(sublayer(x)))``.

  Args:
    d_model: width of the token vectors.
    h: number of attention heads.
    d_ff: hidden width of the feed-forward network.
    vocab_size: accepted for signature compatibility; not used by this block.
    dropout: dropout probability, used inside the sub-layers and on each
      sub-layer output before it is added back to its input.
  """
  def __init__(self, d_model: int, h: int, d_ff: int, vocab_size: int, dropout: float) -> None:
    super().__init__()
    self.masked_attention = MultiHeadAttention(d_model, h, dropout)
    self.add_norm = AddNormLayer()
    self.cross_attention = MultiHeadAttention(d_model, h, dropout)
    self.add_norm2 = AddNormLayer()
    self.feed_forward = FeedForward(d_model, d_ff, dropout)
    self.add_norm3 = AddNormLayer()
    self.dropout = nn.Dropout(dropout)

  def forward(self, x, encoder_output, src_mask, trg_mask) -> torch.Tensor:
    """Run the block.

    Args:
      x: decoder-side input.
      encoder_output: used as keys and values of the cross-attention.
      src_mask: mask passed to the cross-attention.
      trg_mask: mask passed to the masked self-attention.
    """
    # The residual additions keep the input on the path; without them each
    # sub-layer output would simply replace x.
    x = self.add_norm(x + self.dropout(self.masked_attention(x, x, x, trg_mask)))
    x = self.add_norm2(x + self.dropout(self.cross_attention(x, encoder_output, encoder_output, src_mask)))
    x = self.add_norm3(x + self.dropout(self.feed_forward(x)))
    return x

In [None]:
class Transformer(nn.Module):
  """Encoder-decoder model built from stacks of EncoderBlock / DecoderBlock.

  Source and target ids share one embedding table and one positional
  encoding.

  Args:
    d_model: width of the token vectors.
    h: number of attention heads per block.
    d_ff: hidden width of the feed-forward networks.
    vocab_size: size of the shared vocabulary and of the output distribution.
    seq_len: number of positions pre-computed by the positional encoding.
    dropout: dropout probability passed to every sub-module.
    num_blocks: number of encoder blocks and of decoder blocks (default 6).

  Note:
    Earlier revisions also created stand-alone ``encoder`` and ``decoder``
    blocks that ``forward`` never used; they have been removed, so load
    checkpoints saved from those revisions with ``strict=False``.
  """
  def __init__(self, d_model: int, h: int, d_ff: int, vocab_size: int, seq_len: int, dropout: float, num_blocks: int = 6) -> None:
    super().__init__()
    self.num_blocks = num_blocks
    self.embedding = ImputEmbeddings(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, seq_len, dropout)
    self.encoder_model = nn.ModuleList([EncoderBlock(d_model, h, d_ff, dropout) for _ in range(self.num_blocks)])
    self.decoder_model = nn.ModuleList([DecoderBlock(d_model, h, d_ff, vocab_size, dropout) for _ in range(self.num_blocks)])
    self.linear = Linear(d_model, vocab_size)
    self.softmax = Softmax(d_model, vocab_size)

  def forward(self, src, trg, src_mask, trg_mask) -> torch.Tensor:
    """Encode ``src``, decode ``trg`` against it and return probabilities.

    Args:
      src: source token ids.
      trg: target token ids.
      src_mask: mask given to encoder self-attention and decoder cross-attention.
      trg_mask: mask given to decoder self-attention.

    Returns:
      Softmax probabilities over the vocabulary for every target position.
      These are probabilities, not logits, so they must not be fed to a loss
      that applies its own softmax.
    """
    src = self.pos_encoding(self.embedding(src))
    trg = self.pos_encoding(self.embedding(trg))
    for layer in self.encoder_model:
      src = layer(src, src_mask)
    # Every decoder block attends to the output of the last encoder block.
    for layer in self.decoder_model:
      trg = layer(trg, src, src_mask, trg_mask)
    logits = self.linear(trg)
    return self.softmax(logits)

In [None]:
# Build a model instance; the bare name on the last line makes the notebook
# display its module tree.
transformer = Transformer(
  d_model=512,
  h=8,
  d_ff=2048,
  vocab_size=100000,
  seq_len=10000,
  dropout=0.1,
)
transformer

Transformer(
  (encoder): EncoderBlock(
    (attention): MultiHeadAttention(
      (w_k): Linear(in_features=512, out_features=512, bias=True)
      (w_q): Linear(in_features=512, out_features=512, bias=True)
      (w_v): Linear(in_features=512, out_features=512, bias=True)
      (w_o): Linear(in_features=512, out_features=512, bias=True)
      (out_linear): Linear(in_features=512, out_features=512, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (add_norm): AddNormLayer()
    (feed_forward): FeedForward(
      (fc1): Linear(in_features=512, out_features=2048, bias=True)
      (relu): ReLU()
      (fc2): Linear(in_features=2048, out_features=512, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (add_norm2): AddNormLayer()
  )
  (decoder): DecoderBlock(
    (masked_attention): MultiHeadAttention(
      (w_k): Linear(in_features=512, out_features=512, bias=True)
      (w_q): Linear(in_features=512, out_features=512, bias=True)
      (w_v): Linear(in

In [None]:
# Parameters
batch_size = 2
seq_len = 10
vocab_size = 1000
d_model = 512
h = 8
d_ff = 2048
dropout = 0.1

# Seed the generator so the random ids and the model weights are the same on
# every "Restart & Run All".
torch.manual_seed(0)

# Random inputs (token ids)
src = torch.randint(0, vocab_size, (batch_size, seq_len))  # [batch_size, seq_len]
trg = torch.randint(0, vocab_size, (batch_size, seq_len))  # [batch_size, seq_len]

# Simple masks (None for now; all-ones masks would behave the same)
src_mask = None
trg_mask = None

# Build the model
model = Transformer(d_model, h, d_ff, vocab_size, seq_len, dropout)

# Feed the inputs through the model; this is only a shape check, so no
# autograd graph is needed.
with torch.no_grad():
  output = model(src, trg, src_mask, trg_mask)

# Fail loudly if the shape is wrong instead of relying on reading the print.
assert output.shape == (batch_size, seq_len, vocab_size)
print("Çıkış boyutu:", output.shape)  # [batch_size, seq_len, vocab_size]


  attention_weights = F.softmax(scores)


Çıkış boyutu: torch.Size([2, 10, 1000])
