In [84]:
import torch
from torch import nn

class SiglipVisionEmbeddings(nn.Module):
    """Turn an image batch into a sequence of position-aware patch tokens.

    A strided convolution (kernel size == stride == ``patch_size``) cuts the
    image into non-overlapping patches and projects each one to ``embed_dim``
    channels. A learned embedding, indexed by patch position, is then added.

    Args:
        in_channels: Number of channels of the input image.
        patch_size: Side length of a square patch, in pixels.
        embed_dim: Size of each patch token.
        image_size: Side length of the (square) image the position table is
            sized for; the table holds ``(image_size // patch_size) ** 2`` rows.
    """

    def __init__(self, in_channels: int = 3, patch_size: int = 16, embed_dim: int = 768, image_size: int = 224):
        super().__init__()
        patches_per_side = image_size // patch_size
        # No padding: every output location corresponds to exactly one patch.
        self.patch_embedding = nn.Conv2d(
            in_channels, embed_dim, kernel_size=patch_size, stride=patch_size, padding=0
        )
        self.position_embedding = nn.Embedding(patches_per_side * patches_per_side, embed_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed ``x`` of shape (batch, in_channels, H, W) into (batch, n_patches, embed_dim)."""
        # (batch, C, H, W) -> (batch, embed_dim, H/p, W/p) -> (batch, embed_dim, n_patches)
        # -> (batch, n_patches, embed_dim)
        patch_tokens = self.patch_embedding(x).flatten(start_dim=2).transpose(1, 2)

        # One index per produced token, created on the same device as the tokens.
        token_index = torch.arange(patch_tokens.size(1), device=patch_tokens.device)
        # The (n_patches, embed_dim) lookup broadcasts over the batch dimension.
        return patch_tokens + self.position_embedding(token_index)


SigLIP modeli, CLIP modelinin geliştirilmiş bir halidir. CLIP gibi modeller, metin ve görselleri aynı temsil uzayında birleştirerek çok modlu (multimodal) modellerin önünü açtı. Ancak CLIP modelinin bazı zayıf yönleri vardı.

Contrastive loss (karşılaştırmalı kayıp) yapısı, yalnızca “pozitif” ve “negatif” örnekler arasındaki farkı öğreniyordu. Bu, küçük batch boyutlarında zayıf performans ve dengesiz gradyan akışı gibi sorunlara yol açıyordu.

Google Research, bu eksikleri düzeltmek için SigLIP (Sigmoid Loss for Language-Image Pretraining) adlı modeli tanıttı.


SigLIP, CLIP ile aynı mimari temele sahiptir: bir görsel encoder (ViT veya benzeri) ve bir metin encoder'ı. Asıl fark ise kayıp (loss) fonksiyonundadır. CLIP, görüntü ve metin embedding’lerini normalize edip tüm batch üzerinde bir contrastive cross-entropy loss hesaplar. SigLIP ise sigmoid tabanlı bir binary loss kullanır. Bu değişiklik, modelin davranışında büyük fark yaratır: SigLIP, CLIP’in “karşılaştırmalı” düşünme biçimini bırakıp her görsel-metin çiftini ayrı bir ikili sınıflandırma problemi olarak ele alır.

Gelin bu mimarideki görsel encoder'ı ele alalım. Bunun için öncelikle orijinal (önceden eğitilmiş) modeli yükleyelim.

In [9]:
# Load the pretrained Hugging Face SigLIP vision model; it serves as the reference
# implementation that the hand-written classes in this notebook are compared with.
# NOTE(review): the name `SiglipVisionModel` imported here is re-bound by the custom
# class of the same name defined later in this notebook; the `vision_model` instance
# created below is unaffected by that.
from transformers import AutoProcessor, SiglipVisionModel, SiglipVisionConfig
# NOTE(review): `processor` is not used by any of the cells shown here.
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
# NOTE(review): a freshly constructed config is passed explicitly; presumably it takes
# precedence over the config stored with the checkpoint - confirm in the transformers docs.
vision_model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224", config = SiglipVisionConfig(vision_use_head=True))
# Bare expression: displays the module tree of the reference model.
vision_model

SiglipVisionModel(
  (vision_model): SiglipVisionTransformer(
    (embeddings): SiglipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), padding=valid)
      (position_embedding): Embedding(196, 768)
    )
    (encoder): SiglipEncoder(
      (layers): ModuleList(
        (0-11): 12 x SiglipEncoderLayer(
          (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (self_attn): SiglipAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): SiglipMLP(
            (activation_fn): GELUTanh()
            (fc1): Linear(in_features=768, out_features=3072, bias=Tr

Görüldüğü üzere MLP yapısı klasik LLM'lerdeki yapıya benzemektedir. Aynı şekilde attention yapısı da klasik Transformer mimarisine benzemektedir. Bu yaklaşımları ve mimariyi daha ayrıntılı incelemek için GPT2, BERT ve Gemma modelleri için oluşturduğum notebook'ları inceleyebilirsiniz.

In [85]:
class SiglipAttention(nn.Module):
  """Multi-head scaled dot-product self-attention over a token sequence.

  The input is projected to queries, keys and values, split into ``n_heads``
  heads of size ``embed_dim // n_heads``, attended per head over the sequence
  axis, merged back and passed through an output projection.

  Args:
    embed_dim: Size of each input/output token; must be divisible by ``n_heads``.
    n_heads: Number of attention heads.
      NOTE(review): the default is 8 - confirm it matches the head count of any
      checkpoint whose weights are meant to be loaded into this module.

  Raises:
    AssertionError: If ``embed_dim`` is not divisible by ``n_heads``.
  """

  def __init__(self, embed_dim:int = 768, n_heads:int=8) -> None:
    super().__init__()
    assert embed_dim % n_heads == 0, "embed_dim, n_heads'e tam bölünmelidir..."
    self.d_head = embed_dim // n_heads
    self.n_heads = n_heads
    self.embed_dim = embed_dim

    self.q_proj = nn.Linear(self.embed_dim, self.n_heads * self.d_head, bias = True)
    self.k_proj = nn.Linear(self.embed_dim, self.n_heads * self.d_head, bias = True)
    self.v_proj = nn.Linear(self.embed_dim, self.n_heads * self.d_head, bias = True)
    self.out_proj = nn.Linear(self.n_heads * self.d_head, self.embed_dim, bias = True)

  def _split_heads(self, t:torch.Tensor, batch_size:int, seq_len:int) -> torch.Tensor:
    """Reshape (batch, seq, n_heads * d_head) into (batch, n_heads, seq, d_head)."""
    return t.view(batch_size, seq_len, self.n_heads, self.d_head).transpose(1, 2)

  def forward(self, x:torch.Tensor) -> torch.Tensor:
    """Apply self-attention to ``x`` of shape (batch_size, seq_len, embed_dim).

    Returns:
      A tensor of shape (batch_size, seq_len, embed_dim).
    """
    batch_size, seq_len, _ = x.shape
    # The projections must be split into heads BEFORE the transpose. Transposing the raw
    # (batch, seq, dim) projection would make the softmax run over the feature axis, so
    # tokens would never exchange information.
    q = self._split_heads(self.q_proj(x), batch_size, seq_len)
    k = self._split_heads(self.k_proj(x), batch_size, seq_len)
    v = self._split_heads(self.v_proj(x), batch_size, seq_len)
    # (batch, n_heads, seq, d_head) @ (batch, n_heads, d_head, seq) -> (batch, n_heads, seq, seq)
    attn_scores = (q @ k.transpose(-1, -2)) / (self.d_head ** 0.5)
    # Normalise over the key axis so each query's weights sum to 1.
    attn_probs = torch.softmax(attn_scores, dim=-1)
    # (batch, n_heads, seq, seq) @ (batch, n_heads, seq, d_head) -> (batch, n_heads, seq, d_head)
    attn_output = attn_probs @ v
    # Merge heads: (batch, n_heads, seq, d_head) -> (batch, seq, n_heads * d_head)
    attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, self.n_heads * self.d_head)
    # (batch, seq, n_heads * d_head) -> (batch, seq, embed_dim)
    return self.out_proj(attn_output)

In [86]:
class GELUTanh(nn.Module):
    """GELU activation using the tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x ** 3)))``
    element-wise.

    This delegates to the built-in kernel instead of spelling the formula out.
    The hand-written version created a new float32 constant tensor on every
    call, which also truncated ``sqrt(2 / pi)`` to float32 precision when the
    input was float64.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the activation of ``x``; shape and dtype follow the input."""
        return nn.functional.gelu(x, approximate="tanh")

class SiglipMLP(nn.Module):
    """Position-wise feed-forward block: expand, apply tanh-GELU, project back.

    Args:
        embed_dim: Size of the input and output features.
        hidden_dim: Size of the intermediate (expanded) features.
    """

    def __init__(self, embed_dim: int = 768, hidden_dim: int = 3072) -> None:
        super().__init__()
        self.activation_fn = GELUTanh()
        self.fc1 = nn.Linear(in_features=embed_dim, out_features=hidden_dim, bias=True)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=embed_dim, bias=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map (..., embed_dim) -> (..., hidden_dim) -> (..., embed_dim)."""
        expanded = self.activation_fn(self.fc1(x))
        return self.fc2(expanded)

In [87]:
class SiglipEncoderLayer(nn.Module):
  """One pre-norm transformer encoder block: self-attention then MLP, each with a residual.

  Args:
    embed_dim: Size of each token.
    n_heads: Number of attention heads.
    hidden_dim: Width of the MLP's intermediate layer. This is the third
      positional argument, which is where ``SiglipEncoder`` passes its
      ``hidden_dim``.
    n_layers: Unused. Accepted (keyword-only) solely so that existing calls
      spelling ``n_layers=...`` keep working; the number of layers is a
      property of the encoder, not of a single layer.
  """

  def __init__(self, embed_dim:int = 768, n_heads:int = 8, hidden_dim:int = 3072, *, n_layers:int = 12) -> None:
    super().__init__()
    self.layer_norm1 = nn.LayerNorm(embed_dim, eps = 1e-06, elementwise_affine= True)
    self.self_attn = SiglipAttention(embed_dim, n_heads)
    self.layer_norm2 = nn.LayerNorm(embed_dim, eps = 1e-06, elementwise_affine= True)
    # Forward the requested MLP width instead of silently using SiglipMLP's default.
    self.mlp = SiglipMLP(embed_dim, hidden_dim)

  def forward(self, x:torch.Tensor) -> torch.Tensor:
    """Transform ``x``; the output has the same shape as the input."""
    # Attention sub-block: the skip connection carries the *un-normalised* input.
    residual = x
    x = self.layer_norm1(x)
    x = self.self_attn(x)
    x = residual + x

    # MLP sub-block: same pattern, skipping around norm + MLP.
    residual = x
    x = self.layer_norm2(x)
    x = self.mlp(x)
    x = residual + x
    return x

In [88]:
class SiglipEncoder(nn.Module):
  """A stack of ``n_layers`` encoder layers applied one after another.

  Args:
    embed_dim: Passed to every layer as its first positional argument.
    n_heads: Passed to every layer as its second positional argument.
    hidden_dim: Passed to every layer as its third positional argument.
    n_layers: Number of layers in the stack.
  """

  def __init__(self, embed_dim: int = 768, n_heads: int = 8, hidden_dim: int = 3072, n_layers: int = 12) -> None:
    super().__init__()
    self.layers = nn.ModuleList()
    for _ in range(n_layers):
      self.layers.append(SiglipEncoderLayer(embed_dim, n_heads, hidden_dim))

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Feed ``x`` through every layer in order and return the final result."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden)
    return hidden


In [89]:
class SiglipMultiHeadAttentionPoolingHead(nn.Module):
  """Head applied after the encoder: self-attention, linear projection, LayerNorm, MLP.

  Args:
    embed_dim: Size of each token.
    n_heads: Number of heads of the attention module.
    hidden_dim: Width of the MLP's intermediate layer.

  NOTE(review): despite its name this head does not pool - it returns one
  vector per input token, whereas the Hugging Face reference model exposes a
  pooled ``(batch, embed_dim)`` output. Adding the pooling would change the
  output shape, so it is left for a deliberate follow-up.
  """

  def __init__(self, embed_dim:int =768, n_heads:int = 8, hidden_dim:int = 3072) -> None:
    super().__init__()
    # batch_first=True: the rest of the model passes (batch, seq, dim) tensors. With the
    # default (seq, batch, dim) layout the module would attend across the batch axis,
    # mixing unrelated samples.
    self.attention = nn.MultiheadAttention(embed_dim = embed_dim, num_heads = n_heads, bias = True, batch_first = True)
    self.out_proj = nn.Linear(embed_dim, embed_dim, bias = True)
    self.layernorm = nn.LayerNorm(embed_dim, eps = 1e-06, elementwise_affine= True)
    self.mlp = SiglipMLP(embed_dim, hidden_dim)

  def forward(self, x:torch.Tensor) -> torch.Tensor:
    """Process ``x`` of shape (batch, seq_len, embed_dim); the shape is preserved."""
    # Only the attended values are used, so skip computing the averaged weight matrix.
    x = self.attention(x, x, x, need_weights = False)[0]
    x = self.out_proj(x)
    x = self.layernorm(x)
    x = self.mlp(x)
    return x

In [90]:
class SiglipVisionTransformer(nn.Module):
  """Vision tower: patch embeddings -> encoder stack -> final LayerNorm -> head.

  Args:
    embed_dim: Token size used by every sub-module.
    n_heads: Number of attention heads in the encoder layers and in the head.
    n_layers: Number of encoder layers.
    hidden_dim: Width of the MLPs' intermediate layer.
  """

  def __init__(self, embed_dim:int = 768, n_heads:int = 8, n_layers:int = 12, hidden_dim:int = 3072) -> None:
    super().__init__()
    # Propagate embed_dim so the patch tokens match the width the encoder expects; the
    # remaining embedding settings (channels, patch size, image size) keep their defaults.
    self.embeddings = SiglipVisionEmbeddings(embed_dim=embed_dim)
    self.encoder = SiglipEncoder(embed_dim, n_heads, hidden_dim, n_layers)
    self.post_layernorm = nn.LayerNorm(embed_dim, eps = 1e-06, elementwise_affine= True)
    self.head = SiglipMultiHeadAttentionPoolingHead(embed_dim, n_heads, hidden_dim)

  def forward(self, x:torch.Tensor) -> torch.Tensor:
    """Run an image batch ``x`` through the four stages in order and return the head's output."""
    x = self.embeddings(x)
    x = self.encoder(x)
    x = self.post_layernorm(x)
    x = self.head(x)
    return x

In [91]:
class SiglipVisionModel(nn.Module):
  """Top-level wrapper that holds the vision transformer under ``vision_model``.

  All constructor arguments are handed to ``SiglipVisionTransformer`` unchanged,
  in the order (embed_dim, n_heads, n_layers, hidden_dim).

  NOTE(review): this class has the same name as the ``transformers`` class
  imported earlier in the notebook and re-binds that name once this cell runs.
  """

  def __init__(self, embed_dim:int = 768, n_heads:int = 8, n_layers:int = 12, hidden_dim:int = 3072) -> None:
    super().__init__()
    transformer_args = (embed_dim, n_heads, n_layers, hidden_dim)
    self.vision_model = SiglipVisionTransformer(*transformer_args)

  def forward(self, x:torch.Tensor) -> torch.Tensor:
    """Delegate to the wrapped vision transformer and return its output."""
    return self.vision_model(x)

In [92]:
# Instantiate the hand-written model with its default sizes. No pretrained weights are
# loaded here, so its parameters are randomly initialised.
model = SiglipVisionModel()
# Bare expression: displays the module tree for comparison with the reference model.
model

SiglipVisionModel(
  (vision_model): SiglipVisionTransformer(
    (embeddings): SiglipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (position_embedding): Embedding(196, 768)
    )
    (encoder): SiglipEncoder(
      (layers): ModuleList(
        (0-11): 12 x SiglipEncoderLayer(
          (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (self_attn): SiglipAttention(
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
          (mlp): SiglipMLP(
            (activation_fn): GELUTanh()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
           

In [93]:
# Random stand-in for one 3-channel 224x224 image (values drawn uniformly from [0, 1)).
# NOTE(review): no random seed is set, so the numbers differ between runs.
rand_tensor = torch.rand(1, 3, 224, 224)
# Forward pass through the hand-written model.
model(rand_tensor)

tensor([[[-0.0449, -0.0932, -0.2317,  ..., -0.1659, -0.3987, -0.3212],
         [-0.0445, -0.0932, -0.2318,  ..., -0.1658, -0.3991, -0.3215],
         [-0.0450, -0.0933, -0.2316,  ..., -0.1659, -0.3985, -0.3212],
         ...,
         [-0.0443, -0.0928, -0.2317,  ..., -0.1657, -0.3995, -0.3214],
         [-0.0441, -0.0926, -0.2318,  ..., -0.1657, -0.3999, -0.3215],
         [-0.0448, -0.0935, -0.2318,  ..., -0.1659, -0.3986, -0.3214]]],
       grad_fn=<ViewBackward0>)

In [94]:
# Shape of the hand-written model's output for the random input.
model(rand_tensor).shape

torch.Size([1, 196, 768])

In [95]:
# Run the Hugging Face reference model on the same random input.
vision_model(rand_tensor)

BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.5459,  2.4697,  0.0855,  ..., -0.9363,  1.1687,  0.2339],
         [-0.6831,  1.3061, -3.3035,  ..., -0.0148,  0.4427, -0.1520],
         [-0.9031, -0.6246,  1.1616,  ..., -0.7311,  0.5455,  0.8208],
         ...,
         [-0.8822, -0.9972, -2.1969,  ...,  0.8250,  0.0052, -0.5779],
         [ 0.4814,  1.7879, -0.2659,  ..., -1.5753,  1.8970, -1.1469],
         [-0.3724,  0.0985,  0.8736,  ..., -1.8435,  0.4599,  0.9571]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-1.8030e-01, -1.5814e-01, -8.7938e-03, -1.2028e-01, -1.2303e-02,
          1.0454e-01, -9.6347e-01, -4.1511e-01,  6.9938e-02,  3.9612e-01,
          3.0890e-01,  6.0258e-03,  1.6329e-01, -2.3422e-01,  2.6406e-01,
         -4.4966e-01, -2.6904e-01, -1.2774e-01,  6.9778e-01, -1.9921e-01,
         -4.1621e-02, -2.1980e-01, -2.2761e-01, -3.1168e-01,  8.9810e-03,
          3.8619e-02,  3.4041e-01,  2.2537e-01,  2.7722e-01,  3.7360e-01,
         -2

In [97]:
# Inspect the shapes of the reference model's two outputs: the per-token hidden states
# and the pooled output.
output = vision_model(rand_tensor)
print(output.last_hidden_state.shape)
print(output.pooler_output.shape)


torch.Size([1, 196, 768])
torch.Size([1, 768])
