# Exploratory Analysis on the Semantic Space of TinyCLIP

We want to explore the semantic space of TinyCLIP's visual encoder using ablations.

In [None]:
import vit_prisma 

### Import model

We're just using the visual encoder of TinyCLIP. The visual encoder is a transformer with 12 layers, 8 attention heads, and a hidden representation dimensionality of 512.

In [32]:

from transformers import CLIPProcessor, CLIPModel

# The model weights and the preprocessor are both loaded from the same checkpoint.
tinyclip_checkpoint = "wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M"
model = CLIPModel.from_pretrained(tinyclip_checkpoint)
processor = CLIPProcessor.from_pretrained(tinyclip_checkpoint)


In [38]:
# Display the vision encoder submodule so its layer structure can be inspected.
model.vision_model

CLIPVisionTransformer(
  (embeddings): CLIPVisionEmbeddings(
    (patch_embedding): Conv2d(3, 512, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (position_embedding): Embedding(50, 512)
  )
  (pre_layrnorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (encoder): CLIPEncoder(
    (layers): ModuleList(
      (0-11): 12 x CLIPEncoderLayer(
        (self_attn): CLIPAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): CLIPMLP(
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
        )
   

In [36]:

from transformers import ViTForImageClassification

# Print every key stored in the vision encoder's state dict, one per line.
for state_key in model.vision_model.state_dict():
    print(state_key)

embeddings.class_embedding
embeddings.patch_embedding.weight
embeddings.position_embedding.weight
pre_layrnorm.weight
pre_layrnorm.bias
encoder.layers.0.self_attn.k_proj.weight
encoder.layers.0.self_attn.k_proj.bias
encoder.layers.0.self_attn.v_proj.weight
encoder.layers.0.self_attn.v_proj.bias
encoder.layers.0.self_attn.q_proj.weight
encoder.layers.0.self_attn.q_proj.bias
encoder.layers.0.self_attn.out_proj.weight
encoder.layers.0.self_attn.out_proj.bias
encoder.layers.0.layer_norm1.weight
encoder.layers.0.layer_norm1.bias
encoder.layers.0.mlp.fc1.weight
encoder.layers.0.mlp.fc1.bias
encoder.layers.0.mlp.fc2.weight
encoder.layers.0.mlp.fc2.bias
encoder.layers.0.layer_norm2.weight
encoder.layers.0.layer_norm2.bias
encoder.layers.1.self_attn.k_proj.weight
encoder.layers.1.self_attn.k_proj.bias
encoder.layers.1.self_attn.v_proj.weight
encoder.layers.1.self_attn.v_proj.bias
encoder.layers.1.self_attn.q_proj.weight
encoder.layers.1.self_attn.q_proj.bias
encoder.layers.1.self_attn.out_proj.

In [23]:
# Checkpoint identifier. It was not defined in any visible cell, so a fresh
# top-to-bottom run failed with NameError; the value matches the checkpoint
# named in the warning this cell prints.
model_name = "wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M"

# NOTE: this loads a CLIP checkpoint into a ViT classification class. As the
# warning output of this cell reports, the listed ViT weights are *not* taken
# from the checkpoint and are newly initialised instead.
hf_model = ViTForImageClassification.from_pretrained(model_name)

# Last expression of the cell: display the instantiated module tree.
hf_model

You are using a model of type clip to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.cls_token', 'embeddings.patch_embeddings.projection.bias', 'embeddings.patch_embeddings.projection.weight', 'embeddings.position_embeddings', 'encoder.layer.0.attention.attention.key.bias', 'encoder.layer.0.attention.attention.key.weight', 'encoder.layer.0.attention.attention.query.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.0.attention.attention.value.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [22]:
# Print each parameter's qualified name next to its tensor shape.
for name, param in hf_model.named_parameters():
    print(name, param.shape)

vit.embeddings.cls_token torch.Size([1, 1, 768])
vit.embeddings.position_embeddings torch.Size([1, 197, 768])
vit.embeddings.patch_embeddings.projection.weight torch.Size([768, 3, 16, 16])
vit.embeddings.patch_embeddings.projection.bias torch.Size([768])
vit.encoder.layer.0.attention.attention.query.weight torch.Size([768, 768])
vit.encoder.layer.0.attention.attention.query.bias torch.Size([768])
vit.encoder.layer.0.attention.attention.key.weight torch.Size([768, 768])
vit.encoder.layer.0.attention.attention.key.bias torch.Size([768])
vit.encoder.layer.0.attention.attention.value.weight torch.Size([768, 768])
vit.encoder.layer.0.attention.attention.value.bias torch.Size([768])
vit.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
vit.encoder.layer.0.attention.output.dense.bias torch.Size([768])
vit.encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
vit.encoder.layer.0.intermediate.dense.bias torch.Size([3072])
vit.encoder.layer.0.output.dense.weight to

In [3]:
from vit_prisma.configs import HookedViTConfig
from vit_prisma.models.base_vit import HookedViT

# The bare string literal below was a statement with no effect; kept as a note.
# 'tinyclip'

# NOTE: in the recorded run this call raises
# "ValueError: Loading weights from the architecture is not currently
# supported: vit_clip_vision_encoder" (see the output of this cell).
prisma_model = HookedViT.from_pretrained(
    "wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M",
    is_timm=False,
    is_clip=True,
)

You are using a model of type clip to instantiate a model of type vit. This is not supported for all configurations of models and can yield errors.


hf_config CLIPVisionConfig {
  "architecture": "vit_clip_vision_encoder",
  "attention_dropout": 0.0,
  "dropout": 0.0,
  "hidden_act": "gelu",
  "hidden_size": 512,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 2048,
  "layer_norm_eps": 1e-05,
  "model_type": "clip_vision_model",
  "num_attention_heads": 8,
  "num_channels": 3,
  "num_classes": "n/a",
  "num_hidden_layers": 12,
  "patch_size": 32,
  "projection_dim": 512,
  "transformers_version": "4.37.2"
}



Some weights of ViTForImageClassification were not initialized from the model checkpoint at wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.cls_token', 'embeddings.patch_embeddings.projection.bias', 'embeddings.patch_embeddings.projection.weight', 'embeddings.position_embeddings', 'encoder.layer.0.attention.attention.key.bias', 'encoder.layer.0.attention.attention.key.weight', 'encoder.layer.0.attention.attention.query.bias', 'encoder.layer.0.attention.attention.query.weight', 'encoder.layer.0.attention.attention.value.bias', 'encoder.layer.0.attention.attention.value.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.layernorm_after.bias', 'encoder.layer.0.layernorm_after.weight', 'encoder.layer.0.layernorm_before.bias', 'encoder.layer.0.layernorm_before

ValueError: Loading weights from the architecture is not currently supported: vit_clip_vision_encoder, generated from model name . Feel free to open an issue on GitHub to request this feature.

In [None]:
# Import model
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

# model = CLIPModel.from_pretrained("wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M")
# processor = CLIPProcessor.from_pretrained("wkcn/TinyCLIP-ViT-40M-32-Text-19M-LAION400M")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1)

### Import CIFAR-100

The labels of CIFAR-100 have two levels of granularity: coarse-grained and fine-grained labels. 

We want to see the relationship between the coarse-grained and fine-grained labels inside the network. For example, perhaps the coarse-grained labels tend to be identified around Layer 5, while the fine-grained labels tend to be identified around Layer 6. Perhaps there is a semantic hierarchy reflected in a TinyCLIP circuit.