In [21]:
import torch
from transformers import AutoImageProcessor, CvtModel, CvtConfig
from PIL import Image

# Open the image from the file path
image = Image.open("/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/MetaDataPlusProprocessed/ExmaplerPic.png")
# Load a ViT model and image processor
model = CvtModel.from_pretrained('microsoft/cvt-13', output_hidden_states=True)
processor = AutoImageProcessor.from_pretrained('microsoft/cvt-13')

# Assuming image is loaded (e.g., using PIL or another method)
inputs = processor(images=image, return_tensors="pt")

# Forward pass through the model to get the hidden states
with torch.no_grad():
    outputs = model(**inputs)

# The model outputs a tuple where `outputs.hidden_states` contains hidden states
hidden_states = outputs.hidden_states

# Each hidden state corresponds to the output from one layer, including the embeddings
print(f"Number of layers: {len(hidden_states)}")  # Should be 13 (12 transformer layers + embedding layer)

# Shape of the hidden states: [batch_size, num_tokens, hidden_size]
# num_tokens = num_patches + 1 (because of the [CLS] token)
for i, hidden_state in enumerate(hidden_states):
    print(f"Layer {i} hidden state shape: {hidden_state.shape}")


Number of layers: 3
Layer 0 hidden state shape: torch.Size([1, 64, 56, 56])
Layer 1 hidden state shape: torch.Size([1, 192, 28, 28])
Layer 2 hidden state shape: torch.Size([1, 384, 14, 14])


In [31]:
image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = CvtModel.from_pretrained("microsoft/cvt-13")

inputs = image_processor(image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

[1, 384, 14, 14]

In [32]:
import torch.nn as nn
avg_pool = nn.AdaptiveAvgPool2d((1, 1)) 
cls_embedding = avg_pool(last_hidden_states).squeeze()

print(cls_embedding.shape)

torch.Size([384])


In [27]:
import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image

# Open the image from the file path
image = Image.open("/Users/jimmyhe/Desktop/KaggleCompetitions/ISISCANCER/MetaDataPlusProprocessed/ExmaplerPic.png")

# Load a ViT model and image processor
google_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
google = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224", output_hidden_states=True)  # Enable hidden states

# Assuming image is loaded (e.g., using PIL or another method)
inputs = google_processor(images=image, return_tensors="pt")

# Forward pass through the model to get the hidden states
with torch.no_grad():
    outputs = google(**inputs)

# Now the model outputs should contain hidden states since we enabled it
hidden_states = outputs.hidden_states

# Each hidden state corresponds to the output from one layer, including the embeddings
print(f"Number of layers: {len(hidden_states)}")  # Should be 13 (12 transformer layers + embedding layer)

# Shape of the hidden states: [batch_size, num_tokens, hidden_size]
# num_tokens = num_patches + 1 (because of the [CLS] token)
for i, hidden_state in enumerate(hidden_states):
    print(f"Layer {i} hidden state shape: {hidden_state.shape}")



Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Number of layers: 13
Layer 0 hidden state shape: torch.Size([1, 197, 768])
Layer 1 hidden state shape: torch.Size([1, 197, 768])
Layer 2 hidden state shape: torch.Size([1, 197, 768])
Layer 3 hidden state shape: torch.Size([1, 197, 768])
Layer 4 hidden state shape: torch.Size([1, 197, 768])
Layer 5 hidden state shape: torch.Size([1, 197, 768])
Layer 6 hidden state shape: torch.Size([1, 197, 768])
Layer 7 hidden state shape: torch.Size([1, 197, 768])
Layer 8 hidden state shape: torch.Size([1, 197, 768])
Layer 9 hidden state shape: torch.Size([1, 197, 768])
Layer 10 hidden state shape: torch.Size([1, 197, 768])
Layer 11 hidden state shape: torch.Size([1, 197, 768])
Layer 12 hidden state shape: torch.Size([1, 197, 768])


In [24]:
configuration = CvtConfig()


AttributeError: 'BaseModelOutputWithCLSToken' object has no attribute 'pooler_output'

In [28]:
cls_embedding = hidden_states[-1][:, 0, :]  # Shape: [batch_size, hidden_size]

# Mean pooling across all patch embeddings (excluding the CLS token)
patch_embeddings = hidden_states[-1][:, 1:, :]  # Exclude CLS token
mean_pooled_embedding = torch.mean(patch_embeddings, dim=1)  # Shape: [batch_size, hidden_size]

print("CLS token embedding shape:", cls_embedding.shape)
print("Mean pooled embedding shape:", mean_pooled_embedding.shape)

CLS token embedding shape: torch.Size([1, 768])
Mean pooled embedding shape: torch.Size([1, 768])


In [15]:
print(model)


CvtForImageClassification(
  (cvt): CvtModel(
    (encoder): CvtEncoder(
      (stages): ModuleList(
        (0): CvtStage(
          (embedding): CvtEmbeddings(
            (convolution_embeddings): CvtConvEmbeddings(
              (projection): Conv2d(3, 64, kernel_size=(7, 7), stride=(4, 4), padding=(2, 2))
              (normalization): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
            )
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (layers): Sequential(
            (0): CvtLayer(
              (attention): CvtAttention(
                (attention): CvtSelfAttention(
                  (convolution_projection_query): CvtSelfAttentionProjection(
                    (convolution_projection): CvtSelfAttentionConvProjection(
                      (convolution): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=64, bias=False)
                      (normalization): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, tra