In [1]:
import torch
import numpy as np
# Change path
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
os.chdir(os.path.dirname(os.getcwd()))
# # Get the notebook directory
# notebook_dir = os.path.dirname(os.getcwd())
# print(notebook_dir)
# sys.path.append("/Users/pradyut.nair/Documents/University/DL2/spai/spai")
# os.chdir("/Users/pradyut.nair/Documents/University/DL2/spai/spai")
# Go to build.py file
from spai.models.build import build_cls_model  # Change this import
from spai.config import get_config
import torchvision.transforms as transforms

# Create a random input tensor with shape (batch_size, channels, height, width)
batch_size = 2
img_size = 224
input_tensor = torch.randn(batch_size, 3, img_size, img_size)
print(f"Input tensor shape: {input_tensor.shape}")

# Normalize input to [0, 1] range
input_tensor = torch.clamp(input_tensor, min=0., max=1.)

# Load config and create model with semantic fusion enabled
print("Loading configuration...")
config = get_config({"cfg": "configs/spai.yaml"})
config.defrost()
config.MODEL.FRE.USE_SEMANTIC_FUSION = True
config.MODEL_WEIGHTS = "clip"  # Use CLIP backbone for semantic features
config.freeze()

print("\nCreating model...")
model = build_cls_model(config)  # Use build_cls_model instead of build_model

# Print model architecture
print("\nModel Architecture:")
print(model)

# Print model parameters
print("\nModel Parameters:")
print(model.parameters())



Input tensor shape: torch.Size([2, 3, 224, 224])
Loading configuration...
=> merge config from configs/spai.yaml

Creating model...

Model Architecture:
PatchBasedMFViT(
  (mfvit): MFViT(
    (vit): CLIPBackbone(
      (clip): CLIP(
        (visual): VisionTransformer(
          (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (transformer): Transformer(
            (resblocks): Sequential(
              (0): ResidualAttentionBlock(
                (attn): MultiheadAttention(
                  (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
                )
                (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
                (mlp): Sequential(
                  (c_fc): Linear(in_features=768, out_features=3072, bias=True)
                  (gelu): QuickGELU()
                  (c_proj): Linear(in_features=3072,

In [2]:
x = torch.randn((batch_size, 3, 224, 224))
model.eval()
with torch.no_grad():
    out = model(x)

print(f"Output shape (fixed resolution): {out.shape}")
assert out.shape == (batch_size, 1), f"Unexpected output shape: {out.shape}"


Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shape: torch.Size([2, 196, 768])
Projector input shap

In [3]:
# ... existing code ...
print(f"Output shape (fixed resolution): {out.shape}")
assert out.shape == (batch_size, 1), f"Unexpected output shape: {out.shape}"

# Print semantic fusion parameters
print("\nSemantic Fusion Parameters:")
semantic_fusion = model.mfvit.features_processor.semantic_fusion
for name, param in semantic_fusion.named_parameters():
    print(f"\n{name}:")
    print(f"Shape: {param.shape}")
    print(f"Values: {param.data}")

Output shape (fixed resolution): torch.Size([2, 1])

Semantic Fusion Parameters:

gate.0.weight:
Shape: torch.Size([1024, 1536])
Values: tensor([[ 0.0107,  0.0163, -0.0022,  ...,  0.0003,  0.0308, -0.0242],
        [-0.0134, -0.0118, -0.0066,  ...,  0.0078,  0.0050, -0.0368],
        [ 0.0347,  0.0403,  0.0004,  ..., -0.0207,  0.0086,  0.0022],
        ...,
        [-0.0092, -0.0209, -0.0061,  ..., -0.0088, -0.0093, -0.0051],
        [ 0.0094,  0.0019,  0.0273,  ..., -0.0093, -0.0251, -0.0036],
        [-0.0097,  0.0006, -0.0102,  ...,  0.0085, -0.0444, -0.0202]])

gate.0.bias:
Shape: torch.Size([1024])
Values: tensor([0., 0., 0.,  ..., 0., 0., 0.])
