In [2]:
import torch
class GELU(torch.nn.Module):
    def __init__(self):
        super(GELU, self).__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

In [5]:
import torch
import torch.nn as nn
import math

# Custom GELU activation function
class GELU(nn.Module):
    def __init__(self):
        super(GELU, self).__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

# Patch Embedding layer used in Vision Transformer
class PatchEmbed(nn.Module):
    def __init__(self, img_size=256, patch_size=8, in_chans=3, embed_dim=384):
        super().__init__()
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)
        x = x.flatten(2)
        x = x.transpose(1, 2)
        return x

# Attention mechanism used in Vision Transformer
class Attention(nn.Module):
    def __init__(self, dim, num_heads=12, qkv_bias=True, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)
        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

# MLP block used in Vision Transformer
class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

# Vision Transformer Block
class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0., act_layer=GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = nn.Identity()  # Replace with actual DropPath if needed
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

    def forward(self, x):
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x

# Defining the full Vision Transformer model
class VisionTransformer(nn.Module):
    def __init__(self, img_size=256, patch_size=8, in_chans=3, num_classes=1000, embed_dim=384, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0.):
        super().__init__()
        self.patch_embed = PatchEmbed(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.blocks = nn.ModuleList([Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, attn_drop=attn_drop_rate) for _ in range(depth)])
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Identity()  # Typically a classifier head here

    def forward(self, x):
        x = self.patch_embed(x)
        x = self.pos_drop(x)
        for block in self.blocks:
            x = block(x)
        x = self.norm(x)
        x = self.head(x)
        return x

# Create an instance of the model
model = VisionTransformer()

# Path to save the model
model_path = 'my_VisionTransformer.pth'
torch.save(model.state_dict(), model_path)

print(f"Model saved successfully at {model_path}")


Model saved successfully at my_VisionTransformer.pth


In [7]:
my_model=torch.load(model_path)

In [9]:
import numpy as np
import torch
import os
from vision_transformers.vision_transformers import VitGenerator
from vision_transformers.preprocess import Loader,visualize_predict


In [10]:
device=torch.device("cpu")

In [11]:
# set some variables
name_model = 'vit_small'
patch_size = 8

In [12]:
import torch
my_model=torch.load("my_model.pth")
list(my_model)

RuntimeError: my_model.pth is a zip archive (did you mean to use torch.jit.load()?)

In [13]:
model = VitGenerator(my_model, patch_size,
                     device, evaluate=True, random=False, verbose=True)

[INFO] Initializing OrderedDict([('patch_embed.proj.weight', tensor([[[[ 3.5189e-02,  4.7809e-02,  5.7065e-02,  ...,  1.4718e-02,
            5.3021e-02, -4.0045e-02],
          [-2.4822e-02, -1.6856e-03, -6.4882e-02,  ..., -7.1504e-02,
            5.4969e-02,  3.9102e-02],
          [-1.1931e-02,  1.6537e-02, -3.1542e-02,  ...,  2.1843e-02,
           -5.4999e-02,  5.2344e-02],
          ...,
          [-4.3831e-02,  5.8190e-02,  6.1529e-03,  ...,  2.6180e-02,
            3.5589e-02,  2.7540e-02],
          [ 5.3151e-02, -2.2938e-03,  5.6995e-02,  ..., -6.9736e-02,
           -6.6393e-03,  6.1052e-02],
          [ 6.7908e-02,  5.0293e-02, -8.6554e-04,  ..., -2.2702e-02,
            7.0935e-02, -5.2366e-02]],

         [[ 5.3263e-02, -6.6813e-02, -9.3752e-03,  ...,  1.7874e-03,
            6.9143e-02, -4.9685e-04],
          [-4.6935e-02,  5.1320e-02, -6.1335e-02,  ...,  5.7983e-02,
           -5.2848e-03,  6.9119e-03],
          [-4.5263e-02,  6.9534e-03,  2.3930e-03,  ...,  5.8226e-0

TypeError: exceptions must derive from BaseException

In [None]:
visualize_predict(model)

In [6]:
model._getModel()

[INFO] Initializing vit_small with patch size of 8


VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(8, 8), stride=(8, 8))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU()
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
  (head): Identity()
)

In [7]:
# model.save("model.pth")