In [10]:

import torch
import torch.nn as nn

class FeedForwardNetwork(nn.Module):
    """Two-layer MLP applied to the last dimension: Linear -> ReLU -> Linear.

    Args:
        embed_dim: Size of the last dimension of both the input and the output.
        ff_dim: Width of the hidden layer between the two projections.
    """

    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        # Submodules keep their names and registration order (fc1, relu, fc2),
        # so parameter names and initialisation order stay the same.
        self.fc1 = nn.Linear(embed_dim, ff_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(ff_dim, embed_dim)

    def forward(self, x):
        """Expand to ``ff_dim``, apply ReLU, then project back to ``embed_dim``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


# Example usage: push one random batch through the feed-forward block.
embed_dim = 128
ff_dim = 512
ffn = FeedForwardNetwork(embed_dim, ff_dim)

# Example input tensor of shape (batch_size, seq_length, embed_dim)
x = torch.randn(32, 10, embed_dim)  # Batch of 32 samples, each with a sequence length of 10

# Pass the input through the feed-forward network
output = ffn(x)

# The block maps embed_dim -> ff_dim -> embed_dim, so the shape must be preserved.
assert output.shape == x.shape
print(output.shape)  # Should be (32, 10, embed_dim)

# Start of the next cell's import block; this statement had been fused onto
# the end of the comment on the print line above.
import torch
import torch.nn as nn
import numpy as np

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention for batch-first input.

    A single linear layer produces queries, keys and values; attention is
    computed independently per head and the heads are concatenated before a
    final output projection.

    Args:
        embed_dim: Size of the last input dimension; also the output size.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        assert self.head_dim * num_heads == embed_dim, "Embedding dimension must be divisible by number of heads"

        # One fused projection for q, k and v, split apart again in forward().
        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.out = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor unpacked as ``(N, L, E)``. ``E`` has to equal
                ``embed_dim`` for the projection and reshape to succeed.

        Returns:
            Tensor of shape ``(N, L, E)``.
        """
        N, L, E = x.shape

        qkv = self.qkv(x)  # (N, L, 3*E)
        qkv = qkv.reshape(N, L, 3, self.num_heads, self.head_dim)  # (N, L, 3, num_heads, head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, N, num_heads, L, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]  # each (N, num_heads, L, head_dim)

        # Scale by sqrt(head_dim) using a plain Python float so the forward
        # pass stays pure torch (no NumPy scalar mixed into tensor arithmetic).
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)  # (N, num_heads, L, L)
        attn_weights = torch.nn.functional.softmax(scores, dim=-1)

        out = torch.matmul(attn_weights, v)  # (N, num_heads, L, head_dim)
        out = out.transpose(1, 2).reshape(N, L, E)  # concatenate heads -> (N, L, E)
        return self.out(out)
import torch
import torch.nn as nn
import numpy as np

class SimpleTransformerLayer(nn.Module):
    """Encoder block with two residual sub-layers, each followed by LayerNorm.

    The first sub-layer is multi-head self-attention, the second a
    feed-forward network. Normalisation is applied after each residual sum.

    Args:
        embed_dim: Size of the last dimension of the input and output.
        num_heads: Number of attention heads passed to the attention module.
        ff_dim: Hidden width passed to the feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attention = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForwardNetwork(embed_dim, ff_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward sub-layers."""
        # Residual around self-attention, then normalise.
        attended = self.norm1(x + self.attention(x))
        # Residual around the feed-forward network, then normalise.
        return self.norm2(attended + self.ffn(attended))

# Instantiate the transformer layer.
# NOTE: this cell rebinds the module-level names embed_dim, ff_dim, x and
# output that the feed-forward example earlier in the notebook also assigns.
embed_dim = 128
num_heads = 8  # 128 / 8 = 16 dimensions per head
ff_dim = 512
transformer_layer = SimpleTransformerLayer(embed_dim, num_heads, ff_dim)

# Example input (batch size N, sequence length L, embedding dimension E)
x = torch.randn(32, 10, embed_dim)  # Batch of 32, sequence length of 10, embedding dim of 128

# Pass the input through the transformer layer
output = transformer_layer(x)

print(output.shape)  # Should be (32, 10, 128)

import torch
import torch.nn as nn
import torch.nn.functional as F

class VisionTransformer(nn.Module):
    """Minimal Vision Transformer classifier.

    The image is cut into non-overlapping square patches by a strided
    convolution, a learnable classification token is prepended, learnable
    positional embeddings are added, and the sequence is passed through a
    ``nn.TransformerEncoder``. The encoder output at the classification-token
    position is fed to a linear head.

    Args:
        img_size: Height/width of the (square) input images, used to size the
            positional embedding.
        patch_size: Side length of each square patch (conv kernel and stride).
        embed_dim: Embedding size of every token.
        num_heads: Number of attention heads in each encoder layer.
        ff_dim: Hidden width of the feed-forward part of each encoder layer.
        num_classes: Number of output logits.
        in_channels: Number of input image channels. Defaults to 3, the
            value that was previously hard-coded.
        num_layers: Number of stacked encoder layers. Defaults to 1, the
            value that was previously hard-coded.
    """

    def __init__(self, img_size, patch_size, embed_dim, num_heads, ff_dim, num_classes,
                 in_channels=3, num_layers=1):
        super().__init__()

        self.patch_size = patch_size
        self.embed_dim = embed_dim

        # Create patches and project them into embedding space: kernel == stride
        # means each output position sees exactly one non-overlapping patch.
        self.patch_embedding = nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

        # Classification token, shared across the batch and expanded in forward().
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))

        # Positional embeddings: one row per patch plus one for the cls token.
        num_patches = (img_size // patch_size) ** 2
        self.positional_embedding = nn.Parameter(torch.randn(num_patches + 1, embed_dim))

        # Transformer encoder layers
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Output classification head
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Image batch of shape ``(N, in_channels, H, W)``. The number of
                patches produced from ``H`` and ``W`` has to match the
                positional embedding sized from ``img_size`` at construction.

        Returns:
            Logits of shape ``(N, num_classes)``.
        """
        # Extract patches and project them to embedding space
        x = self.patch_embedding(x)  # Shape: (N, embed_dim, H', W')
        x = x.flatten(2)  # Shape: (N, embed_dim, num_patches)
        x = x.transpose(1, 2)  # Shape: (N, num_patches, embed_dim)

        # Prepend the cls token; expand() repeats it along the batch dimension.
        batch_size = x.size(0)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # Shape: (N, 1, embed_dim)
        x = torch.cat([cls_tokens, x], dim=1)  # Shape: (N, num_patches + 1, embed_dim)

        # Add positional embeddings; the (num_patches + 1, embed_dim) parameter
        # broadcasts over the batch dimension.
        x = x + self.positional_embedding  # Shape: (N, num_patches + 1, embed_dim)

        # The encoder layer is built without batch_first, so it expects
        # (sequence_length, batch_size, embed_dim).
        x = x.transpose(0, 1)  # Shape: (num_patches + 1, N, embed_dim)

        # Pass through transformer encoder
        x = self.transformer_encoder(x)  # Shape: (num_patches + 1, N, embed_dim)

        # Back to batch-first and take the cls token position (index 0,
        # because it was concatenated first).
        x = x.transpose(0, 1)  # Shape: (N, num_patches + 1, embed_dim)
        cls_output = x[:, 0]  # Shape: (N, embed_dim)

        # Classification head
        logits = self.fc(cls_output)  # Shape: (N, num_classes)
        return logits

# Example usage: classify a random batch with the Vision Transformer.
# NOTE: rebinds embed_dim, num_heads, ff_dim and x from the earlier cells.
img_size = 32
patch_size = 8  # (32 // 8) ** 2 = 16 patches, plus the cls token = 17 tokens
embed_dim = 128
num_heads = 8
ff_dim = 512
num_classes = 10

# Create the Vision Transformer model
model = VisionTransformer(img_size, patch_size, embed_dim, num_heads, ff_dim, num_classes)

# Example input tensor: batch of 32x32 RGB images
x = torch.randn(8, 3, img_size, img_size)  # Batch size of 8

# Forward pass
logits = model(x)

print(logits.shape)  # Should be (8, num_classes), i.e., (8, 10)



import torch
import torch.nn as nn

# Define parameters
# NOTE: rebinds embed_dim, num_heads, ff_dim and output from the earlier cells.
embed_dim = 128
num_heads = 8
num_layers = 6
ff_dim = 512
batch_size = 10
seq_len_tgt = 20  # Length of the target sequence
seq_len_mem = 15  # Length of the memory sequence

# Create a single TransformerDecoderLayer (used as the template for the stack)
decoder_layer = nn.TransformerDecoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim)

# Create a TransformerDecoder with multiple layers
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

# Example memory and target tensors (sequence-first layout; batch is dim 1)
# Memory tensor: shape should be (S, N, E) where S is the source sequence length
memory = torch.randn(seq_len_mem, batch_size, embed_dim)  # (S, N, E)

# Target tensor: shape should be (T, N, E) where T is the target sequence length
tgt = torch.randn(seq_len_tgt, batch_size, embed_dim)     # (T, N, E)

# Pass tensors through the TransformerDecoder
output = transformer_decoder(tgt, memory)

print(output.shape)  # Output shape should be (T, N, E)

torch.Size([32, 10, 128])
torch.Size([32, 10, 128])
torch.Size([8, 10])
torch.Size([20, 10, 128])


In [15]:
# Toy stand-in for a classification token: a single random (1, 1, 4) tensor.
cls_t = torch.randn(1,1,4)

In [17]:
cls_t

tensor([[[-0.2666,  1.0577, -0.3493,  0.0475]]])

In [18]:
# Target size for dim 0 in the expand() calls below.
N=3

In [22]:
# Rebinds cls_t to the expanded tensor. -1 keeps a dimension's size, so
# (1, 1, 4) becomes (N, 1, 4) with the same row repeated (see output below).
cls_t =cls_t.expand(N, -1,-1)

In [23]:
cls_t

tensor([[[-0.2666,  1.0577, -0.3493,  0.0475]],

        [[-0.2666,  1.0577, -0.3493,  0.0475]],

        [[-0.2666,  1.0577, -0.3493,  0.0475]]])

In [24]:
cls_t.shape

torch.Size([3, 1, 4])

In [38]:
# Second toy tensor, shape (1, 1, 3), for expanding more than one dimension.
b= torch.randn(1,1,3)

In [43]:
# Both size-1 dims are expanded (dim 0 -> N, dim 1 -> 2); -1 keeps the last
# dim, giving the (3, 2, 3) tensor of repeated rows shown below. b itself is
# not rebound here.
b.expand(N, 2, -1)

tensor([[[0.2983, 0.5095, 0.1856],
         [0.2983, 0.5095, 0.1856]],

        [[0.2983, 0.5095, 0.1856],
         [0.2983, 0.5095, 0.1856]],

        [[0.2983, 0.5095, 0.1856],
         [0.2983, 0.5095, 0.1856]]])