In [1]:
# Google Drive + working directory path
from google.colab import drive
drive.mount('/content/drive')

PATH = '/content/drive/MyDrive/Vision24/'
VERSION = 'MobileNetV2'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

print(os.path.isdir(PATH))
assert os.path.isdir(PATH)

True


In [3]:
CHECKPOINT_PATH = os.path.join(PATH, VERSION)
os.makedirs(CHECKPOINT_PATH, exist_ok=True)


print(os.path.isdir(CHECKPOINT_PATH))
assert os.path.isdir(CHECKPOINT_PATH)

True


In [4]:
# verify >16GB of RAM
!nvidia-smi

Tue Nov 19 06:15:27 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   43C    P0              47W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
!pip install torch
!pip install zetascale
!pip install swarms
!pip install torchinfo

import torch
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from torch import nn, Tensor
from zeta.nn import SSM
from einops.layers.torch import Reduce

In [None]:
# Squeeze and Excitation block
from torch.nn import AdaptiveAvgPool1d, Linear, Sigmoid, ReLU

class MobileNetV2_DepthwiseConv(nn.Module):
    def __init__(self, in_channels, out_channels, expansion_factor=4):
        super(MobileNetV2_DepthwiseConv, self).__init__()

        # Calculate the number of expanded channels (for the bottleneck structure)
        self.expansion_channels = in_channels * expansion_factor

        # Step 1: Expansion Layer
        self.expand_conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=self.expansion_channels,
            kernel_size=1,
            bias=False
        )
        self.expand_bn = nn.BatchNorm1d(self.expansion_channels)  # Normalize the expanded channels
        self.expand_activation = nn.SiLU()  # Non-linear activation after expansion


        # Step 2: Depthwise Convolution
        self.depthwise_conv = nn.Conv1d(
            in_channels=self.expansion_channels,
            out_channels=self.expansion_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=self.expansion_channels,  # Depthwise convolution
            bias=False
        )
        self.depthwise_bn = nn.BatchNorm1d(self.expansion_channels)  # Normalize after depthwise convolution
        self.depthwise_activation = nn.SiLU()  # Non-linear activation


        # Step 3: Projection Layer
        self.project_conv = nn.Conv1d(
            in_channels=self.expansion_channels,
            out_channels=out_channels,
            kernel_size=1,
            bias=False
        )
        self.project_bn = nn.BatchNorm1d(out_channels)  # Normalize the projected output


    def forward(self, x):
        skip = x  # Save input for residual connection (if applicable)

        # Step 1: Expand input channels
        x = self.expand_conv(x)
        x = self.expand_bn(x)
        x = self.expand_activation(x)

        # Step 2: Apply depthwise convolution
        x = self.depthwise_conv(x)
        x = self.depthwise_bn(x)
        x = self.depthwise_activation(x)

        # Step 3: Project back to the original dimensionality
        x = self.project_conv(x)
        x = self.project_bn(x)

        # Add residual connection
        if skip.shape == x.shape:
            x += skip

        return x


In [7]:
def pair(t):
    return t if isinstance(t, tuple) else (t, t)


def output_head(dim: int, num_classes: int):
    """
    Creates a head for the output layer of a model.

    Args:
        dim (int): The input dimension of the head.
        num_classes (int): The number of output classes.

    Returns:
        nn.Sequential: The output head module.
    """
    return nn.Sequential(
        Reduce("b s d -> b d", "mean"),
        nn.LayerNorm(dim),
        nn.Linear(dim, num_classes),
    )


class VisionEncoderMambaBlock(nn.Module):
    def __init__(
        self,
        dim: int,
        dt_rank: int,
        dim_inner: int,
        d_state: int,
    ):
        super().__init__()
        self.dim = dim
        self.dt_rank = dt_rank
        self.dim_inner = dim_inner
        self.d_state = d_state

        # Depthwise separable convolutions
        self.forward_conv = MobileNetV2_DepthwiseConv(in_channels=dim, out_channels=dim)
        self.backward_conv = MobileNetV2_DepthwiseConv(in_channels=dim, out_channels=dim)

        self.norm = nn.LayerNorm(dim)
        self.silu = nn.SiLU()
        self.ssm = SSM(dim, dt_rank, dim_inner, d_state)

        self.proj = nn.Linear(dim, dim) # projection layer
        self.softplus = nn.Softplus()


    def forward(self, x: torch.Tensor):
        b, s, d = x.shape

        skip = x            # skip connection
        x = self.norm(x)
        z = self.silu(self.proj(x))   # project --> activation for gating
        x = self.proj(x)

        x1 = self.process_direction(x, self.forward_conv, self.ssm,)
        x2 = self.process_direction(x, self.backward_conv, self.ssm,)

        x1 *= z
        x2 *= z

        x = x1 + x2
        return x + skip

    def process_direction(
        self,
        x: Tensor,
        conv1d: nn.Conv1d,
        ssm: SSM,
    ):
        x = rearrange(x, "b s d -> b d s")
        x = self.softplus(conv1d(x))
        x = rearrange(x, "b d s -> b s d")
        x = ssm(x)
        return x


class Vim(nn.Module):
    def __init__(
        self,
        dim: int,
        dt_rank: int = 32,
        dim_inner: int = None,
        d_state: int = None,
        num_classes: int = None,
        image_size: int = 224,
        patch_size: int = 16,
        channels: int = 3,
        dropout: float = 0.1,
        depth: int = 12,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.dim = dim
        self.dt_rank = dt_rank
        self.dim_inner = dim_inner
        self.d_state = d_state
        self.num_classes = num_classes
        self.image_size = image_size
        self.patch_size = patch_size
        self.channels = channels
        self.dropout = dropout
        self.depth = depth

        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)
        patch_dim = channels * patch_height * patch_width

        self.to_patch_embedding = nn.Sequential(
            Rearrange(
                "b c (h p1) (w p2) -> b (h w) (p1 p2 c)",
                p1=patch_height,
                p2=patch_height,
            ),
            nn.Linear(patch_dim, dim),
        )

        self.dropout = nn.Dropout(dropout)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.to_latent = nn.Identity()
        self.layers = nn.ModuleList()        # encoder layers
        for _ in range(depth):
            self.layers.append(
                VisionEncoderMambaBlock(
                    dim=dim,
                    dt_rank=dt_rank,
                    dim_inner=dim_inner,
                    d_state=d_state,
                    *args,
                    **kwargs,
                )
            )
        self.output_head = output_head(dim, num_classes)

    def forward(self, x: Tensor):
        b, c, h, w = x.shape
        x = self.to_patch_embedding(x)
        b, n, _ = x.shape
        cls_tokens = repeat(self.cls_token, "() n d -> b n d", b=b)
        # x = torch.cat((cls_tokens, x), dim=1)
        x = self.dropout(x)
        for layer in self.layers:
            x = layer(x)

        x = self.to_latent(x)
        # x = reduce(x, "b s d -> b d", "mean")
        return self.output_head(x)

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from einops import repeat

model = Vim(dim=96,
            dt_rank=16,
            dim_inner=96,
            d_state=96,
            num_classes=10,
            image_size=32,
            patch_size=4,
            channels=3,
            dropout=0.1,
            depth=10,)

batch_size = 128
epochs = 100

transform = transforms.Compose([
    # transforms.Resize([224,224]),
    transforms.ToTensor(),
    transforms.Normalize((0.5,0.5,0.5),(0.5,0.5,0.5))
])

training_data = torchvision.datasets.CIFAR10(root = './dataa',train=True,download=True,transform=transform)
testing_data = torchvision.datasets.CIFAR10(root = './data',train=False,download=True,transform=transform)

training_loader = DataLoader(training_data, batch_size=batch_size, shuffle=True, num_workers=4)
testing_loader = DataLoader(testing_data, batch_size=batch_size, shuffle=True, num_workers=4)

Files already downloaded and verified
Files already downloaded and verified


In [9]:
print('epochs: ', epochs)
print('Training data: ', len(training_loader))
print('Total steps: ', epochs*len(training_loader))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

epochs:  100
Training data:  391
Total steps:  39100
Device: cuda


In [10]:
import time
import pandas as pd
import numpy as np

_start = time.time()
print(_start)

_training_losses = []
__n = 100

1731996936.9581318


In [12]:
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

accumulation_steps = 2  # Simulates batch_size = 256 with batch_size = 128

for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    epoch_loss = 0.0

    optimizer.zero_grad()  # Ensure gradients are zeroed before accumulation
    for i, (inputs, labels) in enumerate(training_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Normalize loss for accumulation
        loss = loss / accumulation_steps
        loss.backward()

        # Update weights every `accumulation_steps` iterations
        if (i + 1) % accumulation_steps == 0 or (i + 1) == len(training_loader):
            optimizer.step()
            optimizer.zero_grad()  # Reset gradients after step

        running_loss += loss.item() * accumulation_steps  # Re-scale loss back
        epoch_loss += loss.item() * accumulation_steps  # Re-scale loss back

        if (i + 1) % __n == 0:  # For monitoring intermediate steps
            print(f"Epoch [{epoch + 1}/{epochs}], Step [{i + 1}/{len(training_loader)}], "
                  f"Loss: {running_loss / __n:.4f}")
            running_loss = 0.0


    epoch_loss = epoch_loss / len(training_loader)
    print(f"Epoch {epoch + 1} finished. Loss: {epoch_loss}")
    _training_losses.append(epoch_loss)


Epoch [1/100], Step [100/391], Loss: 1.9594
Epoch [1/100], Step [200/391], Loss: 1.7235
Epoch [1/100], Step [300/391], Loss: 1.6260
Epoch 1 finished. Loss: 1.7170702795238446
Epoch [2/100], Step [100/391], Loss: 1.4823
Epoch [2/100], Step [200/391], Loss: 1.3970
Epoch [2/100], Step [300/391], Loss: 1.3747
Epoch 2 finished. Loss: 1.3978873533970864
Epoch [3/100], Step [100/391], Loss: 1.2691
Epoch [3/100], Step [200/391], Loss: 1.2665
Epoch [3/100], Step [300/391], Loss: 1.2338
Epoch 3 finished. Loss: 1.247247184176579
Epoch [4/100], Step [100/391], Loss: 1.1692
Epoch [4/100], Step [200/391], Loss: 1.1555
Epoch [4/100], Step [300/391], Loss: 1.1428
Epoch 4 finished. Loss: 1.1573464629595236
Epoch [5/100], Step [100/391], Loss: 1.0822
Epoch [5/100], Step [200/391], Loss: 1.0968
Epoch [5/100], Step [300/391], Loss: 1.0845
Epoch 5 finished. Loss: 1.0835438567354245
Epoch [6/100], Step [100/391], Loss: 1.0093
Epoch [6/100], Step [200/391], Loss: 1.0142
Epoch [6/100], Step [300/391], Loss: 1

In [13]:
_end = time.time()
torch.save(model.state_dict(), os.path.join(CHECKPOINT_PATH, f'final_checkpoint.pt'))

In [14]:
print(_end)

with open(os.path.join(CHECKPOINT_PATH, 'trainning_time.txt'), 'w') as f:
  f.write(f"Start Time: {_start}\nEnd Time:{_end}\nTotal Time:{_end - _start}")


DF = pd.DataFrame(np.array(_training_losses))
DF.to_csv(os.path.join(CHECKPOINT_PATH, 'losses.csv'))


1732013514.0293493


In [15]:

from torchinfo import summary
summary(model, input_size=(batch_size, 3, 32, 32))

Layer (type:depth-idx)                             Output Shape              Param #
Vim                                                [128, 10]                 96
├─Sequential: 1-1                                  [128, 64, 96]             --
│    └─Rearrange: 2-1                              [128, 64, 48]             --
│    └─Linear: 2-2                                 [128, 64, 96]             4,704
├─Dropout: 1-2                                     [128, 64, 96]             --
├─ModuleList: 1-3                                  --                        --
│    └─VisionEncoderMambaBlock: 2-3                [128, 64, 96]             --
│    │    └─LayerNorm: 3-1                         [128, 64, 96]             192
│    │    └─Linear: 3-2                            [128, 64, 96]             9,312
│    │    └─SiLU: 3-3                              [128, 64, 96]             --
│    │    └─Linear: 3-4                            [128, 64, 96]             (recursive)
│    │    └─MobileN

In [16]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs,labels in testing_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f"accuracy of the model test images: {(100 * correct / total):.2f}%")

accuracy of the model test images: 68.27%


In [17]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  41346 KiB |  20916 MiB |   5628 TiB |   5628 TiB |
|       from large pool |  18176 KiB |  20883 MiB |   5626 TiB |   5626 TiB |
|       from small pool |  23170 KiB |     38 MiB |      1 TiB |      1 TiB |
|---------------------------------------------------------------------------|
| Active memory         |  41346 KiB |  20916 MiB |   5628 TiB |   5628 TiB |
|       from large pool |  18176 KiB |  20883 MiB |   5626 TiB |   5626 TiB |
|       from small pool |  23170 KiB |     38 MiB |      1 TiB |      1 TiB |
|---------------------------------------------------------------