In [10]:
#SFCNN WITHOUT EXPANDING

import torch
import torch.nn as nn
import torch.nn.functional as F

class GSiLU(nn.Module):
    """Global Sigmoid Linear Unit proposed in the paper
    Returns x * sigmoid(global average pooling of x)
    """
    def __init__(self):
        super(GSiLU, self).__init__()
        self.gap = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        gap = self.gap(x)
        return x * torch.sigmoid(gap)


class DWCONV(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super(DWCONV, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=in_channels, bias=False)


activation = nn.SiLU()
class SFCNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1, downsample=False):
        super(SFCNNBlock, self).__init__()

        self.downsample = downsample #used in Type 2 with downsample
        if downsample : stride=2

        #1 Applying 3x3 DWConv (groups parameter separates channels, performing depthwise convolution)
        self.dwconv1 = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=stride, padding=1, groups=in_channels, bias=False)
        #2 Pass through PWConv and SiLU
        self.pwconv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.silu = nn.SiLU()
        #3 Applying 3x3 DWConv
        self.dwconv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, groups=out_channels, bias=False)
        #4 Pass through GSILU
        self.gsilu = activation
        #5 Pass through PWConv
        self.pwconv2 = nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)

        #7 Passing x through these at the end
        if self.downsample:
            self.layernorm = nn.LayerNorm(in_channels)
            self.downsample_conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
            self.downsample_pw = nn.Conv2d(out_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)

    def forward(self, x):
        input = x
        out = x

        if self.downsample:
            B, C, H, W = out.size()
            out = out.permute(0, 2, 3, 1)  # B, H, W, C (need to put C in last pos for LN)
            out = self.layernorm(out) # apply layernorm in the beginning
            # print("LayerNorm finished")
            out = out.permute(0, 3, 1, 2) # back to original shape

            #7 If downsample is True, apply 3x3 DWConv and PWConv to input x
            input_downsampled = self.downsample_conv(x)
            input_downsampled = self.downsample_pw(input_downsampled)

        out = self.dwconv1(input)
        # print("DWConv1 finished")
        out = self.pwconv1(out)
        # print("PWConv1 finished")
        out = self.silu(out)

        out = self.dwconv2(out)
        # print("DWConv2 finished")
        out = self.gsilu(out)
        out = self.pwconv2(out) # Output of step 5
        # print("PWConv2 finished")


        #6 Input of step 1 and output of step 5 are added
        # print("Input shape: ", input.shape)
        # print("Output shape: ", out.shape)
        out += input_downsampled if self.downsample else input
        return out


class SFCNN(nn.Module):
    def __init__(self, num_classes=1000, block_numbers=[4, 8, 20, 4], channels=[48, 96, 192, 384]):
        super(SFCNN, self).__init__()

        self.stem = nn.Conv2d(3, channels[0], kernel_size=3, stride=2, padding=1, bias=False)

        self.stage1 = self.make_stage(block_numbers[0], channels[0], channels[1], stride=2)
        self.stage2 = self.make_stage(block_numbers[1], channels[1], channels[2], stride=2)
        self.stage3 = self.make_stage(block_numbers[2], channels[2], channels[3], stride=2)
        self.stage4 = self.make_stage(block_numbers[3], channels[3], channels[3], stride=1)

        self.last_conv = nn.Conv2d(channels[3], 1024, kernel_size=1, stride=1, padding=0, bias=False)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Sequential(
            nn.Linear(1024, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, num_classes)
        )

    def make_stage(self, block_nb, in_channels, out_channels, stride):
        layers = []
        layers.append(SFCNNBlock(in_channels, out_channels, stride=stride, downsample=True))
        for _ in range(1, block_nb):
            layers.append(SFCNNBlock(out_channels, out_channels, stride=1))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.stem(x)
        # print("Stem finished")
        x = self.stage1(x)
        # print("Stage 1 finished")
        x = self.stage2(x)
        # print("Stage 2 finished")
        x = self.stage3(x)
        # print("Stage 3 finished")
        x = self.stage4(x)
        # print("Stage 4 finished")

        x = self.last_conv(x)
        # print("Last Conv finished")

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)

        return x

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter
# Hyperparamètres
batch_size = 128
num_epochs = 100
learning_rate = 0.001
weight_decay = 0.05
warmup_epochs = 5

# Preprocessing and augmentation
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2470, 0.2435, 0.2616]),  # CIFAR stats
])

val_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.4914, 0.4822, 0.4465], std=[0.2470, 0.2435, 0.2616]),
])

#loading the dataset
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transform)
val_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=val_transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

# Init the model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_classes = 10 #100 for cifar100

Files already downloaded and verified
Files already downloaded and verified




In [None]:
writer = SummaryWriter("/content/drive/MyDrive/runs/sfcnn_pico_cifar10_gsilu")

print("Creating SFCNN-P_silu")
channels = [32*2**i for i in range(4)]
model = SFCNN(num_classes=num_classes, block_numbers=[3, 4, 12, 3], channels=channels).to(device)
print("Executing SFCNN-P")
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs)

# Warmup scheduler
def warmup_scheduler(epoch, warmup_epochs, optimizer):
    if epoch < warmup_epochs:
        lr = learning_rate * (epoch + 1) / warmup_epochs
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    warmup_scheduler(epoch, warmup_epochs, optimizer)

    for i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        #print("starting forward pass")
        outputs = model(inputs)
        #print("outputs calculated")
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        # if i % 100 == 0:
        #     print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}, Acc: {100.*correct/total:.2f}%')
    train_loss = running_loss / len(train_loader)
    train_acc = 100. * correct / total
    writer.add_scalar('Loss/Train', train_loss, epoch)
    writer.add_scalar('Accuracy/Train', train_acc, epoch)
    writer.add_scalar('Learning Rate', optimizer.param_groups[0]['lr'], epoch)

    if epoch >= warmup_epochs:
        scheduler.step()

    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    val_loss /= len(val_loader)
    val_acc = 100. * val_correct / val_total
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {val_loss:.4f}, Validation Acc: {val_acc:.2f}%')
    writer.add_scalar('Loss/Validation', val_loss, epoch)
    writer.add_scalar('Accuracy/Validation', val_acc, epoch)

# Save the final model
#torch.save(model.state_dict(), 'models/sfcnn_tiny_cifar10_run2.pth')
writer.close()



Creating SFCNN-P_silu
Executing SFCNN-P
Epoch [1/100], Validation Loss: 1.8120, Validation Acc: 33.94%
Epoch [2/100], Validation Loss: 1.7185, Validation Acc: 38.48%
Epoch [3/100], Validation Loss: 1.6807, Validation Acc: 38.68%
Epoch [4/100], Validation Loss: 1.6015, Validation Acc: 41.71%
Epoch [5/100], Validation Loss: 1.5961, Validation Acc: 42.09%
Epoch [6/100], Validation Loss: 1.5327, Validation Acc: 43.83%
Epoch [7/100], Validation Loss: 1.4450, Validation Acc: 46.32%
Epoch [8/100], Validation Loss: 1.4794, Validation Acc: 47.03%
