In [2]:
import torch

| Parameter               | Meaning                                                     |
| ----------------------- | ----------------------------------------------------------- |
| `expansion_ratio` (`t`) | Multiplier for expanding input channels (used in MBConv)    |
| `output_channels` (`c`) | Output channels after projection                            |
| `num_repeats` (`r`)     | How many times to repeat this block                         |
| `stride` (`s`)          | Stride of the first block in this stage (usually 1 or 2)    |
| `kernel_size` (`k`)     | Size of the depthwise convolution filter (e.g., 3x3 or 5x5) |


In [3]:

from torch import nn

# Baseline stage configuration, before any depth/width scaling is applied.
# One row per stage, in this column order:
#   expansion_ratio (t), output_channels (c), num_repeats (r), stride (s), kernel_size (k)
basic_mb_params = [
    [1, 16, 1, 1, 3],
    [6, 24, 2, 2, 3],
    [6, 40, 2, 2, 5],
    [6, 80, 3, 2, 3],
    [6, 112, 3, 1, 5],
    [6, 192, 4, 2, 5],
    [6, 320, 1, 1, 3],
]

# Compound-scaling bases; EfficientNet raises each of them to the power phi.
alpha = 1.2  # depth scaling
beta = 1.1   # width scaling

# Per-variant settings: model name -> (phi, resolution, dropout)
scale_values = dict(
    b0=(0, 224, 0.2),
    b1=(0.5, 240, 0.2),
    b2=(1, 260, 0.3),
    b3=(2, 300, 0.3),
    b4=(3, 380, 0.4),
    b5=(4, 456, 0.4),
    b6=(5, 528, 0.5),
    b7=(6, 600, 0.5),
)

| Activation | Best For                    | Behavior Near 0 | Complexity | Key Trait                   |
| ---------- | --------------------------- | --------------- | ---------- | --------------------------- |
| **ReLU**   | General deep learning       | Piecewise linear (kink at 0) | Low        | Fast and simple             |
| **SiLU**   | CNNs (e.g., EfficientNet)   | Smooth, soft    | Moderate   | Boosts performance slightly |
| **GELU**   | NLP, Transformers           | Smooth, noisy   | High       | Better at handling noise    |
| **SinLU**  | Oscillatory or experimental | Oscillatory     | Moderate   | Periodic behavior           |


| Part                  | Purpose                                                            |
| --------------------- | ------------------------------------------------------------------ |
| `nn.Conv2d(...)`      | Performs convolution to extract features (edges, textures, etc.)   |
| `nn.BatchNorm2d(...)` | Normalizes output of Conv2D for stability and faster convergence   |
| `nn.SiLU()`           | Smooth non-linear activation function: $x \cdot \text{sigmoid}(x)$ |


Standard Conv:
Input (3 channels) → 64 filters (3x3x3) → Output (64 channels)

Depthwise Separable Conv:
1. Depthwise Conv (3x3 on each input channel separately) → Output (3 channels)
2. Pointwise Conv (64 filters of size 1x1x3 to mix channels) → Output (64 channels)


In [7]:
class ConvBlock(nn.Module):
  """Convolution followed by batch normalisation and a SiLU activation.

  Args:
    in_channels: number of channels of the incoming tensor.
    out_channels: number of channels produced by the convolution.
    kernel_size: forwarded to ``nn.Conv2d``.
    stride: forwarded to ``nn.Conv2d``.
    padding: forwarded to ``nn.Conv2d``.
    groups: forwarded to ``nn.Conv2d`` (defaults to 1).
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups=1):
    super().__init__()
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        groups=groups,
    )
    norm = nn.BatchNorm2d(out_channels)
    activation = nn.SiLU()
    # The attribute is still called `cnnblock`, so state_dict keys stay the same.
    self.cnnblock = nn.Sequential(conv, norm, activation)

  def forward(self, x):
    """Apply conv -> batch norm -> SiLU to ``x`` and return the result."""
    return self.cnnblock(x)


| Parameter      | Meaning                                                                      |
| -------------- | ---------------------------------------------------------------------------- |
| `in_channels`  | Number of channels in the input tensor                                       |
| `out_channels` | Channels after projecting features                                           |
| `kernel_size`  | Size of filter in depthwise convolution                                      |
| `stride`       | Controls downsampling (usually 1 or 2)                                       |
| `padding`      | Keeps spatial size same after conv                                           |
| `ratio`        | **Expansion ratio**: determines if input gets expanded before depthwise conv |
| `reduction`    | Factor to reduce channels in **Squeeze-and-Excitation (SE)**                 |


| Stage           | Operation             | Purpose                                     |
| --------------- | --------------------- | ------------------------------------------- |
| Optional        | `expand_conv`         | Increase channel dimension (if needed)      |
| Depthwise Conv  | `groups=hidden_dim`   | Filter spatial features per channel         |
| SE Block        | `SqueezeExcitation`   | Focus on important channels                 |
| Pointwise Conv  | `1x1 conv`            | Mix channel info, reduce to output channels |
| BN & Activation | `BatchNorm2d`, `SiLU` | Normalize & activate features               |


In [8]:
class MBBlock(nn.Module):
  """Inverted-bottleneck block: optional expansion -> depthwise conv -> SE -> 1x1 projection -> BN.

  Args:
    in_channels: number of channels of the incoming tensor.
    out_channels: number of channels after the 1x1 projection.
    kernel_size: kernel size of the depthwise convolution.
    stride: stride of the depthwise convolution.
    padding: padding of the depthwise convolution.
    ratio: expansion ratio; ``hidden_dim = in_channels * ratio``. When the
      result equals ``in_channels`` (ratio 1) the expansion conv is skipped.
    reduction: divisor applied to ``in_channels`` to get the width of the
      squeeze-and-excitation bottleneck.

  NOTE(review): the expansion conv here is 3x3 and the block has no residual
  connection. The usual MBConv design uses a 1x1 expansion and adds a skip
  connection when stride == 1 and in_channels == out_channels - confirm
  whether the deviation is intentional. Behaviour is left unchanged here.
  """

  def __init__(self,in_channels,out_channels,kernel_size,stride,padding,ratio,reduction=2,):
    super().__init__()
    hidden_dim = in_channels * ratio
    self.expand = in_channels != hidden_dim

    # Width of the squeeze-and-excitation bottleneck. Clamp to at least one
    # channel: int(in_channels / reduction) is 0 when in_channels < reduction,
    # which would build a zero-channel conv inside the SE block.
    reduced_dim = max(1, int(in_channels / reduction))

    if self.expand:
      self.expand_conv = ConvBlock(in_channels, hidden_dim, kernel_size=3, stride=1, padding=1)

    self.conv = nn.Sequential(
        # Depthwise conv: one group per channel.
        ConvBlock(hidden_dim, hidden_dim, kernel_size, stride, padding, groups=hidden_dim),
        SqueezeExcitation(hidden_dim, reduced_dim),
        # Pointwise projection to out_channels, followed by BN with no activation.
        nn.Conv2d(hidden_dim, out_channels, 1),
        nn.BatchNorm2d(out_channels),
    )

  def forward(self, inputs):
    """Expand ``inputs`` if configured, then apply depthwise/SE/projection layers."""
    x = self.expand_conv(inputs) if self.expand else inputs
    return self.conv(x)

| Step        | Component                    | What It Does                              |
| ----------- | ---------------------------- | ----------------------------------------- |
| 1. Squeeze  | `AdaptiveAvgPool2d(1)`       | Global average pooling (reduce H×W → 1×1) |
| 2. Excite   | Conv + SiLU + Conv + Sigmoid | Learn importance of each channel          |
| 3. Reweight | `x * se(x)`                  | Emphasize important channels              |


In [10]:
class SqueezeExcitation(nn.Module):
  """Channel gating: pool globally, compute per-channel weights, rescale the input.

  Args:
    in_channels: number of channels of the tensor being gated.
    reduced_dim: number of channels in the bottleneck between the two 1x1 convs.
  """

  def __init__(self, in_channels, reduced_dim):
    super().__init__()
    gate_layers = [
        nn.AdaptiveAvgPool2d(1),  # squeeze: pool the spatial dims down to 1x1
        nn.Conv2d(in_channels, reduced_dim, kernel_size=1),
        nn.SiLU(),
        nn.Conv2d(reduced_dim, in_channels, kernel_size=1),
        nn.Sigmoid(),  # weights in (0, 1)
    ]
    self.se = nn.Sequential(*gate_layers)

  def forward(self, x):
    """Return ``x`` multiplied by the weights computed from ``x`` by ``self.se``."""
    channel_weights = self.se(x)
    return x * channel_weights

| **Section**            | **Purpose**                                                                |
| ---------------------- | -------------------------------------------------------------------------- |
| Stem ConvBlock         | Extract low-level features & downsample input image.                       |
| MBConv Block Loop      | Efficiently capture complex spatial patterns via lightweight convolutions. |
| Width/Depth Scaling    | Control model size dynamically across EfficientNet variants.               |
| Squeeze-and-Excitation | Add channel-wise attention to focus on important features.                 |
| Final Projection Block | Aggregate and project features into a unified high-level representation.   |
| `nn.Sequential`        | Bundle all feature layers into one forward pass.                           |


In [14]:
from math import ceil

| Stage          | What It Does                                                                |
| -------------- | --------------------------------------------------------------------------- |
| **Stem**       | Initial `ConvBlock` that reduces input size and increases channels          |
| **Body**       | Stacked **MBConv** blocks (Mobile Inverted Bottlenecks with SE + expansion) |
| **Head**       | 1x1 conv layer to increase channels to a fixed `last_channels` size         |
| **Pooling**    | Global average pooling to convert spatial info into vector form             |
| **Classifier** | Dropout + linear layer for final prediction                                 |


In [20]:
class EfficientNet(nn.Module):
  """EfficientNet-style classifier built from a stem, MBBlock stages, a head and a linear classifier.

  Args:
    model_name: key into ``scale_values`` (e.g. ``"b0"``); selects phi and the dropout rate.
    output: number of outputs of the final linear layer.

  Raises:
    KeyError: if ``model_name`` is not a key of ``scale_values``.
  """

  def __init__(self, model_name, output):
    super().__init__()
    # The per-variant resolution is not needed to build the layers.
    phi, _resolution, dropout = scale_values[model_name]
    self.depth_factor, self.width_factor = alpha ** phi, beta ** phi

    # Channel count produced by the head conv: 1280 scaled by the width factor.
    self.last_channels = ceil(1280 * self.width_factor)

    # Global average pooling compresses [B, C, H, W] -> [B, C, 1, 1]
    self.avgpool = nn.AdaptiveAvgPool2d(1)

    # Flatten turns it into a vector [B, C] for classification
    self.flatten = nn.Flatten()

    self.classifier = nn.Sequential(
        nn.Dropout(dropout),
        nn.Linear(self.last_channels, output),
    )

    # forward() reads self.extractor, so it has to be built during construction.
    self.feature_extractor()

  def feature_extractor(self):
    """Build the stem, the MBBlock stages and the head conv as ``self.extractor``.

    Returns:
      The ``nn.Sequential`` that was assigned to ``self.extractor``.
    """
    channels = int(32 * self.width_factor)
    features = [ConvBlock(3, channels, 3, stride=2, padding=1)]
    in_channels = channels

    # Row layout of basic_mb_params: t, c, r, s, k.
    for expand_ratio, base_channels, repeats, first_stride, kernel_size in basic_mb_params:
      # Round the scaled width up to a multiple of 4; accelerators tend to be
      # optimised for channel counts that are multiples of 4, 8, 16, ...
      out_channels = 4 * ceil(int(base_channels * self.width_factor) / 4)
      num_layers = ceil(repeats * self.depth_factor)

      for layer in range(num_layers):
        features.append(
            MBBlock(
                in_channels,
                out_channels,
                kernel_size=kernel_size,
                # Only the first block of a stage uses the stage's stride.
                stride=first_stride if layer == 0 else 1,
                padding=kernel_size // 2,
                ratio=expand_ratio,
            )
        )
        in_channels = out_channels

    # Head: a single 1x1 conv after ALL stages (not once per stage).
    features.append(ConvBlock(in_channels, self.last_channels, kernel_size=1, stride=1, padding=0))
    self.extractor = nn.Sequential(*features)
    return self.extractor

  def forward(self, x):
    """Extract features, pool globally, flatten and classify."""
    x = self.avgpool(self.extractor(x))
    return self.classifier(self.flatten(x))

# Build the "b1" variant with 1000 output classes (ImageNet).
model_name = "b1"
output_class = 1000
effnet = EfficientNet(model_name, output_class)
