# Inception Module

The Inception module introduces a significant departure from earlier architectures like **AlexNet** and **ZF-Net**, which relied on a fixed convolution filter size for each layer. Instead, the Inception architecture performs multiple convolutions with different filter sizes in parallel — specifically **1×1**, **3×3**, and **5×5** — along with a **3×3 max pooling** operation.

Each of these filter sizes serves a distinct purpose:

- **1×1 convolutions**: Help with dimensionality reduction and capture fine-grained, local features.  
- **3×3 convolutions**: Focus on mid-level spatial patterns.  
- **5×5 convolutions**: Designed to detect more abstract, broader features.  
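- **3×3 max pooling**: A commonly used operation in deep networks. Inside the module it is applied with stride 1 and padding, so the spatial size is preserved (no downsampling), and it is followed by a 1×1 convolution that projects the pooled features to the desired number of channels.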

All these operations are applied **in parallel** to the same input, and their outputs are **concatenated (depth-wise)** to form the final output of the module.

> The key idea: By combining filters of multiple sizes, the Inception module can effectively capture features at different spatial scales, improving its ability to detect both fine details and global patterns.

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset directory under /content/drive/MyDrive used later is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms, datasets
from torchsummary import summary

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The GoogLeNet model

In [4]:
import torch
import torch.nn as nn


# Target device for the model and every batch: CUDA if available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the basic convolution block
class ConvBlock(nn.Module):
  """Conv2d -> BatchNorm2d -> ReLU, the basic unit reused throughout the network.

  Args:
    in_channels: number of channels of the incoming feature map.
    out_channels: number of filters produced by the convolution.
    kernel_size, stride, padding: forwarded unchanged to ``nn.Conv2d``.
  """

  def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
    super().__init__()

    self.conv = nn.Conv2d(
      in_channels,
      out_channels,
      kernel_size=kernel_size,
      stride=stride,
      padding=padding,
    )
    self.bn = nn.BatchNorm2d(out_channels)
    self.relu = nn.ReLU()

  def forward(self, x):
    """Apply convolution, then batch normalisation, then ReLU to ``x``."""
    convolved = self.conv(x)
    normalised = self.bn(convolved)
    return self.relu(normalised)

# Building the inception block
class Inception(nn.Module):
  """Inception module: four branches applied to the same input, concatenated on channels.

  Args:
    in_channels: channels of the incoming feature map.
    num1x1: output channels of the plain 1x1 branch.
    num3x3_reduce: channels of the 1x1 reduction placed before the 3x3 convolution.
    num3x3: output channels of the 3x3 branch.
    num5x5_reduce: channels of the 1x1 reduction placed before the 5x5 convolution.
    num5x5: output channels of the 5x5 branch.
    pool_proj: output channels of the 1x1 projection that follows the max pooling.

  The output has ``num1x1 + num3x3 + num5x5 + pool_proj`` channels.
  """

  def __init__(self, in_channels, num1x1, num3x3_reduce, num3x3, num5x5_reduce, num5x5, pool_proj):
    super().__init__()

    # Every branch uses stride 1 with padding matched to its kernel size, so
    # all four outputs keep the input's height and width and can be stacked.

    # Branch 1: 1x1 convolution only.
    self.block1 = nn.Sequential(
      ConvBlock(in_channels, num1x1, kernel_size=1, stride=1, padding=0),
    )
    # Branch 2: 1x1 channel reduction followed by a 3x3 convolution.
    self.block2 = nn.Sequential(
      ConvBlock(in_channels, num3x3_reduce, kernel_size=1, stride=1, padding=0),
      ConvBlock(num3x3_reduce, num3x3, kernel_size=3, stride=1, padding=1),
    )
    # Branch 3: 1x1 channel reduction followed by a 5x5 convolution.
    self.block3 = nn.Sequential(
      ConvBlock(in_channels, num5x5_reduce, kernel_size=1, stride=1, padding=0),
      ConvBlock(num5x5_reduce, num5x5, kernel_size=5, stride=1, padding=2),
    )
    # Branch 4: 3x3 max pooling followed by a 1x1 projection.
    self.block4 = nn.Sequential(
      nn.MaxPool2d(3, stride=1, padding=1, ceil_mode=True),
      ConvBlock(in_channels, pool_proj, kernel_size=1, stride=1, padding=0),
    )

  def forward(self, x):
    """Feed ``x`` to each branch independently and concatenate the results."""
    branches = (self.block1, self.block2, self.block3, self.block4)
    branch_outputs = [branch(x) for branch in branches]

    # dim=1 is the channel axis of an N x C x H x W tensor.
    return torch.cat(branch_outputs, 1)

class Auxiliary(nn.Module):
  """Auxiliary classifier head attached to an intermediate feature map.

  Pipeline: adaptive average pool to 4x4 -> 1x1 conv to 128 channels -> ReLU
  -> flatten -> Linear(2048, 1024) -> ReLU -> Dropout(0.7) -> Linear(1024, num_classes).

  Args:
    in_channels: channels of the feature map the head is attached to.
    num_classes: number of output logits.
  """

  def __init__(self, in_channels, num_classes):
    super().__init__()

    self.pool = nn.AdaptiveAvgPool2d((4, 4))
    self.conv = nn.Conv2d(in_channels, 128, kernel_size=1)
    self.relu = nn.ReLU()
    # 128 channels * 4 * 4 pooled positions = 2048 features after flattening.
    self.fc1 = nn.Linear(2048, 1024)
    self.dropout = nn.Dropout(p=0.7)
    self.fc2 = nn.Linear(1024, num_classes)

  def forward(self, x):
    """Return class logits of shape ``[batch_size, num_classes]``."""
    pooled = self.pool(x)
    reduced = self.relu(self.conv(pooled))

    # The conv output is 4-D ([batch, channels, height, width]) while the
    # linear layer expects 2-D input, so collapse everything but the batch axis.
    flat = reduced.flatten(start_dim=1)

    hidden = self.relu(self.fc1(flat))
    hidden = self.dropout(hidden)
    return self.fc2(hidden)

class GoogLeNet(nn.Module):
  """GoogLeNet-style classifier built from ConvBlock, Inception and Auxiliary modules.

  Args:
    num_classes: number of output logits for the main and both auxiliary heads.

  ``forward`` always returns a 3-tuple ``(main_logits, aux_logits_4A, aux_logits_4D)``.
  """

  def __init__(self, num_classes=10):
    super().__init__()

    # Stem: 7x7 stride-2 conv, pool, 1x1 conv, 3x3 conv, pool.
    self.conv1 = ConvBlock(3, 64, kernel_size=7, stride=2, padding=3)
    self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)
    self.conv2 = ConvBlock(64, 64, kernel_size=1, stride=1, padding=0)
    self.conv3 = ConvBlock(64, 192, kernel_size=3, stride=1, padding=1)
    self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

    # Inception arguments, in order:
    #   in_channels, num1x1, num3x3_reduce, num3x3, num5x5_reduce, num5x5, pool_proj
    # Each module outputs num1x1 + num3x3 + num5x5 + pool_proj channels, which
    # is the in_channels of the module that follows it.
    self.inception3A = Inception(192, 64, 96, 128, 16, 32, 32)      # -> 256
    self.inception3B = Inception(256, 128, 128, 192, 32, 96, 64)    # -> 480
    self.pool4 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

    self.inception4A = Inception(480, 192, 96, 208, 16, 48, 64)     # -> 512
    self.inception4B = Inception(512, 160, 112, 224, 24, 64, 64)    # -> 512
    self.inception4C = Inception(512, 128, 128, 256, 24, 64, 64)    # -> 512
    self.inception4D = Inception(512, 112, 144, 288, 32, 64, 64)    # -> 528
    self.inception4E = Inception(528, 256, 160, 320, 32, 128, 128)  # -> 832
    self.pool5 = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)

    self.inception5A = Inception(832, 256, 160, 320, 32, 128, 128)  # -> 832
    self.inception5B = Inception(832, 384, 192, 384, 48, 128, 128)  # -> 1024
    self.pool6 = nn.AdaptiveAvgPool2d((1, 1))

    # Main classifier on the globally pooled 1024-channel features.
    self.dropout = nn.Dropout(p=0.4)
    self.fc = nn.Linear(1024, num_classes)

    # Auxiliary heads fed by the outputs of inception4A (512 ch) and inception4D (528 ch).
    self.aux4A = Auxiliary(512, num_classes)
    self.aux4D = Auxiliary(528, num_classes)

  def forward(self, x):
    """Return ``(main_logits, aux1_logits, aux2_logits)`` for the batch ``x``."""
    # Stem
    feats = self.pool1(self.conv1(x))
    feats = self.conv3(self.conv2(feats))
    feats = self.pool3(feats)

    # Stage 3
    feats = self.inception3B(self.inception3A(feats))
    feats = self.pool4(feats)

    # Stage 4, tapping the two auxiliary heads along the way
    feats = self.inception4A(feats)
    aux1 = self.aux4A(feats)
    feats = self.inception4B(feats)
    feats = self.inception4C(feats)
    feats = self.inception4D(feats)
    aux2 = self.aux4D(feats)
    feats = self.inception4E(feats)
    feats = self.pool5(feats)

    # Stage 5 and the main classifier
    feats = self.inception5B(self.inception5A(feats))
    pooled = self.pool6(feats)
    flat = torch.flatten(pooled, 1)
    logits = self.fc(self.dropout(flat))

    return logits, aux1, aux2


# Training Code

In [11]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms, datasets

# Use CUDA for training when it is available; otherwise train on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def _run_epoch(model, loader, criterion, device, optimizer=None):
  """Run one full pass over ``loader``; train when ``optimizer`` is given, else evaluate.

  The model must return ``(main_logits, aux_logits_1, aux_logits_2)``. The
  combined loss is ``main + 0.3 * aux1 + 0.3 * aux2``.

  Returns:
    ``(summed_loss, num_correct, num_samples)`` where ``summed_loss`` is the
    per-batch loss multiplied by the batch size and accumulated over batches,
    and ``num_correct`` counts correct predictions of the main head.
  """
  training = optimizer is not None
  model.train(training)

  summed_loss = 0.0
  num_correct = 0
  num_samples = 0

  # Gradients are only tracked during a training pass.
  with torch.set_grad_enabled(training):
    for inputs, labels in loader:
      inputs, labels = inputs.to(device), labels.to(device)

      if training:
        optimizer.zero_grad()

      main_out, aux_out1, aux_out2 = model(inputs)
      loss = (criterion(main_out, labels)
              + 0.3 * criterion(aux_out1, labels)
              + 0.3 * criterion(aux_out2, labels))

      if training:
        loss.backward()
        optimizer.step()

      batch_size = inputs.shape[0]
      # NOTE(review): assumes `criterion` returns the batch mean (the default
      # for nn.CrossEntropyLoss), so scaling by the batch size gives a sum
      # that can be averaged over all samples afterwards.
      summed_loss += loss.item() * batch_size
      num_correct += (main_out.argmax(dim=1) == labels).sum().item()
      num_samples += batch_size

  return summed_loss, num_correct, num_samples


def train_model(model, train_loader, val_loader, criterion, optimizer,
                epochs=15, checkpoint_dir="/content/sample_data", device=None):
  """Train ``model`` with auxiliary-loss supervision and validate after every epoch.

  Args:
    model: network returning ``(main_logits, aux_logits_1, aux_logits_2)``.
    train_loader: iterable of ``(inputs, labels)`` training batches.
    val_loader: iterable of ``(inputs, labels)`` validation batches.
    criterion: loss function applied to each of the three outputs.
    optimizer: optimizer already bound to ``model``'s parameters.
    epochs: number of passes over ``train_loader`` (default 15).
    checkpoint_dir: directory receiving ``checkpoint<epoch>`` after every
      epoch and ``googlenet_model`` at the end; created if missing.
    device: device to train on; defaults to CUDA when available, else CPU.

  Returns:
    ``(train_epoch_loss_history, val_epoch_loss_history)``: per-epoch average
    combined loss (main + 0.3 * each auxiliary loss) per sample.

  Losses and accuracies are normalised by the number of samples actually
  drawn from each loader rather than by fixed dataset sizes, so the numbers
  stay correct for any dataset or split.
  """
  import os  # local import: only needed for building checkpoint paths

  if device is None:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

  # Moving the model once is enough; the per-epoch passes reuse it in place.
  model.to(device)

  train_epoch_loss_history, val_epoch_loss_history = [], []
  info = "[For Epoch {}/{}]: train-loss = {:0.5f} | train-acc = {:0.3f} | val-loss = {:0.5f} | val-acc = {:0.3f}"

  for epoch in range(epochs):
    train_sum, train_correct, train_seen = _run_epoch(
      model, train_loader, criterion, device, optimizer)
    val_sum, val_correct, val_seen = _run_epoch(
      model, val_loader, criterion, device)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_epoch_loss = train_sum / max(train_seen, 1)
    train_acc = train_correct / max(train_seen, 1)
    val_loss = val_sum / max(val_seen, 1)
    val_acc = val_correct / max(val_seen, 1)

    train_epoch_loss_history.append(train_epoch_loss)
    val_epoch_loss_history.append(val_loss)

    print(info.format(epoch + 1, epochs, train_epoch_loss, train_acc, val_loss, val_acc))

    torch.save(model.state_dict(),
               os.path.join(checkpoint_dir, "checkpoint{}".format(epoch + 1)))

  torch.save(model.state_dict(), os.path.join(checkpoint_dir, "googlenet_model"))

  return train_epoch_loss_history, val_epoch_loss_history


In [6]:
# Build the network directly on the selected device, then print a per-layer
# summary for a 3-channel 96x96 input.
model = GoogLeNet().to(device)
summary(model, (3, 96, 96))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 48, 48]           9,472
       BatchNorm2d-2           [-1, 64, 48, 48]             128
              ReLU-3           [-1, 64, 48, 48]               0
         ConvBlock-4           [-1, 64, 48, 48]               0
         MaxPool2d-5           [-1, 64, 24, 24]               0
            Conv2d-6           [-1, 64, 24, 24]           4,160
       BatchNorm2d-7           [-1, 64, 24, 24]             128
              ReLU-8           [-1, 64, 24, 24]               0
         ConvBlock-9           [-1, 64, 24, 24]               0
           Conv2d-10          [-1, 192, 24, 24]         110,784
      BatchNorm2d-11          [-1, 192, 24, 24]             384
             ReLU-12          [-1, 192, 24, 24]               0
        ConvBlock-13          [-1, 192, 24, 24]               0
        MaxPool2d-14          [-1, 192,

# Loading CIFAR-10


In [7]:
def cifar_dataloader(root='/content/drive/MyDrive/All_Datasets/CIFAR10',
                     batch_size=128, val_size=5000, seed=None):
  """Download CIFAR-10 if needed and build train / validation / test loaders.

  Args:
    root: directory where the dataset is stored or downloaded to.
    batch_size: batch size shared by all three loaders.
    val_size: number of training images held out for validation; the rest
      of the training set is used for training.
    seed: optional integer; when given, the train/validation split is drawn
      from a dedicated generator seeded with it so the split is reproducible.
      When ``None`` the global random state is used.

  Returns:
    ``(train_loader, val_loader, test_loader)``. Only the training loader is
    shuffled; evaluation metrics do not depend on sample order, so the
    validation and test loaders iterate in a fixed order.
  """
  # A single-element mean/std is passed to Normalize for the RGB images.
  transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5])])

  full_train_dataset = datasets.CIFAR10(root, train=True, download=True, transform=transform)
  test_dataset = datasets.CIFAR10(root, train=False, download=True, transform=transform)

  # Split dataset into training set and validation set.
  train_size = len(full_train_dataset) - val_size
  split_kwargs = {}
  if seed is not None:
    split_kwargs["generator"] = torch.Generator().manual_seed(seed)
  train_dataset, val_dataset = random_split(full_train_dataset, (train_size, val_size), **split_kwargs)

  print("Image shape of a random sample image : {}".format(train_dataset[0][0].numpy().shape), end = '\n\n')

  print("Training Set:   {} images".format(len(train_dataset)))
  print("Validation Set:   {} images".format(len(val_dataset)))
  print("Test Set:       {} images".format(len(test_dataset)))

  # Generate dataloaders
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
  test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  return train_loader, val_loader, test_loader

In [8]:
# Build the three CIFAR-10 loaders used for training, validation and testing.
train_loader, val_loader, test_loader = cifar_dataloader()

100%|██████████| 170M/170M [00:06<00:00, 27.6MB/s]


Image shape of a random sample image : (3, 32, 32)

Training Set:   45000 images
Validation Set:   5000 images
Test Set:       10000 images


# Training the model

In [9]:
# Cross-entropy over class logits, optimised with Adam at a learning rate of 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [12]:
# Run the training loop and keep the per-epoch loss curves for later inspection.
train_epoch_loss_history, val_epoch_loss_history = train_model(
  model=model,
  train_loader=train_loader,
  val_loader=val_loader,
  criterion=criterion,
  optimizer=optimizer,
)

[For Epoch 1/15]: train-loss = 2.37080 | train-acc = 0.459 | val-loss = 2.03434 | val-acc = 0.553
[For Epoch 2/15]: train-loss = 1.69682 | train-acc = 0.629 | val-loss = 1.66142 | val-acc = 0.640
[For Epoch 3/15]: train-loss = 1.39172 | train-acc = 0.701 | val-loss = 1.39228 | val-acc = 0.702
[For Epoch 4/15]: train-loss = 1.16384 | train-acc = 0.755 | val-loss = 1.31606 | val-acc = 0.724
[For Epoch 5/15]: train-loss = 1.00401 | train-acc = 0.789 | val-loss = 1.30383 | val-acc = 0.732
[For Epoch 6/15]: train-loss = 0.86572 | train-acc = 0.817 | val-loss = 1.30582 | val-acc = 0.733
[For Epoch 7/15]: train-loss = 0.75473 | train-acc = 0.845 | val-loss = 1.27222 | val-acc = 0.742
[For Epoch 8/15]: train-loss = 0.65294 | train-acc = 0.864 | val-loss = 1.19393 | val-acc = 0.754
[For Epoch 9/15]: train-loss = 0.55171 | train-acc = 0.884 | val-loss = 1.26246 | val-acc = 0.765
[For Epoch 10/15]: train-loss = 0.48323 | train-acc = 0.900 | val-loss = 1.27665 | val-acc = 0.762
[For Epoch 11/15]: 

# Evaluating the model

In [13]:
# Rebuild the architecture and restore the weights saved at the end of training.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written during a GPU run still loads on a CPU-only runtime.
model = GoogLeNet()
model.load_state_dict(torch.load('/content/sample_data/googlenet_model', map_location=device))

<All keys matched successfully>

In [14]:
# Measure top-1 accuracy of the main classifier head on the test set.
num_test_samples = 0
correct = 0

# Use the shared `device` instead of a hard-coded .cuda(): the inputs below are
# moved to `device`, so the model must live there too (and CPU-only runs work).
model.eval().to(device)

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Make predictions; the two auxiliary outputs are not used for evaluation
        prediction, _, _ = model(inputs)

        # Retrieve the index of the highest-scoring class for each sample
        predicted_class = prediction.argmax(dim=1)

        # Count correct predictions and the samples actually evaluated
        correct += (predicted_class == labels).sum().item()
        num_test_samples += labels.size(0)

# Dividing by the counted samples keeps the metric right for any test-set size;
# max(..., 1) avoids a ZeroDivisionError on an empty loader.
test_accuracy = correct / max(num_test_samples, 1)

print('Test accuracy: {}'.format(test_accuracy))

Test accuracy: 0.7481
