In [1]:
# Reload imported modules before each cell executes, so edits to local helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exercise 5

<img src="./images/05.png" width=800>

In [2]:
import inspect
import os

import mlflow
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader, random_split
from torchinfo import summary
from torchvision import transforms

from utils import train_network, set_seed, accuracy_score_wrapper, weight_reset

  from tqdm.autonotebook import tqdm


In [None]:
# Store MLflow runs in a local directory and point the MLflow client at it.
os.environ.update(MLFLOW_TRACKING_URI='./mlruns08_5')
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])

In [None]:
# Make 'Exercise08_5' the active MLflow experiment for the runs started below.
mlflow.set_experiment('Exercise08_5')

<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_07/mlruns07_1/143507330168611334', creation_time=1750415411076, experiment_id='143507330168611334', last_update_time=1750415411076, lifecycle_stage='active', name='Exercise07_1', tags={}>

In [3]:
# Reproducibility: request deterministic cuDNN kernels and seed via the project helper.
torch.backends.cudnn.deterministic = True
set_seed(42)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset and DataLoader

In [5]:
# Mini-batch size used by every DataLoader created below.
batch_size = 256

In [None]:
# Every model in this notebook is built for 3 input channels (C = 3), but MNIST digits
# are single-channel. Replicate the grayscale channel three times so the same networks
# can be trained on both MNIST and CIFAR-10 without a channel mismatch.
mnist_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.ToTensor(),
])
mnist_train = torchvision.datasets.MNIST("./data", train=True, download=True, transform=mnist_transform)
mnist_test = torchvision.datasets.MNIST("./data", train=False, download=True, transform=mnist_transform)
# Only the training split is shuffled.
mnist_train_loader = DataLoader(mnist_train, shuffle=True, batch_size=batch_size)
mnist_test_loader = DataLoader(mnist_test, batch_size=batch_size)

In [6]:
# CIFAR-10 train/test splits, downloaded to ./data if missing and converted to tensors.
cifar_train = torchvision.datasets.CIFAR10(root='./data', train=True, transform=transforms.ToTensor(), download=True)
cifar_test = torchvision.datasets.CIFAR10(root='./data', train=False, transform=transforms.ToTensor(), download=True)

# Only the training split is shuffled.
cifar_train_loader = DataLoader(dataset=cifar_train, batch_size=batch_size, shuffle=True)
cifar_test_loader = DataLoader(dataset=cifar_test, batch_size=batch_size)

Files already downloaded and verified
Files already downloaded and verified


In [14]:
# Shape of the first CIFAR-10 training image tensor (the label is element [1] of the sample).
tuple(cifar_train[0][0].shape)

(3, 32, 32)

## Model

In [None]:
# Hyper-parameters shared by all model definitions below.
n_filters = 32  # base channel width; later stages use 2x and 4x this value
C = 3  # number of input channels fed to the first residual block
classes = 10  # number of outputs of the final Linear layer
leak_rate = 0.1  # slope argument passed to every LeakyReLU

### Residual without AdaptiveMaxPooling

In [None]:
class ResidualBlockE(nn.Module):
    """Residual block that keeps the channel count unchanged.

    The output is ``x + F(x)``, where ``F`` applies convolution -> batch norm ->
    LeakyReLU twice.
    """

    def __init__(self, channels, kernel_size=3, leak_rate=0.1):
        """
        channels: number of channels of both the input and the output
        kernel_size: side length of the convolution filters
        leak_rate: slope parameter of the LeakyReLU activations
        """
        super().__init__()
        # Padding that keeps the spatial size unchanged for odd kernel sizes,
        # so that F(x) can be added to x element-wise
        same_pad = (kernel_size - 1) // 2

        # Long path: two conv/BN/activation rounds, created in order
        long_path = []
        for _ in range(2):
            long_path.append(nn.Conv2d(channels, channels, kernel_size, padding=same_pad))
            long_path.append(nn.BatchNorm2d(channels))
            long_path.append(nn.LeakyReLU(leak_rate))
        self.F = nn.Sequential(*long_path)

    def forward(self, x):
        # Skip connection: the input bypasses F and is added to F's output
        residual = self.F(x)
        return x + residual

In [None]:
class ResidualBottleNeck(nn.Module):
    """Residual block with a 1x1 -> kxk -> 1x1 convolution path that may change the channel count.

    Each of the three stages on the long path is batch norm -> LeakyReLU -> convolution.
    The output is ``shortcut(x) + F(x)``.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, leak_rate=0.1):
        """
        in_channels: number of channels of the input
        out_channels: number of channels of the output
        kernel_size: side length of the filters of the middle convolution
        leak_rate: slope parameter of the LeakyReLU activations
        """
        super().__init__()
        # Padding that keeps the spatial size unchanged for odd kernel sizes
        same_pad = (kernel_size - 1) // 2
        # Width of the middle of the block: a quarter of the output width,
        # but never less than the input width
        hidden = max(out_channels // 4, in_channels)

        def bn_act_conv(c_in, c_out, k, padding):
            # One stage of the long path; 1x1 convolutions need no padding
            return [
                nn.BatchNorm2d(c_in),
                nn.LeakyReLU(leak_rate),
                nn.Conv2d(c_in, c_out, k, padding=padding),
            ]

        self.F = nn.Sequential(
            *bn_act_conv(in_channels, hidden, 1, 0),              # into the bottleneck width
            *bn_act_conv(hidden, hidden, kernel_size, same_pad),  # full convolution
            *bn_act_conv(hidden, out_channels, 1, 0),             # out to the target width
        )

        if in_channels == out_channels:
            # Shapes already agree: the shortcut passes the input through untouched
            self.shortcut = nn.Identity()
        else:
            # Shapes differ: a 1x1 convolution plus batch norm brings x to out_channels
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, padding=0),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        # shortcut(x) stands in for "x" and does only the work needed to match shapes
        skip = self.shortcut(x)
        return skip + self.F(x)

In [None]:
def resnet_without_AdaptiveMaxPooling(D):
    """Build a residual network whose head flattens the feature map directly.

    D: number of pixels per input channel (height * width). The final Linear
       layer takes D*n_filters//4 inputs, which equals the flattened feature map
       (4*n_filters channels after two 2x2 poolings) when height and width are
       both divisible by 4.
    """
    layers = []
    in_channels = C
    for stage, width in enumerate((n_filters, 2*n_filters, 4*n_filters)):
        if stage > 0:
            # Halve the spatial size between stages
            layers.append(nn.MaxPool2d((2,2)))
        layers.extend([
            # A bottleneck opens each stage because it can change the channel count
            ResidualBottleNeck(in_channels, width),
            nn.LeakyReLU(leak_rate),  # optional activation after each residual block
            ResidualBlockE(width),
            nn.LeakyReLU(leak_rate),
        ])
        in_channels = width
    layers.append(nn.Flatten())
    layers.append(nn.Linear(D*n_filters//4, classes))
    return nn.Sequential(*layers)

### Residual with AdaptiveMaxPooling

In [None]:
def _residual_stage(in_channels, out_channels):
    """One stage of the feature extractor: bottleneck, then residual block, each followed by LeakyReLU."""
    return [
        ResidualBottleNeck(in_channels, out_channels),  # can change the channel count
        nn.LeakyReLU(leak_rate),  # optional activation after each residual block
        ResidualBlockE(out_channels),
        nn.LeakyReLU(leak_rate),
    ]


# Feature extractor: three stages of growing width, with 2x2 max pooling between them.
res_net = nn.Sequential(
    *_residual_stage(C, n_filters),
    nn.MaxPool2d((2,2)),
    *_residual_stage(n_filters, 2*n_filters),
    nn.MaxPool2d((2,2)),
    *_residual_stage(2*n_filters, 4*n_filters),
)

In [None]:
# Classification head: pool every channel down to a single value, then two Linear layers.
classifier = nn.Sequential(
    nn.AdaptiveMaxPool2d(output_size=(1, 1)),  # 1x1 output per channel, whatever the input size
    nn.Flatten(),
    nn.Linear(in_features=4*n_filters, out_features=2*n_filters),
    nn.LeakyReLU(negative_slope=leak_rate),
    nn.Linear(in_features=2*n_filters, out_features=classes),
)

In [None]:
def resnet_with_AdaptiveMaxPooling():
    """Chain the residual feature extractor and the adaptive-pooling classifier.

    Every call wraps the same module-level `res_net` and `classifier` objects,
    so networks returned by different calls share their parameters.
    """
    stages = (res_net, classifier)
    return nn.Sequential(*stages)

## Training

In [None]:
loss_func = nn.CrossEntropyLoss()
epochs = 50
# Settings logged with each MLflow run; the loss is recorded by its class name.
params = dict(
    device=device,
    loss_func=type(loss_func).__name__,
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Metric functions handed to train_network through its score_funcs argument.
score_funcs = {"Accuracy": accuracy_score_wrapper}

In [None]:
# Experiment name -> model factory. The values are the functions themselves,
# not built networks, so they must be called to obtain an nn.Module.
models = {
    'without_AdaptiveMaxPooling': resnet_without_AdaptiveMaxPooling,
    'with_AdaptiveMaxPooling': resnet_with_AdaptiveMaxPooling
}

In [None]:
# Parallel tuples: position i of each tuple refers to the same dataset.
data_names = ('mnist', 'cifar')
train_datasets = (mnist_train, cifar_train)
train_dataloaders = (mnist_train_loader, cifar_train_loader)
test_dataloaders = (mnist_test_loader, cifar_test_loader)

In [None]:
def _build_model(factory, sample_shape):
    """Instantiate a network from one of the factories stored in `models`.

    factory: callable returning an nn.Module. It either takes no arguments or
        has a parameter named `D` (pixels per channel, used to size the final
        Linear layer of the network without adaptive pooling).
    sample_shape: (channels, height, width) of a single input image.
    Returns the nn.Module produced by calling the factory.
    """
    _, height, width = sample_shape
    if 'D' in inspect.signature(factory).parameters:
        return factory(height * width)
    return factory()


# Result of every run keyed by (experiment, data_name), so none is overwritten.
results_by_run = {}

for experiment, model_factory in models.items():
    for data_name, train_dataset, train_loader, valid_loader in zip(
        data_names, train_datasets, train_dataloaders, test_dataloaders
    ):
        sample_shape = tuple(train_dataset[0][0].shape)  # (channels, height, width)
        if sample_shape[0] != C:
            # Fail early with a clear message instead of deep inside the first convolution.
            raise ValueError(
                f"{data_name} images have {sample_shape[0]} channel(s) "
                f"but the models are built for C={C} input channels"
            )

        # `models` stores factories, so the network must be built before it can be used.
        model = _build_model(model_factory, sample_shape)
        # The adaptive-pooling factory reuses module-level layers; weight_reset is
        # applied so that no trained weights carry over from a previous run.
        model.apply(weight_reset)
        optimizer = optim.AdamW(model.parameters())

        params['experiment'] = experiment
        params['data_name'] = data_name
        params['optimizer'] = optimizer.defaults
        params['total_params'] = sum(p.numel() for p in model.parameters() if p.requires_grad)
        params['total_params_all'] = sum(p.numel() for p in model.parameters())

        # torchinfo expects a flat (batch, channels, height, width) tuple. Its output
        # contains box-drawing characters, hence the explicit UTF-8 encoding.
        with open('model_summary.txt', 'w', encoding='utf-8') as f:
            f.write(str(summary(model, input_size=(batch_size, *sample_shape))))

        with mlflow.start_run(nested=True, run_name=f'{experiment}_{data_name}'):
            mlflow.log_artifact('model_summary.txt')
            mlflow.log_params(params)

            results = train_network(
                model=model,
                optimizer=optimizer,
                loss_func=loss_func,
                train_loader=train_loader,
                valid_loader=valid_loader,
                epochs=epochs,
                device=device,
                score_funcs=score_funcs,
            )
            results_by_run[(experiment, data_name)] = results

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

## Results

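### How does performance change?

> **Note:** the discussion below is about replacing max pooling with strided convolutions in a U-Net, whereas the code in this notebook trains residual networks with and without adaptive max pooling.

Replacing max pooling with strided convolutions generally leads to several changes in performance, both in terms of model behavior and computational characteristics: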

#### 1. Performance (accuracy / segmentation quality)

- **Improved spatial information retention:** This is the primary benefit. Max pooling is a destructive operation: it discards information by keeping only the maximum value in each window. Strided convolutions, on the other hand, learn a transformation, so the network can learn to extract more meaningful features during downsampling. This leads to:
  - **Better boundary localization:** Finer details are preserved, which is critical for accurate pixel-wise segmentation. The output masks tend to have sharper and more precise boundaries.
  - **Reduced "gridding" artifacts:** In some upsampling architectures that rely heavily on `ConvTranspose2d`, replacing pooling with strided convolutions in the encoder can sometimes help reduce checkerboard artifacts in the output, although `ConvTranspose2d` itself can cause them.
- **Increased model capacity/complexity:** Strided convolutions introduce learnable parameters (weights and biases) where max pooling had none. This increases the model's capacity, allowing it to learn more complex representations. However, it also means the model is:
  - **Potentially more prone to overfitting:** With more parameters, the model might be more susceptible to overfitting if the dataset is small or regularization is insufficient.
  - **In need of more data:** To train these additional parameters effectively, a larger and more diverse dataset might be beneficial.
#### 2. Computational performance

- **Increased computational cost:**
  - **More FLOPs (floating-point operations):** Convolutions are computationally more expensive than simple max-pooling operations. Even though they reduce the spatial dimensions, the multiplications and additions involved in the convolutions increase the overall FLOP count per inference.
  - **Increased memory usage:** Although the difference is not drastic, the intermediate feature maps might require slightly more memory than with max pooling, depending on the exact kernel and stride choices.
  - **Potentially slower training/inference:** Due to the increased computational cost, training and inference times will likely increase compared to a max-pooling-based U-Net of similar depth.
- **No "indifference to small translations":** Max pooling offers a degree of translational invariance, because a small shift in the input might not change the maximum value in a pooling window. Strided convolutions are more sensitive to input translations, which can be both a benefit (more precise localization) and a drawback (less robustness to minor input variations without proper augmentation).
### How to measure the change

To quantify the performance change, you would need to:

1. **Implement both versions:** one with max pooling and one with strided convolutions.
2. **Train both models:** on the same dataset, using the same training pipeline (optimizer, learning rate, epochs, etc.).
3. **Evaluate both models:** using appropriate metrics for your task (e.g., IoU, Dice score, or pixel accuracy for segmentation) on a held-out test set.
4. **Compare computational resources:** monitor training time, inference time, and GPU memory usage.
### In summary

Replacing max pooling with strided convolutions in a U-Net often leads to improved segmentation accuracy, particularly for boundary delineation, due to better preservation of spatial information and increased learning capacity. However, this comes at the cost of increased computational complexity (more FLOPs and potentially longer training and inference times). The trade-off is often worthwhile for tasks that demand high spatial precision.