In [None]:
# Reload imported modules before each cell executes, so edits to local helper
# code (such as `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Exercise 4

<img src="./images/04.png" width=800>

In [None]:
import os

import mlflow
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split
from torchinfo import summary
from torchvision import transforms

from utils import train_network, set_seed

  from tqdm.autonotebook import tqdm


In [None]:
# Point MLflow at a local, notebook-relative tracking store. The value is
# published through the environment first and then read back from it, so the
# environment variable and the active tracking URI always agree.
os.environ.update(MLFLOW_TRACKING_URI='./mlruns08_4')
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI'))

In [None]:
# Runs started later in this notebook are recorded under this MLflow experiment.
mlflow.set_experiment('Exercise08_4')

<Experiment: artifact_location='/home/spakdel/my_projects/Books/Inside-Deep-Learning/Exercises_InsideDeepLearning/Chapter_07/mlruns07_1/143507330168611334', creation_time=1750415411076, experiment_id='143507330168611334', last_update_time=1750415411076, lifecycle_stage='active', name='Exercise07_1', tags={}>

In [None]:
# Reproducibility: request deterministic cuDNN kernels, then seed via the
# project helper (set_seed lives in utils; which RNGs it seeds is defined there).
torch.backends.cudnn.deterministic = True
set_seed(42)
# Train on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset and DataLoader

In [None]:
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import re
import os
from glob import glob

data_url_zip = "https://github.com/kamalkraj/DATA-SCIENCE-BOWL-2018/blob/master/data/stage1_train.zip?raw=true"
path = './data/stage1_train'
os.makedirs(path, exist_ok=True)

# Download and unpack only while the target folder is still empty, so that
# "Restart & Run All" does not re-fetch the archive on every run.
# (Delete the folder to force a fresh download, e.g. after an interrupted extract.)
if not os.listdir(path):
    # Context managers close the HTTP response and the archive deterministically.
    with urlopen(data_url_zip) as resp:
        archive_bytes = BytesIO(resp.read())
    with ZipFile(archive_bytes) as archive:
        archive.extractall(path=path)

# One folder per sample. glob() returns entries in filesystem order, which can
# differ between machines; sorting keeps the index -> sample mapping (and hence
# any seeded train/test split) stable. Stray non-directory entries are skipped
# because the dataset expects "<sample>/images" and "<sample>/masks" sub-folders.
paths = sorted(p for p in glob(path + '/*') if os.path.isdir(p))

In [None]:
class DSB2018(Dataset):
    """Dataset class for the 2018 Data Science Bowl.

    Every sample lives in its own folder that holds one picture under
    ``images/`` and any number of single-object masks under ``masks/``.
    ``__getitem__`` returns ``(image, mask)`` where ``image`` is the RGB picture
    and ``mask`` is the union of all object masks as a 0/1 float tensor with a
    single channel; both are resized to ``image_size``.
    """
    def __init__(self, paths, image_size=(256, 256)):
        """
        paths: a list of paths to every image folder in the dataset
        image_size: (height, width) every image and mask is resized to
        """
        self.paths = paths
        self.image_size = tuple(image_size)
        # Build the preprocessing pipeline once instead of on every item access.
        self.transform = transforms.Compose([
            transforms.Resize(self.image_size),
            transforms.ToTensor()])

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        # There is only one image in each images path, so grab the first match.
        img_path = glob(self.paths[idx] + "/images/*")[0]
        # ...but there are multiple mask images in each mask path.
        mask_paths = glob(self.paths[idx] + "/masks/*")

        # The stored image carries an unused alpha channel; convert('RGB') drops it.
        # Opening inside `with` closes the underlying file handle promptly.
        with Image.open(img_path) as img:
            image = self.transform(img.convert('RGB'))

        if not mask_paths:
            # No annotated objects: return an all-background mask instead of
            # failing with an IndexError.
            return image, torch.zeros((1, *self.image_size))

        masks = []
        for mask_path in mask_paths:
            with Image.open(mask_path) as mask_img:
                masks.append(self.transform(mask_img.convert('L')))

        # Union of all object masks. Binarising through bool() for *every* mask
        # (including the single-mask case) guarantees the target only contains
        # 0.0 and 1.0, even where resizing produced fractional edge values.
        final_mask = torch.stack(masks).bool().any(dim=0).float()
        return image, final_mask

In [None]:
TRAIN_SIZE = 500   # number of samples used for training; the remainder is held out
SPLIT_SEED = 42    # fixed seed so the split does not depend on prior RNG use

dns_data = DSB2018(paths)
if len(dns_data) < TRAIN_SIZE:
    raise ValueError(
        f"Dataset has only {len(dns_data)} samples, fewer than TRAIN_SIZE={TRAIN_SIZE}; "
        "check that the archive was downloaded and extracted correctly."
    )

# A dedicated generator makes the split identical on every execution of this
# cell, regardless of how much of the global RNG stream earlier cells consumed.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_data, test_data = random_split(
    dns_data,
    [TRAIN_SIZE, len(dns_data) - TRAIN_SIZE],
    generator=split_generator,
)

batch_size = 16
train_seg_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_seg_loader = DataLoader(test_data, batch_size=batch_size)

## Model

In [None]:
# NOTE(review): n_filters is not referenced by the visible cells; the U-Net
# definitions below hard-code 32 as their first channel width.
n_filters = 32
# Number of image channels; used for the model-summary input size in the training cell.
C = 3

### U-Net with Maxpooling

In [None]:
def cnn_layer(in_filters, out_filters, kernel_size=3):
    """
    Build one hidden layer: convolution, then batch normalization, then LeakyReLU.

    in_filters: how many channels are in the input to this layer
    out_filters: how many channels should this layer output
    kernel_size: how large should the filters of this layer be
    """
    # Padding of half the kernel size on each side keeps the spatial size
    # unchanged for odd kernel sizes.
    conv = nn.Conv2d(in_filters, out_filters, kernel_size, padding=kernel_size // 2)
    norm = nn.BatchNorm2d(out_filters)
    activation = nn.LeakyReLU()
    return nn.Sequential(conv, norm, activation)

In [None]:
class UNetBlock2d(nn.Module): #Our class extends nn.Module, all PyTorch layers must extend this
    """One level of a U-Net that down-samples with max pooling.

    The block runs `in_model` at the current resolution, optionally runs a
    `bottleneck` (max-pool -> sub-network -> transposed-conv up-sampling) on
    that result, concatenates both along the channel axis and feeds the result
    through `out_model`.
    """
    def __init__(self, in_channels, mid_channels, out_channels=None, layers=1, sub_network=None, filter_size=3):
        """
        in_channels: the number of channels in the input to this block
        mid_channels: the number of channels to have as the output for each convolutional filter
        out_channels: if not `None`, ends the network with a 1x1 convolution to convert the number of output channels to a specific number.
        layers: how many blocks of hidden layers to create on both the input and output side of a U-Net block
        sub_network: the network to apply after shrinking the input by a factor of 2 using max pooling. The number of output channels should be equal to `mid_channels`
        filter_size: how large the convolutional filters should be
        """
        super().__init__()

        #Start preparing the layers used to process the input
        in_layers = [cnn_layer(in_channels, mid_channels, filter_size)]

        #With a sub-network its output is concatenated to ours, doubling the channels fed to the output layers
        inputs_to_outputs = 1 if sub_network is None else 2

        #Preparing the layers used to make the final output, which has extra input channels from any sub-network
        out_layers = [cnn_layer(mid_channels*inputs_to_outputs, mid_channels, filter_size)]

        #Make the additional hidden layers used for the input and output
        for _ in range(layers-1):
            in_layers.append(cnn_layer(mid_channels, mid_channels, filter_size))
            out_layers.append(cnn_layer(mid_channels, mid_channels, filter_size))
        #Use 1x1 Convolutions to ensure a specific output size
        if out_channels is not None:
            out_layers.append(nn.Conv2d(mid_channels, out_channels, 1, padding=0))

        #define our three total sub-networks:
        #1) in_model performs the initial rounds of convolution
        self.in_model = nn.Sequential(*in_layers)
        #2) our subnetwork works on the max-pooled result. The pooling and up-scaling are part of the sub-model
        if sub_network is not None:
            self.bottleneck = nn.Sequential(
                nn.MaxPool2d(2), #Shrink
                sub_network, #process the smaller resolution
                #expand back up
                nn.ConvTranspose2d(mid_channels, mid_channels, filter_size, padding=filter_size//2, output_padding=1, stride=2)
            )
        else:
            self.bottleneck = None
        #3) the output model that processes the concatenated result, or just the output from in_model if no sub-network was given
        self.out_model = nn.Sequential(*out_layers)

    #The forward function is the code that takes an input and produces an output.
    def forward(self, x):
        #compute the convolutions at the current scale
        full_scale_result = self.in_model(x) #(B, mid_channels, H, W)
        #check if we have a bottleneck to apply
        if self.bottleneck is not None:
            #the bottleneck does both the pooling & expansion
            bottle_result = self.bottleneck(full_scale_result)
            #MaxPool2d(2) floors odd sizes, so after up-sampling the result can be one
            #row/column short of the full-scale tensor. Zero-pad the bottom/right edge so
            #the concatenation below also works for odd heights/widths.
            pad_h = full_scale_result.shape[-2] - bottle_result.shape[-2]
            pad_w = full_scale_result.shape[-1] - bottle_result.shape[-1]
            if pad_h > 0 or pad_w > 0:
                bottle_result = F.pad(bottle_result, (0, max(pad_w, 0), 0, max(pad_h, 0)))
            #Now shape (B, 2*mid_channels, H, W)
            full_scale_result = torch.cat([full_scale_result, bottle_result], dim=1)
        #compute the output on the concatenated (or not!) result
        return self.out_model(full_scale_result)
#Caption: A class implementing a “Block” of the U-Net approach. Each block needs to know how many channels are coming in and out of the block. The block has three components. 1) the input network, what processes the raw input coming into this block. 2) the bottleneck, which is what the block runs after shrinking the current results down by a factor of 2, and then expands the result back up to the original size. 3) the output network, which is run on the results from the prior two sub-networks concatenated together. 

In [None]:
# Assemble the three-level U-Net from the deepest level outwards; each level
# is handed to the next shallower one as its sub-network.
pool_level3 = UNetBlock2d(64, 128, out_channels=64, layers=2)
pool_level2 = UNetBlock2d(32, 64, out_channels=32, layers=2, sub_network=pool_level3)
pool_level1 = UNetBlock2d(3, 32, layers=2, sub_network=pool_level2)

unet_with_pooling = nn.Sequential(
    pool_level1,
    # Prediction for _every_ location: one output channel per pixel
    nn.Conv2d(32, 1, (3,3), padding=1),
)

### U-Net with strided conv (instead of maxpooling)

In [None]:
def cnn_layer(in_filters, out_filters, kernel_size=3):
    """
    Build one hidden layer: convolution, then batch normalization, then LeakyReLU.

    in_filters: how many channels are in the input to this layer
    out_filters: how many channels should this layer output
    kernel_size: how large should the filters of this layer be
    """
    # NOTE(review): this repeats the cnn_layer helper from the max-pooling
    # section with the same body; presumably kept so this section can be run
    # on its own.
    # Padding of half the kernel size on each side keeps the spatial size
    # unchanged for odd kernel sizes.
    conv = nn.Conv2d(in_filters, out_filters, kernel_size, padding=kernel_size // 2)
    norm = nn.BatchNorm2d(out_filters)
    activation = nn.LeakyReLU()
    return nn.Sequential(conv, norm, activation)

In [None]:
class UNetBlock2d(nn.Module): #Our class extends nn.Module, all PyTorch layers must extend this
    """One level of a U-Net that down-samples with a learnable stride-2 convolution.

    The block runs `in_model` at the current resolution, optionally runs a
    `bottleneck` (stride-2 conv -> sub-network -> transposed-conv up-sampling)
    on that result, concatenates both along the channel axis and feeds the
    result through `out_model`.

    NOTE(review): this definition reuses the name of the max-pooling variant
    defined earlier in the notebook; models built after this cell runs use the
    strided-convolution version.
    """
    def __init__(self, in_channels, mid_channels, out_channels=None, layers=1, sub_network=None, filter_size=3):
        """
        in_channels: the number of channels in the input to this block
        mid_channels: the number of channels to have as the output for each convolutional filter
        out_channels: if not `None`, ends the network with a 1x1 convolution to convert the number of output channels to a specific number.
        layers: how many blocks of hidden layers to create on both the input and output side of a U-Net block
        sub_network: the network to apply after shrinking the input by a factor of 2 using a stride-2 convolution. The number of output channels should be equal to `mid_channels`
        filter_size: how large the convolutional filters should be
        """
        super().__init__()

        #Start preparing the layers used to process the input
        in_layers = [cnn_layer(in_channels, mid_channels, filter_size)]

        #With a sub-network its output is concatenated to ours, doubling the channels fed to the output layers
        inputs_to_outputs = 1 if sub_network is None else 2

        #Preparing the layers used to make the final output, which has extra input channels from any sub-network
        out_layers = [cnn_layer(mid_channels*inputs_to_outputs, mid_channels, filter_size)]

        #Make the additional hidden layers used for the input and output
        for _ in range(layers-1):
            in_layers.append(cnn_layer(mid_channels, mid_channels, filter_size))
            out_layers.append(cnn_layer(mid_channels, mid_channels, filter_size))
        #Use 1x1 Convolutions to ensure a specific output size
        if out_channels is not None:
            out_layers.append(nn.Conv2d(mid_channels, out_channels, 1, padding=0))

        #define our three total sub-networks:
        #1) in_model performs the initial rounds of convolution
        self.in_model = nn.Sequential(*in_layers)
        #2) our subnetwork works on the down-sampled result. The strided convolution and up-scaling are part of the sub-model
        if sub_network is not None:
            self.bottleneck = nn.Sequential(
                nn.Conv2d(mid_channels, mid_channels, kernel_size=2, stride=2, padding=0), # Downsample
                sub_network, #process the smaller resolution
                #expand back up
                nn.ConvTranspose2d(mid_channels, mid_channels, filter_size, padding=filter_size//2, output_padding=1, stride=2)
            )
        else:
            self.bottleneck = None
        #3) the output model that processes the concatenated result, or just the output from in_model if no sub-network was given
        self.out_model = nn.Sequential(*out_layers)

    #The forward function is the code that takes an input and produces an output.
    def forward(self, x):
        #compute the convolutions at the current scale
        full_scale_result = self.in_model(x) #(B, mid_channels, H, W)
        #check if we have a bottleneck to apply
        if self.bottleneck is not None:
            #the bottleneck does both the down-sampling & expansion
            bottle_result = self.bottleneck(full_scale_result)
            #The kernel-2/stride-2 convolution drops the last row/column of odd sizes, so
            #after up-sampling the result can be one row/column short of the full-scale
            #tensor. Zero-pad the bottom/right edge so the concatenation below also works
            #for odd heights/widths.
            pad_h = full_scale_result.shape[-2] - bottle_result.shape[-2]
            pad_w = full_scale_result.shape[-1] - bottle_result.shape[-1]
            if pad_h > 0 or pad_w > 0:
                bottle_result = F.pad(bottle_result, (0, max(pad_w, 0), 0, max(pad_h, 0)))
            #Now shape (B, 2*mid_channels, H, W)
            full_scale_result = torch.cat([full_scale_result, bottle_result], dim=1)
        #compute the output on the concatenated (or not!) result
        return self.out_model(full_scale_result)
#Caption: A class implementing a “Block” of the U-Net approach. Each block needs to know how many channels are coming in and out of the block. The block has three components. 1) the input network, what processes the raw input coming into this block. 2) the bottleneck, which is what the block runs after shrinking the current results down by a factor of 2, and then expands the result back up to the original size. 3) the output network, which is run on the results from the prior two sub-networks concatenated together. 

In [None]:
# Assemble the three-level U-Net from the deepest level outwards. This uses
# whichever UNetBlock2d definition was executed most recently, so the
# strided-convolution cell must have been run before this one.
stride_level3 = UNetBlock2d(64, 128, out_channels=64, layers=2)
stride_level2 = UNetBlock2d(32, 64, out_channels=32, layers=2, sub_network=stride_level3)
stride_level1 = UNetBlock2d(3, 32, layers=2, sub_network=stride_level2)

unet_with_strideconv = nn.Sequential(
    stride_level1,
    # Prediction for _every_ location: one output channel per pixel
    nn.Conv2d(32, 1, (3,3), padding=1),
)

## Training

In [None]:
# Binary segmentation: the models emit raw logits, so the loss applies the
# sigmoid itself.
loss_func = nn.BCEWithLogitsLoss()
epochs = 50

# Hyper-parameters recorded with every MLflow run.
params = dict(
    device=device,
    loss_func=type(loss_func).__name__,
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
def iou(outputs: torch.Tensor, labels: torch.Tensor, smooth=1e-6):
    """Mean intersection-over-union between thresholded predictions and labels.

    outputs: raw logits with the batch on the first axis; they are passed
        through a sigmoid and thresholded at 0.5 to get a 0/1 prediction
    labels: target masks with the same number of elements per sample as
        `outputs`; the formula assumes their values are 0 or 1
    smooth: small constant added to numerator and denominator so that a sample
        with an empty prediction and an empty label scores 1 instead of 0/0

    Returns a scalar tensor: the IoU computed per sample, averaged over the batch.
    """
    predictions = (torch.sigmoid(outputs) > 0.5).float()
    # reshape (rather than view) also accepts non-contiguous tensors.
    predictions = predictions.reshape(predictions.size(0), -1)  # (B, N_pixels)
    targets = labels.reshape(labels.size(0), -1)                # (B, N_pixels)
    intersection = (predictions * targets).sum(dim=1)
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    union = (predictions + targets).sum(dim=1) - intersection
    per_sample_iou = (intersection + smooth) / (union + smooth)
    return per_sample_iou.mean()
score_funcs = {'iou': iou}

In [None]:
# Experiment name -> model; the training loop iterates over this mapping in
# insertion order.
models = dict(
    unet_with_pooling=unet_with_pooling,
    unet_with_strideconv=unet_with_strideconv,
)

In [None]:
# Train every model under its own MLflow run and keep each model's results,
# keyed by experiment name, so the variants can be compared afterwards.
all_results = {}
for experiment, model in models.items():
    optimizer = optim.AdamW(model.parameters())

    params['experiment'] = experiment
    params['optimizer'] = optimizer.defaults
    # Trainable parameters vs. all parameters (including frozen ones).
    params['total_params'] = sum(p.numel() for p in model.parameters() if p.requires_grad)
    params['total_params_all'] = sum(p.numel() for p in model.parameters())

    # `input_size` (the original keyword was misspelled) is what makes
    # torchinfo trace a forward pass and report per-layer output shapes. The
    # shape is taken from a real training sample so the summary matches the
    # data the model is trained on.
    sample_image, _ = train_data[0]
    model_summary = summary(model, input_size=(batch_size, *sample_image.shape))
    # The summary table contains non-ASCII characters; write it as UTF-8
    # explicitly so the result does not depend on the platform default encoding.
    with open('model_summary.txt', 'w', encoding='utf-8') as f:
        f.write(str(model_summary))

    # Name the run after the experiment being trained (the variable, not the
    # literal string 'experiment') so the runs can be told apart in the MLflow UI.
    with mlflow.start_run(nested=True, run_name=experiment):
        mlflow.log_artifact('model_summary.txt')
        mlflow.log_params(params)

        results = train_network(
            model=model,
            optimizer=optimizer,
            loss_func=loss_func,
            train_loader=train_seg_loader,
            valid_loader=test_seg_loader,
            epochs=epochs,
            device=device,
            score_funcs=score_funcs
            # checkpoint_file_save='model.pth',
        )
        all_results[experiment] = results

## Results

- Learnable Downsampling: Strided convolutions introduce learnable parameters into the downsampling process. Instead of simply taking the maximum value (as in max-pooling), the network can learn optimal filters to reduce the spatial dimensions while preserving or even enhancing relevant features for the segmentation task. This allows for a more adaptive and data-driven downsampling.

- Information Preservation: Max-pooling is a destructive operation that discards a significant amount of information by only keeping the maximum value in a receptive field. Strided convolutions, being a convolutional operation, process all values in the receptive field, potentially retaining more fine-grained spatial information that can be crucial for precise segmentation boundaries.

- Fully Convolutional Nature: By replacing max-pooling with strided convolutions, every layer of the U-Net, including the downsampling steps, becomes a learnable convolution. This can lead to more stable training and better performance, because the downsampling itself is optimized through backpropagation; max-pooling, by contrast, has no parameters to learn and passes gradients only to the maximum element of each pooling window.