In [7]:
# Student name: Poulomi Ganguly
# Student ID: 1598887

import os
import time

import torch.nn.functional as F
import torch
from torch import nn
from torchvision import models
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

#Edit root_dir here to point where the TextureImagesDataset folder 
root_dir = '/content/drive/My Drive/CMPUT328/Semantic Segmentation'

class TextureImages(object):
    def __init__(self, subset='train', batch_size=64, shuffle=True):
        if subset == 'train':
            images = np.load(os.path.join(root_dir, 'TextureImagesDataset',
                                          'train_images.npy'))
            masks = np.load(os.path.join(root_dir, 'TextureImagesDataset',
                                         'train_masks.npy'))
        elif subset == 'test':
            images = np.load(os.path.join(root_dir, 'TextureImagesDataset',
                                          'test_images.npy'))
            masks = np.load(os.path.join(root_dir, 'TextureImagesDataset',
                                         'test_masks.npy'))
        else:
            raise NotImplementedError
        self._images = images
        self.images = self._images
        self._masks = masks
        self.masks = self._masks
        self.batch_size = batch_size
        self.num_samples = len(self.images)
        self.shuffle = shuffle
        if self.shuffle:
            self.shuffle_samples()
        self.next_batch_pointer = 0

    def shuffle_samples(self):
        image_indices = np.random.permutation(np.arange(self.num_samples))
        self.images = self._images[image_indices]
        self.masks = self._masks[image_indices]

    def get_next_batch(self):
        num_samples_left = self.num_samples - self.next_batch_pointer
        if num_samples_left >= self.batch_size:
            x_batch = self.images[self.next_batch_pointer:self.next_batch_pointer + self.batch_size]
            y_batch = self.masks[self.next_batch_pointer:self.next_batch_pointer + self.batch_size]
            self.next_batch_pointer += self.batch_size
        else:
            x_partial_batch_1 = self.images[self.next_batch_pointer:self.num_samples]
            y_partial_batch_1 = self.masks[self.next_batch_pointer:self.num_samples]
            if self.shuffle:
                self.shuffle_samples()
            x_partial_batch_2 = self.images[0:self.batch_size - num_samples_left]
            y_partial_batch_2 = self.masks[0:self.batch_size - num_samples_left]
            x_batch = np.vstack((x_partial_batch_1, x_partial_batch_2))
            y_batch = np.vstack((y_partial_batch_1, y_partial_batch_2))
            self.next_batch_pointer = self.batch_size - num_samples_left
        return x_batch, y_batch

class CrossEntropyLoss2d(nn.Module):
    def __init__(self, weight=None, size_average=True, ignore_index=255):
        super(CrossEntropyLoss2d, self).__init__()
        self.nll_loss = nn.NLLLoss(weight, size_average, ignore_index)

    def forward(self, inputs, targets):
        return self.nll_loss(F.log_softmax(inputs, dim=1), targets)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def SemSeg(input_size, num_classes=5):
    # TODO: Implement Semantic Segmentation network here
    # Returned logits must be a tensor of size:
    # (<batch_size>, image_height, image_width, num_classes + 1)
    # 1st dimension is batch dimension
    # image_height and image_width are the height and width of input_tensor
    # last dimension is the softmax dimension. There are 4 texture classes plus 1 background class
    # therefore last dimension will be 5
    # ----------------------------------------------------------------------------------------------
    
    # Declaring a class for convolution between any two specified channels
    # Accomodates down convolutions required for this architecture 

    # Initialization of a ConvLayer has the following input parameters:
    # * in_ch: Channels in the layer that is going through convolution
    # * out_ch: Desired channels after convolution

    # Explanation of working: Upon initializing ConvLayer, nn.Sequential
    # carries out 2d convolution based on specified input and output 
    # channels with a filter of size 3x3, padding of size 1 and a default
    # stride of size 1
    # The use of BatchNorm2d ensured a more stable training phase with
    # consistent drop in loss and less variations in time taken to train
    # each batch
    # Next, ReLU is taken in place to finish the convolution layer. 

    # The forward function of the convolution layer returns the result of the
    # prior operations based on given input.

    # Note: Padding of 1 helps with double convolutions on an image of size
    # 196 x 196 dimension. The stride size of 1 limits the padding amount to
    # 0 and 1. After testing with both, padding = 1 ensures zero data loss at
    # maxpool layer.
    class ConvLayer(nn.Module):
      def __init__(self, in_ch, out_ch):
          super(ConvLayer, self).__init__()
          self.conv = nn.Sequential(
              nn.Conv2d(in_ch, out_ch, kernel_size=3, padding = 1),
              nn.BatchNorm2d(out_ch),
              nn.ReLU(inplace=True),
          )
      def forward(self, input):
          return self.conv(input)


    # This class is loosely based on the UNet architechture for semantic
    # segmentation, however it does not involve the cropping of previous down
    # convolution layers as well as has less down and upsampling layers in general
    # to accomodate the smaller dimensions compared to that of the original paper.
    # i.e. 3 x 196 x 196 rather than 1 x 572 x 572

    # Explanation of Network + changes made:

    # https://miro.medium.com/max/2824/1*f7YOaE4TWubwaFF7Z1fzNw.png
    # According to the above figure, UNet relies on a series of down convolutions
    # followed by up convolutions to make a fully convolutional network model 
    # which achieves semantic segmentation due to an hourglass shaped stack of layers.

    # This implies that the input image is downsampled several times through a sequence 
    # of double down convolutions and maxpool functions and then upsampled through the
    # opposite of the same process (transposed convolution) in order to restore its original
    # dimension

    # Initialization: 
    # The initialization for this network defines all the required down convolution, maxpool
    # Transpose convolution and up convolution layers. These are called in the forward function
    # This initialization takes in the following parameters.
    # * in_ch: Channels in the layer that is going through convolution
    # * out_ch: Desired no. of channels after convolution


    class UNet(nn.Module):
      def __init__(self, in_ch, out_ch):
        super(UNet, self).__init__()

        self.Conv_Down_1 = ConvLayer(in_ch, 32)
        self.Conv_Down_2 = ConvLayer(32, 32)
        self.max_pool = nn.MaxPool2d(kernel_size = 2, stride = 2)

        self.Conv_Down_3 = ConvLayer(32, 64)
        self.Conv_Down_4 = ConvLayer(64, 64)

        self.Conv_Down_5 = ConvLayer(64, 128)

        self.TransConv_Up_1 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.Conv_Up_1 = ConvLayer(128, 64)

        self.TransConv_Up_2 = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.Conv_Up_2 = ConvLayer(64, 32)

        self.Last_Conv_Up = nn.Conv2d(32, out_ch, 1)
        
      def forward(self, x):
        
        # ------ Downsampling ----------
        # Starting off with a batch of 3 x 196 x 196 images, the first down
        # convolution results in c1 becoming a layer with 32 channels and
        # have height and width remain the same
        # Thus, size of c1 is 32 x 196 x 196 (D x W x H)

        # Next, another convolution takes place which retains this dimension
        # Thus, size of c2 is also 32 x 196 x 196 (D x W x H). 
        # This double convolution resulted in further loss in dimension in the original
        # paper, however the parameters of the convolution has allowed for
        # the size to stay same. 

        # This layer is then maxpooled and the input dimensions are effectively halved.
        # This results in lesser parameters for computation for further downsampling.
        # However, note that the number of channels from before will stay the same.
        # Size of p1: 32 x 98 x 98

        c1 = self.Conv_Down_1(x)
        c2 = self.Conv_Down_2(c1)
        p1 = self.max_pool(c2)

        # Another set of double down convolution is performed. However, output 
        # channels are doubled. 32 -> 64 channels.
        # The height and width of the result of these down convolution stays same
        # as before
        # Thus, the new sizes as result of down convolutions is:
        # Size of c3 = size of c4 = 64 x 98 x 98
        # Maxpooling these layers further reduces height and width down to 
        # 49 each. 
        # Size of p2 = 64 x 49 x 49
        
        c3 = self.Conv_Down_3(p1)
        c4 = self.Conv_Down_4(c3)
        p2 = self.max_pool(c4)

        # At this point in the paper, an additional two sets of double convolutions
        # + maxpooling is performed to obtain a total of 1024 channels. However
        # this contributes significantly to training time and thus I have stopped at
        # 128 channels before moving on to upsampling. I have not gone through
        # with a double convolution of the last layer (unlike original paper) to cut 
        # down on training time however, this is a relatively miniscule difference. 

        # This results in c5 having a size of 128 x 49 x 49

        c5 = self.Conv_Down_5(p2)

        # ------ Upsampling ----------

        # Upsampling involves upwards transposed convolutions as well as upwards double
        # convolutions. Unlike the original paper, I have used single upwards convolutions
        # rather than double and have still achieved high accuracy. I assume that larger
        # resolutions would be helped by these layers of double convolutions in order
        # to achieve segmentation closer to ground truth

        # Starting off by a transposed convolution layer on c5 which was of size
        # 128 x 49 x 49, we get up1 which has the half the number of channels and
        # double the height and width. This required some experimentation to get an 
        # exact double. Alternatively, UpSampling2D can also perform the same operation
        
        # Size of up1 64 x 98 x 98

        # https://towardsdatascience.com/understanding-semantic-segmentation-with-unet-6be4f42d4b47
        # "To get better precise locations, at every step of the decoder we use skip connections by 
        # concatenating the output of the transposed convolution layers with the feature maps from 
        # the Encoder at the same level:"

        # Size of c6 64 x 98 x 98 (dimensions are preserved)

        up1 = self.TransConv_Up_1(c5)
        c6 = self.Conv_Up_1(torch.cat([up1,c4],dim=1))

        # Another set of transposed convolution + concatanation with feature map
        # from downsampled layer at same level is carried out. This results in 
        # number of channels halving again and the input height and width being doubled to 196
        # This step has thus restored original dimension.

        # Size of up2 = size of c7 = 32 x 196 x 196
        up2 = self.TransConv_Up_2(c6)
        c7 = self.Conv_Up_2(torch.cat([up2,c2],dim=1))

        # Unlike the papers last double convolution, I have carried out one last
        # up convolution to filter the number of channels down to required number
        # of output channels. Alternatively, this is also the number classes we
        # need the returned logit to have (5)

        # Size of c8 is num_classes x 196 x 196
        c8 = self.Last_Conv_Up(c7)        

        # Return softmax of last layer to normalize output to something resembling 
        # a probability distribution (Thus lies between 0 and 1). Alternatively, a
        # sigmoid function may also be used.
        return nn.Softmax()(c8)
    
    # Initializing the UNet network with 3 input channels and number of classes
    # expected at the end after softmax layer. 
    model = UNet(3, num_classes)
    return model




In [10]:
def run():
    # You can tune the hyperparameters here.
    n_epochs = 2 # 2 epochs are enough to bring accuracy to 95%
    batch_size = 16 # Batch size determines number of images being passed in to network
    learning_rate = 0.001 # L2 regularization
    weight_decay = 0.001 
    use_cuda = 1

    load_weights = 0
    wts_fname = 'model.pt'

    input_size = (196, 196)
    n_batches = int(2000 / batch_size)
    wts_path = os.path.join(root_dir, wts_fname)

    if use_cuda and torch.cuda.is_available():
        device = torch.device("cuda")
        print('Training on GPU: {}'.format(torch.cuda.get_device_name(0)))
    else:
        device = torch.device("cpu")
        print('Training on CPU')

    train_set = TextureImages('train', batch_size=batch_size)
    test_set = TextureImages('test', shuffle=False)

    model = SemSeg(input_size).to(device)

    def evaluation(images, true_labels):
        eval_batch_size = 100
        predicted_labels = []
        model.eval()
        with torch.no_grad():
            for start_index in range(0, len(images), eval_batch_size):
                end_index = start_index + eval_batch_size
                batch_x = images[start_index: end_index]
                # batch_x = np.reshape(batch_x, (batch_x.shape[0], 3, 196, 196))
                batch_x = torch.FloatTensor(batch_x).permute((0, 3, 1, 2)).to(device)
                print(batch_x.size)
                batch_predicted_logits = model(batch_x)
                batch_predicted_labels = torch.argmax(batch_predicted_logits, axis=1)
                batch_predicted_labels = batch_predicted_labels.cpu().numpy()
                predicted_labels += list(batch_predicted_labels)
        predicted_labels = np.vstack(predicted_labels).flatten()
        true_labels = true_labels.flatten()
        accuracy = float((predicted_labels == true_labels).astype(np.int32).sum()) / true_labels.size
        return predicted_labels, accuracy

    if load_weights:      
        print('Loading weights from: {}'.format(wts_path))
        chkpt = torch.load(wts_path, map_location=device)  # load checkpoint
        model.load_state_dict(chkpt['model'])
    else:
        criterion = CrossEntropyLoss2d().to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        print("Training...")
        mean_loss = 0
        steps = 0
        losses = []
        max_accuracy = 0
        max_accuracy_id = 0
        for epch_id in range(n_epochs):
            model.train()
            for batch_id in range(n_batches):
                start_t = time.time()

                batch_x, batch_y = train_set.get_next_batch()

                batch_x = torch.FloatTensor(batch_x).permute((0, 3, 1, 2)).to(device)
                batch_y = torch.LongTensor(batch_y).squeeze().to(device)

                optimizer.zero_grad()

                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

                end_t = time.time()

                _loss = loss.item()
                steps += 1
                mean_loss += (_loss - mean_loss) / steps
                losses.append(_loss)

                time_taken = end_t - start_t

                print('batch: {} / {} loss: {} mean_loss: {} time_taken: {}'.format(
                    batch_id, n_batches, _loss, mean_loss, time_taken))

            _, test_accuracy = evaluation(test_set._images, test_set._masks)
            if test_accuracy > max_accuracy:
                max_accuracy = test_accuracy
                max_accuracy_id = epch_id
                chkpt = {
                    'model': model.state_dict(),
                }
                torch.save(chkpt, '{}.{}'.format(wts_path, max_accuracy_id))
            print("epch {} / {}: Test Pixel Accuracy = {:.3f} max_accuracy = {:.3f} in epoch {}".format(
                epch_id + 1, n_epochs, test_accuracy, max_accuracy, max_accuracy_id + 1))
        print("Done training. Weights saved to: {}".format(wts_fname))
        chkpt = {
            'model': model.state_dict(),
        }
        torch.save(chkpt, wts_path)

    print('Evaluating on test set')
    _, test_accuracy = evaluation(test_set._images, test_set._masks)
    print("Test Pixel Accuracy = {:.3f}".format(test_accuracy))
    return test_accuracy


if __name__ == '__main__':
    run()


Training on CPU




Training...




batch: 0 / 125 loss: 1.602805495262146 mean_loss: 1.602805495262146 time_taken: 8.53781008720398
batch: 1 / 125 loss: 1.5546015501022339 mean_loss: 1.57870352268219 time_taken: 7.88772988319397
batch: 2 / 125 loss: 1.5108996629714966 mean_loss: 1.5561022361119587 time_taken: 7.96498441696167
batch: 3 / 125 loss: 1.4861730337142944 mean_loss: 1.5386199355125427 time_taken: 7.902710914611816
batch: 4 / 125 loss: 1.4675447940826416 mean_loss: 1.5244049072265624 time_taken: 7.696413516998291
batch: 5 / 125 loss: 1.4378799200057983 mean_loss: 1.5099840760231018 time_taken: 7.649884939193726
batch: 6 / 125 loss: 1.4389235973358154 mean_loss: 1.4998325790677751 time_taken: 7.911668539047241
batch: 7 / 125 loss: 1.4165600538253784 mean_loss: 1.4894235134124756 time_taken: 7.729214906692505
batch: 8 / 125 loss: 1.394759178161621 mean_loss: 1.4789052539401584 time_taken: 7.639113903045654
batch: 9 / 125 loss: 1.3877413272857666 mean_loss: 1.4697888612747192 time_taken: 7.713183403015137
batch: 1

#Possible Output 
Training on GPU: Tesla K80

Training...

epch 1 / 25: Test Pixel Accuracy = 0.865 max_accuracy = 0.865 in epoch 1

epch 2 / 25: Test Pixel Accuracy = 0.945 max_accuracy = 0.945 in epoch 2

epch 3 / 25: Test Pixel Accuracy = 0.938 max_accuracy = 0.945 in epoch 2

epch 4 / 25: Test Pixel Accuracy = 0.975 max_accuracy = 0.975 in epoch 4

epch 5 / 25: Test Pixel Accuracy = 0.981 max_accuracy = 0.981 in epoch 5

epch 6 / 25: Test Pixel Accuracy = 0.982 max_accuracy = 0.982 in epoch 6

epch 7 / 25: Test Pixel Accuracy = 0.583 max_accuracy = 0.982 in epoch 6

epch 8 / 25: Test Pixel Accuracy = 0.959 max_accuracy = 0.982 in epoch 6

epch 9 / 25: Test Pixel Accuracy = 0.762 max_accuracy = 0.982 in epoch 6

epch 10 / 25: Test Pixel Accuracy = 0.864 max_accuracy = 0.982 in epoch 6

epch 11 / 25: Test Pixel Accuracy = 0.941 max_accuracy = 0.982 in epoch 6

epch 12 / 25: Test Pixel Accuracy = 0.963 max_accuracy = 0.982 in epoch 6

epch 13 / 25: Test Pixel Accuracy = 0.954 max_accuracy = 0.982 in epoch 6

epch 14 / 25: Test Pixel Accuracy = 0.821 max_accuracy = 0.982 in epoch 6

epch 15 / 25: Test Pixel Accuracy = 0.846 max_accuracy = 0.982 in epoch 6

epch 16 / 25: Test Pixel Accuracy = 0.967 max_accuracy = 0.982 in epoch 6

epch 17 / 25: Test Pixel Accuracy = 0.945 max_accuracy = 0.982 in epoch 6

epch 18 / 25: Test Pixel Accuracy = 0.971 max_accuracy = 0.982 in epoch 6

epch 19 / 25: Test Pixel Accuracy = 0.985 max_accuracy = 0.985 in epoch 19

epch 20 / 25: Test Pixel Accuracy = 0.980 max_accuracy = 0.985 in epoch 19

epch 21 / 25: Test Pixel Accuracy = 0.986 max_accuracy = 0.986 in epoch 21

epch 22 / 25: Test Pixel Accuracy = 0.988 max_accuracy = 0.988 in epoch 22

epch 23 / 25: Test Pixel Accuracy = 0.989 max_accuracy = 0.989 in epoch 23

epch 24 / 25: Test Pixel Accuracy = 0.982 max_accuracy = 0.989 in epoch 23

epch 25 / 25: Test Pixel Accuracy = 0.987 max_accuracy = 0.989 in epoch 23

Done training. Weights saved to: model.pt

Evaluating on test set

Test Pixel Accuracy = 0.987