Before running: first set the runtime type to GPU, then mount Google Drive.
Always choose "Run all", as much of the Python code relies on previous cells.


Data prerequisites: the same as in Chapter 2's notebook. Please refer to that chapter to download the image dataset if you have not done so already.

Firstly, mount Google Drive:

In [1]:
# Colab-only helper module; it is not available outside Google Colab
from google.colab import drive
# mount Google Drive at /content/drive; the dataset paths used by the later
# cells ("./drive/MyDrive/...") are expected to live under this mount point
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# PIL is a widely used python image library
import PIL
from PIL import Image, ImageFile

import torchvision
from torchvision import transforms
from torch.utils import data

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models

# allow PIL to load truncated (incomplete) image files instead of raising an
# error while decoding them
ImageFile.LOAD_TRUNCATED_IMAGES = True

# root folder of the dataset on the mounted Google Drive, with one sub-folder
# per split
workspace_path = "./drive/MyDrive/image_classification_test/"
train_data_path = workspace_path + "/train/"
val_data_path = workspace_path + "/val/"
test_data_path = workspace_path + "/test/"

# sanity check: make sure a known image of the dataset can be opened;
# the context manager closes the underlying file once its size is printed
with Image.open(val_data_path + "/fish/100_1422.JPG") as img:
  print(img.size)

(512, 342)


Run the following cell to set up train_data, val_data and test_data. torchvision handles the image preprocessing here. Search for background on training, validation and test sets if needed.

In [3]:
# this function is very important, otherwise quite a few images won't be opened successfully, causing script's runtime error.
def check_image(path):
  """Return True if PIL can open the file at `path` as an image, else False.

  Used as the `is_valid_file` callback of ImageFolder, so that files which
  cannot be opened are filtered out of the dataset up front.
  """
  try:
    # the context manager closes the file handle right away; without it one
    # open file would be left behind for every image that gets checked
    with Image.open(path):
      return True
  except Exception:
    # any failure to open means "not a usable image"; unlike a bare `except:`
    # this still lets KeyboardInterrupt / SystemExit propagate
    return False

# preprocessing applied to every image before it is fed to the network
img_transforms = transforms.Compose([
    transforms.Resize((64, 64)),  # resize image to a fixed 64x64 size
    transforms.ToTensor(),  # store image data in a tensor
    # the normalization below follows the distribution of the ImageNet dataset
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# one dataset per split; check_image filters out files that PIL cannot open
train_data = torchvision.datasets.ImageFolder(root=train_data_path, transform=img_transforms, is_valid_file=check_image)

val_data = torchvision.datasets.ImageFolder(root=val_data_path, transform=img_transforms, is_valid_file=check_image)

test_data = torchvision.datasets.ImageFolder(root=test_data_path, transform=img_transforms, is_valid_file=check_image)

# load data in batches
batch_size = 64
# shuffle the training set on every epoch: DataLoader does not shuffle by
# default, so the batches would otherwise follow the dataset's stored order
# (NOTE(review): ImageFolder presumably lists images class folder by class
# folder, which would make most batches single-class - confirm)
train_data_loader = data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
# evaluation does not depend on the sample order, so no shuffling is needed
val_data_loader = data.DataLoader(val_data, batch_size=batch_size)
test_data_loader = data.DataLoader(test_data, batch_size=batch_size)
# check how many images get loaded
print(len(train_data_loader.dataset))
print(len(val_data_loader.dataset))
print(len(test_data_loader.dataset))

803
110
160


Now we are going to define a new network: a convolutional neural network (a simplified version of AlexNet; hold on for a while if you haven't heard of AlexNet).

In [4]:
class cnn_net(nn.Module):
    """Simplified AlexNet-style convolutional network for image classification.

    The network is made of three stages that `forward` applies in order:
    `features` (convolution / ReLU / max-pooling stack), `avg_pool` (adaptive
    average pooling to a fixed 6x6 grid) and `classifier` (fully connected
    layers with dropout).

    Args:
        num_classes: number of target categories, i.e. the size of the output
            of the last linear layer.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        in_channels = 3  # RGB input
        feature_channels = 256  # channels produced by the last convolution
        pooled_size = (6, 6)  # spatial size produced by the adaptive pooling

        # Convolutional stack. Note that each layer's input channel count has
        # to equal the previous layer's output channel count.
        # Search for Conv2d's parameters (stride, padding) and for MaxPool2d
        # for more info; see https://stackoverflow.com/a/49818579 for how
        # Conv2d weights are initialized by default.
        conv_layers = [
            nn.Conv2d(in_channels, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, feature_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        # nn.Sequential chains the layers into one block that can be called
        # like a single layer
        self.features = nn.Sequential(*conv_layers)

        # search for AdaptiveAvgPool2d for more info if needed
        self.avg_pool = nn.AdaptiveAvgPool2d(pooled_size)

        # number of values per sample once the pooled feature map is flattened
        flat_features = feature_channels * pooled_size[0] * pooled_size[1]
        # Fully connected head. Dropout (ratio 0.5 by default) is an important
        # method for model generalisation; note that it behaves differently in
        # training mode and in eval mode.
        dense_layers = [
            nn.Dropout(),
            nn.Linear(flat_features, 4096),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            # the final output layer has to match num_classes
            nn.Linear(4096, num_classes),
        ]
        self.classifier = nn.Sequential(*dense_layers)

    def forward(self, x):
        """Apply features, pooling and classifier to `x` and return the scores."""
        pooled = self.avg_pool(self.features(x))
        # flatten everything except the first (batch) dimension
        flattened = torch.flatten(pooled, 1)
        return self.classifier(flattened)

The following cells are very similar to Chapter 2: defining the loss function, training the model and running a prediction.

In [5]:
# use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# copy the model to the device first, so that the optimizer below is built
# from the parameters as they live on that device
my_model = cnn_net()
my_model.to(device)

# search for Adam optimizer for more info if needed
# note learning rate 3e-4 is a widely quoted default for Adam; an even smaller
# one (1e-4) is used here, as it has a better chance to learn with a small dataset
# a learning rate that is too large tends to make training unstable, while one
# that is too small makes the network learn very slowly
optimizer = optim.Adam(my_model.parameters(), lr = 0.0001)

def loss_update(model, batch, loss_fn, device, check_result):
    """Run one forward pass over `batch` and compute its loss.

    Args:
        model: callable mapping the input tensor to per-class scores.
        batch: pair `(inputs, targets)` of tensors; both are moved to `device`.
        loss_fn: callable invoked as `loss_fn(output, targets)`.
        device: device the batch is moved to before the forward pass.
        check_result: when truthy, also count the correct predictions.

    Returns:
        Tuple `(loss, num_current_correct)`. `loss` is whatever `loss_fn`
        returned; `num_current_correct` is the number of samples whose
        highest-scoring class equals the target, or 0 when `check_result`
        is falsy.
    """
    inputs, targets = batch
    inputs = inputs.to(device)
    targets = targets.to(device)
    output = model(inputs)
    loss = loss_fn(output, targets)
    num_current_correct = 0
    if check_result:
        # softmax keeps the ordering of the scores, so the predicted class is
        # simply the arg-max of the raw model output along the class dimension
        predicted = output.argmax(dim=1)
        num_current_correct = (predicted == targets).sum().item()
    return loss, num_current_correct

def train(model, optimizer, loss_fn, train_data_loader, val_data_loader, epochs, device):
  """Train `model` for `epochs` epochs and print the metrics of every epoch.

  Each epoch does one optimization pass over `train_data_loader`, followed by
  one evaluation pass over `val_data_loader`. The printed losses are averages
  per sample (each batch loss is weighted by its batch size); the accuracy is
  the fraction of correctly classified validation samples.

  Args:
    model: network to train; it is updated in place.
    optimizer: optimizer stepping the parameters of `model`.
    loss_fn: loss callable, forwarded to `loss_update`.
    train_data_loader: loader yielding the training batches.
    val_data_loader: loader yielding the validation batches.
    epochs: number of passes over the training data.
    device: device on which the batches are processed.
  """
  for epoch in range(epochs):
    training_loss = 0.0
    valid_loss = 0.0

    # training process, with the model in training mode
    model.train()
    for batch in train_data_loader:
      optimizer.zero_grad()
      # no accuracy is needed while training, hence check_result=False
      loss, _ = loss_update(model, batch, loss_fn, device, False)
      loss.backward()
      optimizer.step()
      training_loss += loss.item() * batch[0].shape[0]
    training_loss /= len(train_data_loader.dataset)

    # validation process, with the model in evaluation mode
    model.eval()
    num_correct = 0
    num_examples = 0
    # no gradients are needed for validation: skipping the autograd
    # bookkeeping saves memory and time without changing the results
    with torch.no_grad():
      for batch in val_data_loader:
        loss, num_current_correct = loss_update(model, batch, loss_fn, device, True)
        valid_loss += loss.item() * batch[0].shape[0]
        num_correct += num_current_correct
        num_examples += batch[0].shape[0]
    valid_loss /= len(val_data_loader.dataset)

    print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, accuracy = {:.2f}'.format(epoch, training_loss, valid_loss, num_correct / num_examples))

# modify the number of epochs (the 100 below) to check how loss and accuracy change with more training
# note this model needs a few more epochs to learn
# cross entropy is used as the loss function for this classification task
train(my_model, optimizer, torch.nn.CrossEntropyLoss(), train_data_loader, val_data_loader, 100, device)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: 0, Training Loss: 4.80, Validation Loss: 0.65, accuracy = 0.79
Epoch: 1, Training Loss: 0.68, Validation Loss: 0.69, accuracy = 0.79
Epoch: 2, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 3, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 4, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 5, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 6, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 7, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 8, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 9, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 10, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 11, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 12, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 13, Training Loss: 0.69, Validation Loss: 0.69, accuracy = 0.79
Epoch: 14, Train

In [6]:
# class names indexed by the class id predicted by the model
# NOTE(review): presumably the same order as train_data.classes - confirm
labels = ['cat', 'fish']
# force 3 channels: the Normalize step of img_transforms expects RGB input,
# which a grayscale or RGBA file would not provide
img = Image.open(test_data_path + "/fish/wilderness_beach.jpg").convert("RGB")
img = img_transforms(img).to(device)
# add a leading batch dimension, as the model works on batches of images
img = torch.unsqueeze(img, 0)
# evaluation mode, so that Dropout is disabled for the prediction
my_model.eval()
# inference only: no gradients need to be tracked
with torch.no_grad():
  prediction = F.softmax(my_model(img), dim = 1)
print("prediction = ")
print(prediction)
# index of the most probable class, as a plain Python int
prediction = prediction.argmax().item()
print("the prediction result is " + labels[prediction])

prediction = 
tensor([[9.9365e-24, 1.0000e+00]], grad_fn=<SoftmaxBackward>)
the prediction result is fish


Saving the model.

In [7]:
# note the path needs to have a suffix like ".pth",
# otherwise it may encounter a file writing failure
# NOTE(review): the failure without a suffix is the original author's
# observation and is not demonstrated in this notebook
model_path = workspace_path + "/cnn_net_dict.pth"
# note saving the state dict is usually a better way to save a model,
# search for more info if needed
# (only what my_model.state_dict() returns is written to model_path here,
# not the model object itself)
torch.save(my_model.state_dict(), model_path)

Use the following code to get a few famous CNN models and print their structures. Searching for more information about each model's design and contribution (or even reading its paper) could be interesting. Visit https://pytorch.org/vision/stable/models.html to check the stable models available from torchvision.

In [8]:
# AlexNet
# pretrained=True requests the pretrained weights, which get downloaded on
# first use (see the download log in this cell's output)
alexnet = models.alexnet(num_classes=1000, pretrained=True)
# print model structure; compare it with the cnn_net class defined earlier
print("alexnet's structure")
print(alexnet)

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth


HBox(children=(FloatProgress(value=0.0, max=244408911.0), HTML(value='')))


alexnet's structure
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_featur

In [9]:
# Inception
# https://pytorch.org/hub/pytorch_vision_inception_v3/
# pretrained=True requests the pretrained weights, which get downloaded on
# first use (see the download log in this cell's output)
inception_v3 = models.inception_v3(pretrained=True)
# print model structure
print("inception_v3's structure")
print(inception_v3)

Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth


HBox(children=(FloatProgress(value=0.0, max=108949747.0), HTML(value='')))


inception_v3's structure
Inception3(
  (Conv2d_1a_3x3): BasicConv2d(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2a_3x3): BasicConv2d(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_2b_3x3): BasicConv2d(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (Conv2d_3b_1x1): BasicConv2d(
    (conv): Conv2d(64, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(80, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (Conv2d_4a_3x3): BasicConv2d(
    (conv): Conv2d(80, 192

In [10]:
# VGG
# no `pretrained` argument is passed here, unlike the AlexNet and Inception
# cells; that is enough for printing the structure
vgg16 = models.vgg16()
print("vgg16's structure")
print(vgg16)

vgg16's structure
VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, pad

In [11]:
# ResNet
# torch.hub is a newer way to get open source models
# However, it can be tiring to download a large model from the hub to Colab,
# as can be seen in issue https://github.com/pytorch/vision/issues/4156

# https://pytorch.org/hub/pytorch_vision_resnet/
# the model definition is loaded from the 'pytorch/vision' GitHub repository;
# no `pretrained` argument is passed in this call
resnet50 = torch.hub.load('pytorch/vision', 'resnet50')
print("resnet50's structure")
# search for and learn BatchNorm's role in this structure
print(resnet50)

Downloading: "https://github.com/pytorch/vision/archive/master.zip" to /root/.cache/torch/hub/master.zip


resnet50's structure
ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_siz