# Hand Sign Recognition with CNN

Using a Convolutional Neural Network for recognizing hand signs.

_PyTorch implementation of the assignment of Course 4 of Coursera's Deep Learning Specialization_.

## Imports

In [None]:
%matplotlib inline

import os
import random

import h5py
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.utils as vutils

from torch.utils.data import DataLoader, Dataset

## Reproducibility

In [None]:
# Seed Python's `random` module and PyTorch's random number generator with the same fixed value
# NOTE(review): NumPy's global RNG is not seeded in this cell — confirm nothing relies on it
seed = 24
random.seed(seed)
torch.manual_seed(seed);

## Dataset

We will create a custom dataset that can load the images from the given `.h5` files. The files contain:

- 1080 training examples
- 120 testing examples
- each image is of size: (64, 64, 3)
- there are 6 different signs (numbers 0 to 5)

In [None]:
class HandSignsDataset(Dataset):
    """Hand-sign images and labels read from ``data/train_signs.h5`` or ``data/test_signs.h5``.

    Both arrays are copied into memory when the dataset is constructed, so the
    HDF5 file is only needed during ``__init__``.

    Args:
        train: If truthy, load the ``train`` file and keys; otherwise the ``test`` ones.
        transform: Optional callable applied to an image each time it is fetched.
    """

    def __init__(self, train, transform=None):
        prefix = 'train' if train else 'test'

        # The context manager closes the file handle as soon as the arrays have
        # been copied out, instead of leaving it open for the garbage collector.
        with h5py.File(f'data/{prefix}_signs.h5', 'r') as dataset:
            self.X = np.array(dataset[f'{prefix}_set_x'][:])
            self.y = np.array(dataset[f'{prefix}_set_y'][:])

        self.transform = transform

    def __len__(self):
        """Return the number of examples (size of the first axis of ``X``)."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at ``idx``.

        A tensor index is converted to plain Python integers first. The image
        is passed through ``transform`` when one was supplied.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        X = self.X[idx]
        if self.transform is not None:
            X = self.transform(X)
        y = self.y[idx]

        return X, y

We can now create training and test sets. We use `ToTensor()` to convert the images to a PyTorch tensor with shape channels x height x width, and with pixel values rescaled from [0, 255] to [0, 1].

In [None]:
# Both loaders convert images with `ToTensor()`, serve shuffled batches of 64 and use 4 worker processes
training = DataLoader(
    HandSignsDataset(train=True, transform=transforms.ToTensor()),
    batch_size=64,
    shuffle=True,
    num_workers=4,
)
testing = DataLoader(
    HandSignsDataset(train=False, transform=transforms.ToTensor()),
    batch_size=64,
    shuffle=True,
    num_workers=4,
)

Let's look at some examples from the training set.

In [None]:
# Pull a single batch from the training loader; element 0 holds the images
batch = next(iter(training))
images = batch[0]

# Tile the batch into one image grid with a 2-pixel gap between tiles
grid = vutils.make_grid(images, padding=2, normalize=True)

plt.figure(figsize=(10, 10))
plt.axis('off')
plt.title('Training Images')
# matplotlib expects (height, width, channels), so the channel axis is moved to the end
plt.imshow(np.transpose(grid, (1, 2, 0)));

## Model

We'll use a Convolutional Neural Network as follows:

- Hidden Layer 1: Conv -> ReLU
  - in_channels=3 (RGB)
  - filters=8
  - filter_size=4
  - stride=1
  - padding=same
- Hidden Layer 2: MaxPool
  - filter_size=8
  - stride=8
  - padding=same
- Hidden Layer 3: Conv -> ReLU
  - in_channels=8
  - filters=16
  - filter_size=2
  - stride=1
  - padding=same
- Hidden Layer 4: MaxPool
  - filter_size=4
  - stride=4
  - padding=same
- Flatten Layer: Convert volume of size 16 x 63 x 63 (due to the "floor" operation the original size gets reduced to 63 even with _same_ padding) to vectors of size 63,504
- Output Layer: Linear -> Softmax with 6 units

To compute the padding needed for _same_ convolutions we can use the following formula (derived from the formula for the output height/width):

$$p = \frac{(h-1) \cdot s - h + f}{2}$$

where $h$ is the input height (the same formula applies to the width), $s$ is the stride and $f$ is the filter size.

_See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html#torch.nn.Conv2d_

In [None]:
class Model(nn.Module):
    """Two-stage convolutional classifier producing 6 raw class scores per image.

    The stack is Conv -> ReLU -> pad -> MaxPool, repeated twice, followed by a
    flatten and a single linear layer. The linear layer expects 16 * 63 * 63
    flattened features. No softmax is applied; the scores are meant to be fed
    to a loss that applies it.
    """

    def __init__(self):
        super().__init__()

        # Layers are listed in network order, so their positions inside
        # `self.net` match the order in which they are applied.
        # NOTE(review): the explicit zero-padding before each pooling layer is
        # very large compared with the feature map — confirm this is the
        # intended reading of "same" padding for strided pooling.
        layers = [
            # Hidden Layer 1: 3 -> 8 channels, 4x4 kernel
            nn.Conv2d(3, 8, kernel_size=4, stride=1, padding=1),
            nn.ReLU(inplace=True),
            # Hidden Layer 2: pad with a separate layer, then pool without padding,
            # because `MaxPool2d` rejects a padding this large for an 8x8 kernel
            nn.ZeroPad2d(224),
            nn.MaxPool2d(kernel_size=8, stride=8, padding=0),
            # Hidden Layer 3: 8 -> 16 channels, 2x2 kernel
            nn.Conv2d(8, 16, kernel_size=2, stride=1, padding=0),
            nn.ReLU(inplace=True),
            # Hidden Layer 4: same pad-then-pool arrangement with a 4x4 kernel
            nn.ZeroPad2d(96),
            nn.MaxPool2d(kernel_size=4, stride=4, padding=0),
            nn.Flatten(),
            # Output layer: softmax is left out here and handled by the loss function
            nn.Linear(16 * 63 * 63, 6),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, input):
        """Run ``input`` through the sequential stack and return the class scores."""
        return self.net(input)

In [None]:
# Instantiate the network and print its layer-by-layer summary
model = Model()
print(model)

## Optimization

We'll use Adam for training the model with the cross-entropy loss.

In [None]:
# Cross-entropy loss, applied to the raw scores returned by the model
loss_fn = nn.CrossEntropyLoss()
# Adam over all of the model's parameters with a learning rate of 0.009
optimizer = optim.Adam(model.parameters(), lr=0.009)

## Training

Time to train the model!

In [None]:
# Training configuration
# Use the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Number of full passes over the training loader
epochs = 100

In [None]:
# Main training loop
model = model.to(device)
total_losses = []
# The average loss is recorded and printed once every `log_interval` epochs
log_interval = 5

for epoch in range(epochs):
    losses = []

    for samples, targets in training:
        optimizer.zero_grad()
        samples = samples.to(device)
        targets = targets.to(device)

        output = model(samples)
        loss = loss_fn(output, targets)

        loss.backward()
        optimizer.step()

        # Detach the loss to avoid saving any more computations on it
        losses.append(loss.detach().cpu().numpy())

    if epoch % log_interval == 0:
        total_losses.append(np.mean(losses))
        print(f'[{epoch}/{epochs}]\tLoss: {total_losses[-1]}')

# Save the model; create the target directory first so saving does not fail when it is missing
os.makedirs('model', exist_ok=True)
torch.save(model.state_dict(), 'model/model.pt')

# Plot each recorded average loss against the epoch it was recorded at
# (epochs 0, log_interval, 2 * log_interval, ...), so the x-axis shows real epoch numbers
plt.plot(range(0, epochs, log_interval), total_losses)
plt.ylabel('Avg. Loss')
plt.xlabel('Epoch')
plt.title('Avg. Loss per Epoch')
plt.show()