In [2]:
# Here we import everything we need for the project

%matplotlib inline
import os, time
import csv

# pytorch
import torch
from torch.nn import Module, Conv2d, MaxPool2d, Linear
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pandas as pd

# Sklearn
from sklearn.model_selection import train_test_split # Helps with organizing data for training
from sklearn.metrics import confusion_matrix, classification_report # Helps present results as a confusion-matrix

## Loading Data

This project uses the [Hand Gesture Recognition Database](https://www.kaggle.com/gti-upm/leapgestrecog/version/1) (citation below) available on Kaggle. It contains 20000 images with different hands and hand gestures. There is a total of 10 hand gestures of 10 different people presented in the dataset. There are 5 female subjects and 5 male subjects.
The images were captured using the Leap Motion hand tracking device.

>Hand Gesture | Label used
>--- | ---
> Thumb down | 0
> Palm (Horizontal) | 1
> L | 2
> Fist (Horizontal) | 3
> Fist (Vertical) | 4
> Thumbs up | 5
> Index | 6
> OK | 7
> Palm (Vertical) | 8
> C | 9

Table 1 - Classification used for every hand gesture.


T. Mantecón, C.R. del Blanco, F. Jaureguizar, N. García, “Hand Gesture Recognition using Infrared Imagery Provided by Leap Motion Controller”, Int. Conf. on Advanced Concepts for Intelligent Vision Systems, ACIVS 2016, Lecce, Italy, pp. 47-57, 24-27 Oct. 2016. (doi: 10.1007/978-3-319-48680-2_5)  

Overview:
- Load images
- Some validation
- Preparing the images for training
- Use of train_test_split

In [3]:
# Unzip images, ignore this cell if files are already in the workspace
#!unzip leapGestRecog.zip

## Writing CSV
Since this dataset doesn't come with a nice csv, write one ourselves to make loading the data easier later

In [4]:
header = ["path_to_file", "class/GT"]
csv_path = "kaggle_images.csv"
# We need to get all the paths for the images to later load them
imagepaths = []
root = "./leapgestrecog"

for dirname, dirs, files in os.walk(root):
    for fname in files:
        path = os.path.join(dirname, fname)
        if path.endswith(".png"):
            imagepaths.append(path)

print(len(imagepaths)) # If > 0, then a PNG image was loaded
categories = [fpath.split("/")[4] for _, fpath in enumerate(imagepaths)]
gt = [category.split("_")[0] for _, category in enumerate(categories)]
with open(csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    for i, fpath in enumerate(imagepaths):
        writer.writerow([fpath, gt[i]])

20000


## Dataloader
To facilitate using pytorch to build our cnn, we write a custom DataLoader class. This allows for on-demand loading of images, which are used to train our cnn.

In [5]:
class KaggleHandDetectionDataset(Dataset):
    """Custom loader for the Kaggle Hand Detection Dataset"""
    
    def __init__(self, csv_file, transforms=None):
        """
        Args:
            csv_file (string): Path to the csv file with image filepaths and gt classes
            transforms (callable, optional): Optional transforms to be applied on a sample
        """
        self.images_frame = pd.read_csv(csv_file)
        #print(self.images_frame)
        self.transforms = transforms
        
    def __len__(self):
        return len(self.images_frame)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        img_name = self.images_frame.iloc[idx, 0]
        img = cv2.imread(img_name)
        y = int(self.images_frame.iloc[idx, 1])-1
        sample = {'image' : img, 'y' : y}
        
        if len(self.transforms) > 0:
            for _, transform in enumerate(self.transforms):
                sample = transform(sample)
        return sample


In [6]:
class Rescale(object):
    """Used to rescale an image to a given size. Useful for the CNN
    
    Args:
        output_size (tuple or int): Desired output size after rescaling. If tuple, output is matched to output_size.
        If int smaller of width/height is matched to output_size, keeping aspect ratio the same.
    """
    def __init__(self, output_size):
        assert isinstance(output_size, (int, tuple))
        self.output_size = output_size
        
    def __call__(self, sample):
        y = sample['y']
        image = sample['image']
        
        h, w = image.shape[:2]
        if isinstance(self.output_size, int):
            if h > w:
                new_h, new_w = self.output_size * h // w, self.output_size
            else:
                new_h, new_w = self.output_size, self.output_size * w // h
        else:
            new_h, new_w = self.output_size
        
        img = cv2.resize(image, (new_h, new_w))
        #print(f"after resize: {img.shape}")
        return {'image' : img, 'y' : y}
    
class Recolor(object):
    """Used to recolor an image using cv2
    
    Args:
        flag (cv2.COLOR_): color to swap to
    """
    def __init__(self, color):
        self.color = color
    
    def __call__(self, sample):
        y = sample['y']
        image = sample['image']
        #print(f"before recolor: {image.shape}")
        
        # cvtColor to gray drops the damned channel dimension but we need it
        img_cvt = cv2.cvtColor(image, self.color)
        # fucking hack an extra dim to appease pytorch's bitchass
        img_cvt = np.expand_dims(img_cvt, axis=-1)
        #print(f"after exansion: {img_cvt.shape}")
        return {'image' : img_cvt, 'y' : y}

class ToTensor(object):
    """Convert ndarrays to pytorch Tensors"""
    
    def __call__(self, sample):
        y = sample['y']
        image = sample['image']
        
        # swap color axis b/c 
        # numpy img: H x W x C
        # torch img: C x H x W
        image = image.transpose((2, 0, 1))
        return {'image' : torch.from_numpy(image), 'y' : y}

In [7]:
def prediction_to_class_str(pred):
    classes = {0 : "palm", 1 : "L", 2 : "fist", 3 : "fist_moved", 4 : "thumb", 5 : "index", 6 : "ok", 7 : "palm_moved", 8 : "c", 9 : "down"}
    return classes[pred]

def classify_arbitrary_image(model, img):
    img_type = type(img)
    print(img_type)
    if img_type == torch.Tensor:
        print("tensor")
        img = img.float()
        img = img.unsqueeze(1)
    elif img_type == np.ndarray:
        print("np array")
        img = np.expand_dims(img, 1)
    else:
        print("error: something other than a torch.Tensor or an np.ndarray was passed as img")
    prediction = model(img)
    prediction = prediction.data.numpy()
    y_hat = np.argmax(prediction, axis=1)
    return prediction_to_class_str(y_hat[0])

def classify_many_images(model, imgs):
    # for now, assuming imgs is a list of images that are either np.ndarrays or torch.Tensors
    # labels will be given back in order images were given
    predictions = []
    for img in imgs:
        predictions.append(classify_arbitrary_image(model, img))
    return predictions

In [8]:
class HandNNModel(Module):
    def __init__(self):
        super().__init__()
        
        # input shape = (32, 256, 256) - (batch_size, w, h) from dataloader
        self.conv1 = Conv2d(1, 32, kernel_size=5) # output shape: (252, 252, 32)
        self.pool1 = MaxPool2d(2) # output shape: (121, 121, 32)
        self.conv2 = Conv2d(32, 64, kernel_size=3) # output shape: (119, 119, 64)
        self.pool2 = MaxPool2d(2) # output shape: (59, 59, 64) - torch uses floor by default
        self.conv3 = Conv2d(64, 64, kernel_size=3) # output shape: (57, 57, 64)
        self.pool3 = MaxPool2d(2) # output shape: (28, 28, 64)
        self.fc1 = Linear(30*30*64, 128) # output shape: (28*28*64, 128)
        self.fc2 = Linear(128, 10)
        self.activation = torch.nn.ReLU()
        
    def forward(self, X):
        X = self.activation(self.conv1(X))
        X = self.pool1(X)
        X = self.activation(self.conv2(X))
        X = self.pool2(X)
        X = self.activation(self.conv3(X))
        X = self.pool3(X)
        X = torch.flatten(X, 1) # flatten with start_dim = 1
        X = self.fc1(X)
        X = self.fc2(X)
        output = F.softmax(X)
        return output

In [9]:
resize = Rescale((256,256))
recolor = Recolor(cv2.COLOR_BGR2GRAY)
to_tensor = ToTensor()
transforms = [resize, recolor, to_tensor]

hand_dataset = KaggleHandDetectionDataset(csv_file="kaggle_images.csv", transforms=transforms)
df = pd.read_csv("kaggle_images.csv")

sample = hand_dataset[0]
print()
print(sample['image'].shape)
print(sample['y'])
print(sample)


torch.Size([1, 256, 256])
0
{'image': tensor([[[ 6,  6,  7,  ...,  5,  5,  4],
         [ 7,  6,  5,  ...,  6,  4,  5],
         [ 5,  5,  4,  ...,  3,  4,  5],
         ...,
         [ 5,  5,  6,  ...,  3,  4,  4],
         [ 4,  4,  4,  ...,  6,  4,  5],
         [ 5,  5,  4,  ..., 34,  7, 13]]], dtype=torch.uint8), 'y': 0}


In [10]:
"""
sample = hand_dataset[0]
img = sample['image']
#recolor = Recolor(cv2.COLOR_BGR2GRAY)
#as_gray = recolor(sample)
#scale = Rescale((256, 256))
#tns = ToTensor()
#as_tns = tns(sample)
#print(type(as_tns['image']))
#new_img = scale(sample)
print(sample['image'])
# complains b/c the gpu owns it, not the cpu since it's a tensor
cv2.imshow('original', sample['image'])
cv2.waitKey(2000)
#cv2.imshow('recolored', as_gray['image'])
#cv2.waitKey(2000)
#cv2.imshow('rescaled', new_img['image'])
#cv2.waitKey(3000)
cv2.destroyAllWindows()
"""

"\nsample = hand_dataset[0]\nimg = sample['image']\n#recolor = Recolor(cv2.COLOR_BGR2GRAY)\n#as_gray = recolor(sample)\n#scale = Rescale((256, 256))\n#tns = ToTensor()\n#as_tns = tns(sample)\n#print(type(as_tns['image']))\n#new_img = scale(sample)\nprint(sample['image'])\n# complains b/c the gpu owns it, not the cpu since it's a tensor\ncv2.imshow('original', sample['image'])\ncv2.waitKey(2000)\n#cv2.imshow('recolored', as_gray['image'])\n#cv2.waitKey(2000)\n#cv2.imshow('rescaled', new_img['image'])\n#cv2.waitKey(3000)\ncv2.destroyAllWindows()\n"

In [11]:
dataloader = DataLoader(hand_dataset, batch_size=4)
for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched['image'].size())
    if i_batch == 3:
        break

0 torch.Size([4, 1, 256, 256])
1 torch.Size([4, 1, 256, 256])
2 torch.Size([4, 1, 256, 256])
3 torch.Size([4, 1, 256, 256])


## Train the model using the Custom Dataloader
Below, we will actually train our CNN model

In [15]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
if use_cuda:
    params = {'batch_size' : 128, 'shuffle': True, 'num_workers' : 8, 'pin_memory' : True}
else:
    params = {'batch_size' : 128, 'shuffle': True, 'num_workers' : 8}
    

max_epochs = 100
data_loader = DataLoader(hand_dataset, **params)
model = HandNNModel()
# move to GPU
if use_cuda:
    model.cuda()

# batch_size, num_channels, w, h
#random_data = torch.rand((1, 1, 256, 256))

#result = model(random_data)

criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

imgs, labels = (next(iter(data_loader)))
start = time.time()
for epoch in range(700):
    running_loss = 0.0
    epoch_start = time.time()
    for i, data in enumerate(data_loader):
        imgs, labels = data['image'], data['y']
        # move to GPU
        imgs, labels = imgs.cuda(), labels.cuda()
        imgs = imgs.float()
        optimizer.zero_grad()
        
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if i % 10 == 0:
            print(f"epoch {epoch}: running loss = {running_loss}")
    epoch_end = time.time()
    print(f"epoch {epoch} runtime = {epoch_end - epoch_start}")
end = time.time()
print(f"Total training time = {end - start}")

# make sure to save the model so we don't need to train again
save_path = "./deep_trained_cnn.pkl"
torch.save(model.state_dict(), save_path)

  output = F.softmax(X)


epoch 0: running loss = 2.3196702003479004
epoch 0: running loss = 25.20462417602539
epoch 0: running loss = 47.40992331504822
epoch 0: running loss = 69.42883324623108
epoch 0: running loss = 91.16979622840881
epoch 0: running loss = 113.14774537086487
epoch 0: running loss = 134.9151792526245
epoch 0: running loss = 156.847589969635
epoch 0: running loss = 178.50759530067444
epoch 0: running loss = 200.01586842536926
epoch 0: running loss = 221.5930187702179
epoch 0: running loss = 243.16094040870667
epoch 0: running loss = 264.3463137149811
epoch 0: running loss = 285.4881417751312
epoch 0: running loss = 306.66056871414185
epoch 0: running loss = 327.83641028404236
epoch 0 runtime = 40.115034341812134
epoch 1: running loss = 2.1650545597076416
epoch 1: running loss = 23.028441429138184
epoch 1: running loss = 44.230321168899536
epoch 1: running loss = 65.19125866889954
epoch 1: running loss = 85.73125159740448
epoch 1: running loss = 105.85331881046295
epoch 1: running loss = 125.7

In [12]:
#print(model.named_parameters)
params = model.parameters()

## Saving Trained Model

In [13]:
save_path = "./trained_test_cnn.pkl"
torch.save(model.state_dict(), save_path)

## Loading the model
Load the model from disk later for evaluation

In [14]:
loaded_model = HandNNModel()
loaded_model.load_state_dict(torch.load(save_path))

<All keys matched successfully>

## Confusion Matrix and Classification Report (from sklearn)


In [16]:
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
def make_stats(y, y_hat):
    cm = confusion_matrix(y, y_hat)
    cm_df = pd.DataFrame(cm, columns=[str(i) for i in range(10)])
    report = classification_report(y, y_hat)
    return cm_df, report

In [20]:
total_samples = 0
total_misclass = 0
all_y = np.array([], dtype=np.uint8)
all_y_hat = np.array([], dtype=np.uint8)
for i, sample in enumerate(data_loader):
    y = sample['y']
    y = y.data.numpy()
    #print(f"y = {y}")
    #print(f"y_shape = {sample['y'].shape[0]}")
    images = (sample['image'])
    images = images.float()
    model = model.cpu()
    #images = images.cuda()
    #test_img = images[0]
    #test_img = test_img.unsqueeze(1)
    #print(type(test_img))
    #some_str = classify_arbitrary_image(model, test_img)
    #print(f"class str for img 0 = {some_str}")
    predictions = model(images)
    predictions = predictions.cpu()
    predictions = predictions.data.numpy()
    y_hat = np.argmax(predictions, axis=1)
    misclass = np.sum(np.where(y != y_hat, 1, 0))
    #print(f"y_hat = {y_hat}")
    total_samples += y.shape[0]
    total_misclass += misclass
    all_y = np.append(all_y, y)
    all_y_hat = np.append(all_y_hat, y_hat)
    print(f"all_y = {all_y}")
    print(f"all_y_hat = {all_y_hat}")
    print(f"Number of Misclassifications = {misclass}")
    print(f"Sample acc = {(y.shape[0]-misclass)/y.shape[0]*100}")
overall_acc = (total_samples - total_misclass)/total_samples
print(f"Overall Accuracy = {overall_acc}")

  output = F.softmax(X)


all_y = [4 7 7 5 5 4 2 5 6 2 1 0 5 6 4 8 5 3 3 0 8 1 4 8 0 9 5 8 0 2 3 4 1 6 0 5 6
 5 8 9 6 1 4 7 1 9 4 3 3 8 0 6 5 2 3 2 0 8 1 3 6 4 1 2 4 4 0 1 5 1 8 7 5 9
 2 2 9 0 0 5 3 5 3 9 2 2 4 4 5 8 8 6 5 3 6 7 3 7 5 7 0 9 0 7 0 7 1 6 5 9 9
 2 9 2 6 1 2 9 6 9 6 5 3 2 8 0 1 1]
all_y_hat = [4 7 7 5 5 4 2 5 6 2 1 0 5 6 4 8 5 3 3 0 8 1 4 8 0 9 5 8 0 2 3 4 1 6 0 5 6
 5 8 9 6 1 4 7 1 9 4 3 3 8 0 6 5 2 3 2 0 8 1 3 6 4 1 2 4 4 0 1 5 1 8 7 5 9
 2 2 9 0 0 5 3 5 3 9 2 2 4 4 5 8 8 6 5 3 6 7 3 7 5 7 0 9 0 7 0 7 1 6 5 9 9
 2 9 2 6 1 2 9 6 9 6 5 3 2 8 0 1 1]
Number of Misclassifications = 0
Sample acc = 100.0
all_y = [4 7 7 5 5 4 2 5 6 2 1 0 5 6 4 8 5 3 3 0 8 1 4 8 0 9 5 8 0 2 3 4 1 6 0 5 6
 5 8 9 6 1 4 7 1 9 4 3 3 8 0 6 5 2 3 2 0 8 1 3 6 4 1 2 4 4 0 1 5 1 8 7 5 9
 2 2 9 0 0 5 3 5 3 9 2 2 4 4 5 8 8 6 5 3 6 7 3 7 5 7 0 9 0 7 0 7 1 6 5 9 9
 2 9 2 6 1 2 9 6 9 6 5 3 2 8 0 1 1 3 9 3 4 1 7 4 2 7 5 4 7 1 1 2 6 8 5 6 5
 0 7 0 7 5 0 8 1 7 0 1 3 5 0 3 8 6 2 9 2 9 5 2 8 8 6 9 7 1 6 9 3 4 3 8 7 8
 4 1 0 5 8 2 7 9 1 9 5 

In [21]:
cm_df, report = make_stats(all_y, all_y_hat)
print(cm_df.to_markdown())
print(report)

|    |    0 |    1 |    2 |    3 |    4 |    5 |    6 |    7 |    8 |    9 |
|---:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
|  0 | 2000 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |
|  1 |    0 | 2000 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |
|  2 |    0 |    0 | 2000 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |
|  3 |    0 |    0 |    0 | 2000 |    0 |    0 |    0 |    0 |    0 |    0 |
|  4 |    0 |    0 |   33 |   27 | 1900 |   18 |   11 |    0 |    6 |    5 |
|  5 |    0 |    0 |    0 |    0 |    0 | 2000 |    0 |    0 |    0 |    0 |
|  6 |    0 |    0 |    0 |    0 |    0 |    0 | 2000 |    0 |    0 |    0 |
|  7 |    0 |    0 |    0 |    0 |    0 |    0 |    0 | 2000 |    0 |    0 |
|  8 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 | 2000 |    0 |
|  9 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 | 2000 |
              precision    recall  f1-score   support

           0       1.