# Gesture Recognition Project

In [1]:
import numpy as np
import imageio
from PIL import Image
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from keras.models import Sequential, Model
from keras.layers import Dense, GRU, Dropout, TimeDistributed, Flatten, BatchNormalization, Activation
from keras.layers import Conv3D, MaxPooling3D, Conv2D, MaxPooling2D, LSTM, ConvLSTM2D, GlobalAveragePooling2D, GlobalAveragePooling3D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras import optimizers

import random as rn
import datetime
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Report the versions of the interpreter, Jupyter and every imported library
import sys
import jupyter_core

# Each entry pairs the line template with the version value it reports;
# the list order is the order in which the lines are printed.
version_report = [
    ("python : {0}", sys.version),
    ("jupyter : {0}", jupyter_core.__version__),
    ("numpy : {0}", np.__version__),
    ("tensorflow : {0}", tf.__version__),
    ("keras : {0}", keras.__version__),
    ("PIL Image : {0}", Image.__version__),
    ("imageio : {0}", imageio.__version__),
]

for template, version in version_report:
    print(template.format(version))

python : 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:22:19) [Clang 14.0.6 ]
jupyter : 5.7.2
numpy : 1.26.4
tensorflow : 2.18.0
keras : 3.8.0
PIL Image : 10.4.0
imageio : 2.33.1


In [3]:
# Seed NumPy, Python's `random` module and TensorFlow (in that order) with the
# same value to keep results as consistent as possible between runs
for set_seed in (np.random.seed, rn.seed, tf.random.set_seed):
    set_seed(30)

In [4]:
# Read training and validation doc containing folder names and corresponding label.
# The files are opened with `with` so the handles are closed as soon as the
# lines have been read; the train permutation is drawn before the validation
# one so the seeded random sequence is consumed in the same order as before.
with open('datasets/gesture_recognition/train.csv') as train_file:
    train_doc = np.random.permutation(train_file.readlines())
with open('datasets/gesture_recognition/val.csv') as val_file:
    val_doc = np.random.permutation(val_file.readlines())

# experiment with the batch size
batch_size = 16
print("batch_size =", batch_size)

# create a list of image numbers you want to use for a particular video
# (positions 3..27 within each video's list of frame files)
img_idx = range(3, 28)
print("img_idx =", list(img_idx))

# Initialize the 3 dimensions of video:
# x = number of frames per video, (y, z) = size each frame is resized to
x = len(img_idx)
y = 120
z = 120
print("x =", x, "; y =", y, "; z =", z)

batch_size = 16
img_idx = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
x = 25 ; y = 120 ; z = 120


# Generator

In [5]:
def generator(source_path, folder_list, batch_size):
    """Endlessly yield batches of video clips and their one-hot labels.

    Relies on module-level settings defined elsewhere in the notebook:
    ``img_idx`` (positions of the frames sampled from each video folder),
    ``x`` (number of frames per clip) and ``y``/``z`` (frame size after
    resizing).

    Args:
        source_path: directory that contains one sub-folder of frame images
            per video.
        folder_list: sequence of text lines whose ``;``-separated fields hold
            the folder name in field 0 and the integer class index (0-4) in
            field 2.
        batch_size: maximum number of videos per yielded batch.

    Yields:
        Tuple ``(batch_data, batch_labels)`` where ``batch_data`` has shape
        ``(n, x, y, z, 3)`` with pixel values divided by 255 and
        ``batch_labels`` has shape ``(n, 5)``. ``n`` equals ``batch_size``
        except for the last batch of a pass, which only holds the remaining
        videos (no zero-padded samples with all-zero labels).

    Raises:
        ValueError: if ``folder_list`` is empty or ``batch_size`` is not
            positive (the generator would otherwise loop forever without
            yielding anything).
    """
    num_folders = len(folder_list)
    if num_folders == 0:
        raise ValueError("folder_list is empty; there is nothing to generate batches from")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # (top, left, bottom, right) window that trims 120x160 frames to 120x120
    crop_box = (0, 20, 120, 140)

    # number of batches per pass (ceiling division keeps the partial last batch)
    num_batches = -(-num_folders // batch_size)

    while True:

        # reshuffle the videos at the start of every pass
        random_list = np.random.permutation(folder_list)

        # we iterate over the number of batches
        for batch in range(num_batches):

            start = batch * batch_size
            batch_folders = random_list[start:start + batch_size]
            current_size = len(batch_folders)

            # x is the number of images you use for each video,
            # (y,z) is the final size of the input images and 3 is the number of channels RGB
            batch_data = np.zeros((current_size, x, y, z, 3))

            # batch_labels is the one hot representation of the output
            batch_labels = np.zeros((current_size, 5))

            # iterate over the videos of this batch
            for folder, line in enumerate(batch_folders):

                img_details = line.strip().split(';')
                folder_path = os.path.join(source_path, img_details[0])

                # List the frame files of the folder. os.listdir returns names
                # in arbitrary order, so sort them to index frames consistently
                # (assumes file names sort in temporal order - TODO confirm).
                imgs = sorted(
                    file for file in os.listdir(folder_path)
                    if os.path.isfile(os.path.join(folder_path, file))
                )

                # Iterate over the frames/images of a folder to read them in
                for idx, item in enumerate(img_idx):

                    image = imageio.imread(os.path.join(folder_path, imgs[item])).astype(np.float32)

                    # crop the images and resize them. Note that the images are of 2 different shape
                    # and the conv3D will throw error if the inputs in a batch have different shapes
                    if image.shape[0] == 120 and image.shape[1] == 160:
                        image = image[crop_box[0]:crop_box[2], crop_box[1]:crop_box[3]]

                    # PIL's resize takes (width, height); the buffer stores
                    # frames as (rows=y, cols=z), so the target size is (z, y)
                    image = np.array(Image.fromarray(image.astype(np.uint8)).resize((z, y))).astype(np.float32)

                    # normalise the three RGB channels and feed in the image
                    batch_data[folder, idx] = image[:, :, :3] / 255

                batch_labels[folder, int(img_details[2])] = 1

            # hand the batch to the caller; execution resumes here on the next request
            yield batch_data, batch_labels

In [6]:
# Directories holding the training and validation video folders
train_path = 'datasets/gesture_recognition/train'
val_path = 'datasets/gesture_recognition/val'

# One entry per video in each split
num_train_sequences, num_val_sequences = len(train_doc), len(val_doc)
print('# training sequences =', num_train_sequences)
print('# validation sequences =', num_val_sequences)

# Number of epochs to train for
num_epochs = 50
print('# epochs =', num_epochs)

# Shape of a single clip fed to the models: (x frames, y, z, 3 RGB channels)
input_shape = (x, y, z, 3)
print("# input_shape =", input_shape)

# training sequences = 663
# validation sequences = 100
# epochs = 50
# input_shape = (25, 120, 120, 3)


In [7]:
# Define function to draw the accuracy and loss results
def visualize_results(model, epochs, results):
    """Plot training/validation accuracy and loss curves side by side.

    Args:
        model: text appended to both subplot titles.
        epochs: number of epochs that were requested. Kept for backward
            compatibility; the x-axis is derived from the recorded history
            instead, because training can stop before this many epochs
            (e.g. via early stopping) and a longer x range would make
            ``plt.plot`` fail on mismatched lengths.
        results: mapping with the per-epoch lists ``'categorical_accuracy'``,
            ``'val_categorical_accuracy'``, ``'loss'`` and ``'val_loss'``.
    """
    accuracy = results['categorical_accuracy']
    validation_accuracy = results['val_categorical_accuracy']

    loss = results['loss']
    validation_loss = results['val_loss']

    # one x position per epoch that actually has recorded metrics
    epochs_range = range(len(accuracy))

    plt.figure(figsize=(12, 5))

    # left panel: accuracy curves
    plt.subplot(1, 2, 1)
    plt.plot(epochs_range, accuracy, label='Training Accuracy')
    plt.plot(epochs_range, validation_accuracy, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy - ' + model)

    # right panel: loss curves
    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, validation_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss - ' + model)
    plt.show()

# Models

## Model I (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(Dropout(0.1))

# model.add(Flatten())
# model.add(Dense(32, activation="relu"))
# model.add(Dropout(0.1))
# model.add(Dense(5, activation="softmax"))

## Model II (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(Dropout(0.1))

# model.add(Flatten())
# model.add(Dense(32, activation="relu"))
# model.add(Dropout(0.1))
# model.add(Dense(256, activation="relu"))
# model.add(Dropout(0.1))
# model.add(Dense(5, activation="softmax"))

## Model III (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(Dropout(0.1))

# model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(Dropout(0.1))

# model.add(Flatten())
# model.add(Dense(32, activation="relu"))
# model.add(Dropout(0.1))
# model.add(Dense(256, activation="relu"))
# model.add(Dropout(0.1))
# model.add(Dense(5, activation="softmax"))

## Model IV (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(BatchNormalization())
# model.add(Dropout(0.1))

# model.add(Conv3D(64, kernel_size=(3, 3, 3), activation='relu'))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(BatchNormalization())
# model.add(Dropout(0.1))

# model.add(Flatten())
# model.add(Dense(32, activation="relu"))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())
# model.add(Dense(256, activation="relu"))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())
# model.add(Dense(5, activation="softmax"))

## Model V (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(32, kernel_size=(3, 3, 3), padding="same", activation='relu', input_shape=input_shape))
# model.add(Conv3D(32, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(BatchNormalization())
# model.add(Dropout(0.1))

# model.add(Conv3D(64, kernel_size=(3, 3, 3), padding="same", activation='relu', input_shape=input_shape))
# model.add(Conv3D(64, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(BatchNormalization())
# model.add(Dropout(0.1))

# model.add(Conv3D(128, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(Conv3D(128, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))
# model.add(BatchNormalization())
# model.add(Dropout(0.1))

# model.add(Flatten())
# model.add(Dense(32, activation="relu"))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())
# model.add(Dense(256, activation="relu"))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())
# model.add(Dense(5, activation="softmax"))

## Model VI (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(16, kernel_size=(3, 3, 3), padding="same", activation='relu', input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(Conv3D(16, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# model.add(Conv3D(32, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(Conv3D(32, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# model.add(Conv3D(64, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(Conv3D(64, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling3D(pool_size=(1, 2, 2)))

# model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
# model.add(MaxPooling3D(pool_size=(1, 2, 2)))

# model.add(Flatten())
# model.add(Dense(128, activation="relu"))
# model.add(BatchNormalization())
# model.add(Dropout(0.3))
# model.add(Dense(512, activation="relu"))
# model.add(BatchNormalization())
# model.add(Dense(5, activation="softmax"))

## Model VII (Conv3D)

In [None]:
# model = Sequential()

# model.add(Conv3D(16, kernel_size=(3, 3, 3), padding="same", activation='relu', input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(Conv3D(16, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# model.add(Conv3D(32, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(Conv3D(32, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling3D(pool_size=(2, 2, 2)))

# model.add(Conv3D(64, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(Conv3D(64, kernel_size=(3, 3, 3), padding="same", activation='relu'))
# model.add(BatchNormalization())
# model.add(MaxPooling3D(pool_size=(1, 2, 2)))

# model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu', input_shape=input_shape))
# model.add(Conv3D(128, kernel_size=(3, 3, 3), activation='relu'))
# model.add(MaxPooling3D(pool_size=(1, 2, 2)))

# model.add(GlobalAveragePooling3D())
# model.add(Dense(128, activation="relu"))
# model.add(BatchNormalization())
# model.add(Dropout(0.2))
# model.add(Dense(512, activation="relu"))
# model.add(BatchNormalization())
# model.add(Dense(5, activation="softmax"))

## Model VIII (Time Distributed Conv2D + GRU)

In [8]:
# model = Sequential()

# model.add(TimeDistributed(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'), input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

# model.add(TimeDistributed(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'), input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

# model.add(TimeDistributed(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'), input_shape=input_shape))
# model.add(BatchNormalization())
# model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

# model.add(TimeDistributed(GlobalAveragePooling2D()))
# model.add(TimeDistributed(Dense(128, activation="relu")))
# model.add(BatchNormalization())
# model.add(Dropout(0.2))

# model.add(GRU(128))
# model.add(BatchNormalization())
# model.add(Dense(5, activation="softmax"))

## Model IX (Time Distributed Conv2D + ConvLSTM)

In [10]:
# Per-frame Conv2D feature extractor followed by a ConvLSTM over the frame axis
model = Sequential()

# Only the first layer declares the input shape; a Sequential model ignores
# `input_shape` on later layers, so it is not repeated on them.
model.add(TimeDistributed(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'), input_shape=input_shape))
model.add(BatchNormalization())
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

model.add(TimeDistributed(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu')))
model.add(BatchNormalization())
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

model.add(TimeDistributed(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu')))
model.add(BatchNormalization())
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

# Collapse the frame axis: only the last ConvLSTM output is kept
model.add(ConvLSTM2D(16, kernel_size = (3, 3), return_sequences=False))
model.add(BatchNormalization())

# Classification head
model.add(GlobalAveragePooling2D())
model.add(Dense(128, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(512, activation="relu"))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# One output per gesture class
model.add(Dense(5, activation="softmax"))

## Final Model (Time Distributed Conv2D + GRU)

In [None]:
# Per-frame Conv2D feature extractor followed by a GRU over the frame axis
model = Sequential()

# Only the first layer declares the input shape; a Sequential model ignores
# `input_shape` on later layers, so it is not repeated on them.
model.add(TimeDistributed(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'), input_shape=input_shape))
model.add(BatchNormalization())
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

model.add(TimeDistributed(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu')))
model.add(BatchNormalization())
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

model.add(TimeDistributed(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu')))
model.add(BatchNormalization())
model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

# Reduce every frame to a 128-dimensional feature vector
model.add(TimeDistributed(GlobalAveragePooling2D()))
model.add(TimeDistributed(Dense(128, activation="relu")))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Summarise the sequence of frame features, then classify into 5 gestures
model.add(GRU(128))
model.add(BatchNormalization())
model.add(Dense(5, activation="softmax"))

In [11]:
# Configure the optimizer, loss and metric used for training
optimiser = optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimiser, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# summary() prints the layer table itself and returns None, so it is called
# directly instead of being wrapped in print() (which only added "None")
model.summary()

None


In [None]:
# Build one batch generator per split; both use the same batch size
train_generator = generator(source_path=train_path, folder_list=train_doc, batch_size=batch_size)
val_generator = generator(source_path=val_path, folder_list=val_doc, batch_size=batch_size)

In [None]:
# Create model name using current timestamp
curr_dt_time = datetime.datetime.now()
model_name = 'results/model_init' + '_' + str(curr_dt_time).replace(' ','').replace(':','_') + '/'

# Create the output directory, including the parent 'results/' folder when it
# does not exist yet; exist_ok avoids failing if the directory is already there
os.makedirs(model_name, exist_ok=True)

# Create file path pattern (epoch number and metrics are filled in per epoch)
filepath = model_name + 'model-{epoch:05d}-{loss:.5f}-{categorical_accuracy:.5f}-{val_loss:.5f}-{val_categorical_accuracy:.5f}.h5'

# Prepare checkpoint after every epoch, configure early stopping in case learning is not happening and reducing learning rate on plateau
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=False, mode='auto')
# NOTE: no trailing comma here - it would turn the callback into a 1-tuple,
# and the callbacks list must contain callback objects only
earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')
reduceLearningRate = ReduceLROnPlateau(monitor='val_loss', factor=0.2, verbose=1, patience=3)
callbacks_list = [checkpoint, earlyStopping, reduceLearningRate]

In [None]:
# Number of batches needed to cover each split once: every full batch counts
# as a step, plus one extra step when a partial batch is left over
train_full_batches, train_leftover = divmod(num_train_sequences, batch_size)
steps_per_epoch = train_full_batches + (1 if train_leftover else 0)

val_full_batches, val_leftover = divmod(num_val_sequences, batch_size)
validation_steps = val_full_batches + (1 if val_leftover else 0)

print("batch_size =", batch_size)
print("step_per_epoch =", steps_per_epoch)
print("validation_steps =", validation_steps)

In [None]:
# Train on batches drawn from the generators. The generators never end, so
# the step counts tell Keras how many batches make up one pass over each split.
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=num_epochs,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=val_generator,
    validation_steps=validation_steps,
    class_weight=None,
    initial_epoch=0,
)

In [None]:
# Visualize the accuracy and loss for model.
# Training can end before num_epochs (early stopping), so pass the number of
# epochs actually recorded in the history; this keeps the x-axis the same
# length as the metric lists being plotted.
visualize_results("Model", len(history.history['loss']), history.history)