
# Gesture Recognition Project

This project involves recognizing hand gestures performed in front of a smart TV using video data captured by a webcam. 
The goal is to create a deep learning model that classifies hand gestures into five categories, which will correspond to specific TV commands such as increasing volume, pausing, etc.

### Problem Statement
We need to build a model that processes videos of hand gestures and correctly classifies them into one of five categories using Conv3D or CNN-RNN architectures.


In [9]:
# Import necessary libraries
import numpy as np
import pandas as pd
import os
from PIL import Image
from keras.utils import to_categorical

# Locations of the label files and of the frame folders they index
train_csv, val_csv = 'train.csv', 'val.csv'
train_path, val_path = 'train', 'val'

# NOTE(review): read_csv is called with its defaults, so the first line of each
# file is consumed as the header row -- confirm the CSVs really start with a
# header line, otherwise one sample per file is silently dropped.
train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)


def _split_label_rows(raw_df):
    """Split the single ';'-delimited column of *raw_df* into named fields.

    Returns a new frame with the columns ``folder_name``, ``gesture`` and
    ``label``, where ``label`` has been cast to int.
    """
    fields = raw_df[raw_df.columns[0]].str.split(';', expand=True)
    fields.columns = ['folder_name', 'gesture', 'label']
    fields['label'] = fields['label'].astype(int)
    return fields


train_df_cleaned = _split_label_rows(train_df)
val_df_cleaned = _split_label_rows(val_df)

# Quick look at the first rows of each cleaned table
for cleaned in (train_df_cleaned, val_df_cleaned):
    print(cleaned.head())


                                folder_name         gesture  label
0  WIN_20180925_17_18_28_Pro_Left_Swipe_new  Left_Swipe_new      0
1  WIN_20180925_17_18_56_Pro_Left_Swipe_new  Left_Swipe_new      0
2  WIN_20180925_17_19_51_Pro_Left_Swipe_new  Left_Swipe_new      0
3  WIN_20180925_17_20_14_Pro_Left_Swipe_new  Left_Swipe_new      0
4  WIN_20180925_17_21_28_Pro_Left_Swipe_new  Left_Swipe_new      0
                                folder_name         gesture  label
0  WIN_20180925_17_43_01_Pro_Left_Swipe_new  Left_Swipe_new      0
1  WIN_20180925_18_01_40_Pro_Left_Swipe_new  Left_Swipe_new      0
2  WIN_20180925_18_03_21_Pro_Left_Swipe_new  Left_Swipe_new      0
3  WIN_20180926_16_46_22_Pro_Left_Swipe_new  Left_Swipe_new      0
4  WIN_20180926_16_47_09_Pro_Left_Swipe_new  Left_Swipe_new      0


In [10]:

# Define a generator function to yield batches of video frames and corresponding labels
def generator(source_path, folder_list, batch_size, image_size=(64, 64), frames=30, num_classes=5):
    """Yield endless batches of video clips with their one-hot labels.

    The rows of ``folder_list`` are reshuffled at the start of every pass and
    cut into ``len(folder_list) // batch_size`` full batches; rows left over
    after the last full batch are skipped for that pass.

    Args:
        source_path: Directory holding one sub-folder of frame images per video.
        folder_list: Indexable rows in which ``row[0]`` is the sub-folder name
            and ``row[2]`` is the integer class label.
        batch_size: Number of videos per yielded batch.
        image_size: ``(height, width)`` each frame is resized to.
        frames: Number of frames read from each video folder.
        num_classes: Width of the one-hot label vector.

    Yields:
        ``(batch_data, batch_labels)``: ``batch_data`` has shape
        ``(batch_size, frames, height, width, 3)`` with pixel values divided
        by 255; ``batch_labels`` has shape ``(batch_size, num_classes)``.

    Raises:
        ValueError: If ``folder_list`` cannot fill a single batch, or if a
            video folder holds fewer than ``frames`` files.
    """
    print('Source path =', source_path, '; batch size =', batch_size)
    height, width = image_size
    num_batches = len(folder_list) // batch_size
    if num_batches == 0:
        # Without this guard the outer loop would spin forever and never yield.
        raise ValueError(
            f"batch_size={batch_size} is larger than the number of videos "
            f"({len(folder_list)}); no full batch can be formed"
        )
    while True:
        shuffled = np.random.permutation(folder_list)
        for batch in range(num_batches):
            batch_data = np.zeros((batch_size, frames, height, width, 3))
            batch_labels = np.zeros((batch_size, num_classes))  # One-hot encoding of labels
            rows = shuffled[batch * batch_size:(batch + 1) * batch_size]
            for slot, row in enumerate(rows):
                video_dir = os.path.join(source_path, row[0])
                # os.listdir returns names in arbitrary order; sort them so the
                # frames are stacked in a deterministic (file-name) order.
                frame_names = sorted(os.listdir(video_dir))
                if len(frame_names) < frames:
                    raise ValueError(
                        f"{video_dir} contains {len(frame_names)} files, "
                        f"expected at least {frames}"
                    )
                for idx, frame_name in enumerate(frame_names[:frames]):
                    # The context manager closes the file handle right after decoding.
                    with Image.open(os.path.join(video_dir, frame_name)) as image:
                        # PIL's resize takes (width, height); force 3 channels so
                        # grayscale/RGBA frames still fit the batch buffer.
                        resized = image.convert('RGB').resize((width, height))
                    batch_data[slot, idx] = np.asarray(resized) / 255.0  # Normalize image
                batch_labels[slot, int(row[2])] = 1  # One-hot encoding
            yield batch_data, batch_labels



### Model Architecture

In this section, we will build two architectures for processing video sequences:

1. **Conv3D Model**: A 3D convolutional neural network for spatiotemporal feature extraction.
2. **CNN + RNN Model**: A 2D CNN combined with a GRU (Gated Recurrent Unit) to handle temporal sequences.


In [14]:
# Conv3D Model
from keras.models import Sequential
from keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout, BatchNormalization, Activation

def create_conv3d_model(input_shape=(30, 64, 64, 3), num_classes=5):
    """Build the (uncompiled) Conv3D gesture classifier.

    The network stacks three identical stages -- Conv3D (3x3x3, 'same'
    padding), BatchNormalization, ReLU, MaxPooling3D (2x2x2) -- with 32, 64
    and 128 filters, then flattens into Dense(512) + Dropout(0.5) and a
    softmax output layer.

    Args:
        input_shape: Shape of one video sample, without the batch dimension.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    # Local import: Input is not among the layers imported for this cell.
    from keras.layers import Input

    model = Sequential()
    # Declare the input with an explicit Input object instead of passing
    # input_shape to the first layer, which is the pattern Keras 3 recommends
    # for Sequential models.
    model.add(Input(shape=input_shape))

    for filters in (32, 64, 128):
        model.add(Conv3D(filters, kernel_size=(3, 3, 3), padding='same'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling3D(pool_size=(2, 2, 2)))

    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))  # one unit per gesture class

    return model


# Instantiate the model
conv3d_model = create_conv3d_model()
conv3d_model.summary()


In [13]:

# CNN + RNN Model (GRU)
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, TimeDistributed, GRU, Dense, Flatten, Dropout, BatchNormalization, Activation

def create_cnn_rnn_model(input_shape=(30, 64, 64, 3), num_classes=5):
    """Build the (uncompiled) CNN + GRU gesture classifier.

    Two TimeDistributed stages -- Conv2D (3x3, 'same' padding),
    BatchNormalization, ReLU, MaxPooling2D (2x2) -- with 32 and 64 filters are
    applied to every frame. The per-frame feature maps are flattened and fed
    to a GRU(128) that returns only its final state, followed by Dense(512),
    Dropout(0.5) and a softmax output layer.

    Args:
        input_shape: Shape of one video sample, without the batch dimension.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    # Local import: Input is not among the layers imported for this cell.
    from keras.layers import Input

    model = Sequential()
    # Declare the input with an explicit Input object instead of passing
    # input_shape to the first TimeDistributed wrapper, which is the pattern
    # Keras 3 recommends for Sequential models.
    model.add(Input(shape=input_shape))

    # TimeDistributed Conv2D stages applied to each frame
    for filters in (32, 64):
        model.add(TimeDistributed(Conv2D(filters, kernel_size=(3, 3), padding='same')))
        model.add(TimeDistributed(BatchNormalization()))
        model.add(TimeDistributed(Activation('relu')))
        model.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))

    # Flatten each frame's feature maps, then let the GRU process the sequence
    model.add(TimeDistributed(Flatten()))
    model.add(GRU(128, return_sequences=False))

    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))  # one unit per gesture class

    return model


# Instantiate the model
cnn_rnn_model = create_cnn_rnn_model()
cnn_rnn_model.summary()


In [15]:

# Compile and train the models
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Batching: steps are computed with integer division, so each pass covers
# only whole batches of the training / validation tables.
batch_size = 16
steps_per_epoch = len(train_df_cleaned) // batch_size
validation_steps = len(val_df_cleaned) // batch_size

# Endless batch generators over the training and validation folders
train_generator = generator(train_path, train_df_cleaned.values, batch_size=batch_size)
val_generator = generator(val_path, val_df_cleaned.values, batch_size=batch_size)

# Keep the best epoch (lowest val_loss) twice: once as a full model file and
# once as a weights-only file.
checkpoint_model = ModelCheckpoint(
    'conv3d_best_model.keras',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)
checkpoint_weights = ModelCheckpoint(
    'conv3d_best_weights.weights.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True,
)

# Halve the learning rate when val_loss has not improved for 5 epochs
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=0.00001,
)

callbacks_list = [checkpoint_model, checkpoint_weights, reduce_lr]

# Compile, then train the Conv3D model
conv3d_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

conv3d_model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=20,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=val_generator,
    validation_steps=validation_steps,
)

Source path = train ; batch size = 16
Epoch 1/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.2955 - loss: 26.9976Source path = val ; batch size = 16

Epoch 1: val_loss improved from inf to 1.95393, saving model to conv3d_best_model.keras

Epoch 1: val_loss improved from inf to 1.95393, saving model to conv3d_best_weights.weights.h5
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 5s/step - categorical_accuracy: 0.2966 - loss: 26.7365 - val_categorical_accuracy: 0.2500 - val_loss: 1.9539 - learning_rate: 0.0010
Epoch 2/20
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.4321 - loss: 1.7057
Epoch 2: val_loss did not improve from 1.95393

Epoch 2: val_loss did not improve from 1.95393
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 5s/step - categorical_accuracy: 0.4324 - loss: 1.6998 - val_categorical_accuracy: 0.2917 - val_loss: 1.9605 - learning_rate

<keras.src.callbacks.history.History at 0x7fcc5ffdbe00>

In [17]:
# Save the weights as they stand after the last training epoch. This file is
# separate from the best-val_loss checkpoints that the ModelCheckpoint
# callbacks write during training.
conv3d_model.save_weights('conv3d_final_weights.weights.h5')

In [18]:

# Load the best saved model
from keras.models import load_model

# Restore the full-model checkpoint written during training
best_model = load_model('conv3d_best_model.keras')

# Score it on validation batches; the result unpacks into the loss followed
# by the categorical accuracy metric the model was compiled with.
evaluation = best_model.evaluate(val_generator, steps=validation_steps)
val_loss, val_acc = evaluation
print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_acc}")

[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 940ms/step - categorical_accuracy: 0.7396 - loss: 0.6768
Validation Loss: 0.5793341994285583, Validation Accuracy: 0.7604166865348816



### Conclusion

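In this notebook, we built two architectures for gesture recognition using video data: Conv3D and CNN + RNN (GRU). In the cells shown here, only the Conv3D model is compiled and trained; its best checkpoint (lowest validation loss) was saved and reached about 76% accuracy on the validation set. The CNN + RNN model is defined and instantiated but still needs to be trained and evaluated before the two can be compared. Further experiments can be conducted to fine-tune the hyperparameters and improve accuracy.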
