In [12]:
# Load InceptionV3 architecture
from keras.applications import InceptionV3

# Load Keras/Tensorflow libraries
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout

# Import array libraries
import numpy as np

In [14]:
# Load the data
# NOTE: Replace with combined file later
x_data   = np.load('../data_collection/data/x_data_20241030_0130.npy')
y_data   = np.load('../data_collection/data/y_data_20241030_0130.npy')
img_data = np.load('../data_collection/data/img_data_20241030_0130.npy')

# Print sizes for verification purposes
print(f"x_data size: {x_data.shape}")
print(f"y_data size: {y_data.shape}")
print(f"img_data size: {img_data.shape}")

# Sanity-check the layout the rest of the notebook relies on, so a mismatched
# file fails here instead of deep inside a model call.
assert img_data.ndim == 5, f"expected 5-D img_data, got shape {img_data.shape}"
assert y_data.ndim == 2, f"expected 2-D y_data, got shape {y_data.shape}"
assert x_data.shape[0] == y_data.shape[0] == img_data.shape[0], \
    "x_data, y_data and img_data must contain the same number of samples"
# IMG_SIZE is used for both spatial dimensions of the model input
assert img_data.shape[1] == img_data.shape[2], \
    f"images must be square, got {img_data.shape[1]}x{img_data.shape[2]}"
# FRAMES_PER_SEQ is read from x_data but is also used to index img_data frames
assert x_data.shape[-1] == img_data.shape[-1], \
    "x_data and img_data must have the same number of frames per sequence"

# Define key constants
IMG_SIZE = img_data.shape[1]         # side length of one (square) frame
FRAMES_PER_SEQ = x_data.shape[-1]    # frames are stored on the last axis
N_GESTURES = y_data.shape[1]         # width of one label row in y_data

x_data size: (2, 21, 3, 10)
y_data size: (2, 6)
img_data size: (2, 128, 128, 3, 10)


In [6]:


# Define original InceptionV3 model. Only the non-default arguments are spelled
# out; everything omitted keeps the value Keras already uses by default.
# NOTE(review): weights=None means the network is randomly initialised, so the
# extracted features are untrained — confirm this is intended.
inc_v3 = InceptionV3(
    include_top=True,
    weights=None,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)

# Define feature extractor from inc_v3: same input, but the output is taken
# from the global-average-pooling layer that sits directly before the final
# classification layer (i.e. the second-to-last layer of the network).
pooled_features = inc_v3.get_layer("avg_pool").output
feat_extractor = Model(inputs=inc_v3.input, outputs=pooled_features)

In [5]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the feature extractor
feat_extractor.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 128, 128, 3)]        0         []                            
                                                                                                  
 conv2d (Conv2D)             (None, 63, 63, 32)           864       ['input_1[0][0]']             
                                                                                                  
 batch_normalization (Batch  (None, 63, 63, 32)           96        ['conv2d[0][0]']              
 Normalization)                                                                                   
                                                                                                  
 activation (Activation)     (None, 63, 63, 32)           0         ['batch_normalization[0][0

In [23]:
# Define LSTM model (built with the Keras functional API)
# Each input sample is a sequence of FRAMES_PER_SEQ feature vectors of width 2048
inputs = Input(shape=(FRAMES_PER_SEQ, 2048))

# Hidden stack, applied in order: two LSTM layers (the first one keeps the full
# sequence so the second can consume it), dropout, then a small dense layer
x = inputs
for hidden_layer in (
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dropout(0.5),
    Dense(64, activation='relu'),
):
    x = hidden_layer(x)

# One softmax output per gesture class
outputs = Dense(N_GESTURES, activation='softmax')(x)

lstm_model = Model(inputs, outputs, name="LSTM_Gesture_Classifier")
lstm_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
lstm_model.summary()

Model: "LSTM_Gesture_Classifier"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, 10, 2048)]        0         
                                                                 
 lstm_12 (LSTM)              (None, 10, 128)           1114624   
                                                                 
 lstm_13 (LSTM)              (None, 128)               131584    
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 dense_9 (Dense)             (None, 6)                 390       
                                                                 
Total params: 1254854 (4.79 MB)
Trainable p

In [24]:
# Define combined model forward pass
# TODO: Convert to a class later
def combined_forward_pass(gesture_img_seq):
    """Classify one gesture image sequence with the feature extractor + LSTM.

    Args:
        gesture_img_seq: Array indexed as (height, width, channels, frame).
            Only the first FRAMES_PER_SEQ frames along the last axis are used.

    Returns:
        int: Index of the highest-probability class in the LSTM output.
    """
    # Move the frame axis to the front so all frames go through the feature
    # extractor as one batch, instead of one predict() call per frame. The
    # feature array's shape now follows the data rather than a hard-coded 10.
    frames = np.moveaxis(gesture_img_seq[:, :, :, :FRAMES_PER_SEQ], -1, 0)
    features = feat_extractor.predict(frames)

    print(f"Feature extractor output shape: {features.shape}")
    print(f"Feature extractor sample data: {features[0, :20]}")

    # Send these feature vectors to the LSTM (add a leading batch axis of 1)
    lstm_input = np.expand_dims(features, 0)
    print(lstm_input.shape)
    gesture_prob = lstm_model.predict(lstm_input)
    print(f"LSTM Output: {gesture_prob}")

    # Return the argmax of the classification output from the LSTM model
    return int(np.argmax(gesture_prob, axis=-1)[0])

combined_forward_pass(img_data[0, :, :, :, :])

Feature extractor output shape: (10, 2048)
Feature extractor sample data: [0.         0.         0.03584328 0.         0.04667887 0.04553762
 0.1035061  0.         0.         0.         0.         0.10088308
 0.12554559 0.08551036 0.         0.15554854 0.         0.08147701
 0.         0.00450905]
(1, 10, 2048)
LSTM Output: [[0.1671315  0.16944902 0.16919437 0.16626617 0.161529   0.16642997]]
