Imports

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras import models

Constants

In [2]:
IMG_SIZE = 192 # Size of images that fit the MoveNet requirement

Loading the testing data

In [3]:

test_dir = "../testing_modified" # Change this to an appropriate directory for testing, 
                        # it should contain the test samples in the same format as in the training folder i.e.
                        # images in their class specific folders (for example "./testing/32/*.png")

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    labels="inferred",
    label_mode="categorical",
    color_mode="rgb",
    image_size=(IMG_SIZE, IMG_SIZE),
    batch_size=None,
)

Found 12088 files belonging to 32 classes.


Skeleton data extraction with Movenet-SinglePose-Lightning model

In [4]:
module = hub.load("https://tfhub.dev/google/movenet/singlepose/lightning/4")

def movenet(input_image, module):


    model = module.signatures['serving_default']

    # SavedModel format expects tensor type of int32.
    input_image = tf.cast(input_image, dtype=tf.int32)
    input_image = tf.expand_dims(input_image, 0)
    # Run model inference.
    outputs = model(input_image)
    # Output is a [1, 1, 17, 3] tensor.
    keypoints_with_scores = outputs['output_0'].numpy()
    return keypoints_with_scores

skeleton_data = []
for image, label in test_ds: # this can take a while
    keypoints_with_scores = movenet(image, module)
    skeleton = keypoints_with_scores
    skeleton[skeleton[:, :, :, 2] < 0.3] = 0 # 0.3 is a recommended threshold for the score of nodes

    skeleton_data.append(skeleton.flatten())

skeleton_ds = tf.data.Dataset.from_tensor_slices(skeleton_data)

2024-03-30 16:08:37.229436: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Obtaining the labels

In [5]:
label_names = np.array(test_ds.class_names)
print(label_names)

['1' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '2' '20' '21' '22'
 '23' '24' '25' '26' '27' '28' '29' '3' '30' '31' '32' '4' '5' '6' '7' '8'
 '9']


Combining the datasets into multimodal inputs

In [6]:
def make_multimodal_dataset(image_and_label, skeleton):
    image, label = image_and_label
    return (image, skeleton), label

test_ds = tf.data.Dataset.zip(test_ds, skeleton_ds)
test_ds = test_ds.map(make_multimodal_dataset).batch(128)
test_ds = test_ds.prefetch(buffer_size=tf.data.AUTOTUNE)

In [7]:
for (example_imgs, example_labels), example_skeleton in test_ds.take(1):
    print(example_imgs.shape)
    print(example_labels.shape)
    print(example_skeleton.shape)


(128, 192, 192, 3)
(128, 51)
(128, 32)


2024-03-30 16:08:37.693730: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Defining the multimodal model base for loading the weights

In [8]:
def MultimodalModel():

    dense_model = models.Sequential(
        [
            layers.Input(shape=(51,)),

            layers.Dense(1024, 
                        activation="relu",
                        kernel_regularizer=tf.keras.regularizers.l1(0.0001),
                        kernel_initializer="lecun_uniform"),
            layers.Dense(512, 
                        activation="relu",
                        kernel_regularizer=tf.keras.regularizers.l1(0.0001),
                        kernel_initializer="lecun_uniform"),
            layers.Dense(
                256,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l1(0.0001),
                kernel_initializer="lecun_uniform",
            )
        ],
        name="skeleton_dense"
    )

    base_model = tf.keras.applications.EfficientNetV2B0(
        weights="imagenet",
        include_top=False,
        input_shape=(192, 192, 3),
        include_preprocessing=True,
    )
    base_model.trainable = False

    cnn_model = models.Sequential(
        [
            layers.Input(shape=(192, 192, 3)),
            base_model,
            layers.GlobalAveragePooling2D(),
            layers.Dropout(0.5),
            layers.Dense(
                256,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l1(0.0001),
                kernel_initializer="lecun_uniform",
            ),
            layers.Dropout(0.5),
            layers.Dense(
                128,
                activation="relu",
                kernel_regularizer=tf.keras.regularizers.l1(0.0001),
                kernel_initializer="lecun_uniform",
            ),
        ],
        name="Transfered_Efficient"
    )

    image_input = layers.Input(shape=(192, 192, 3))
    skeleton_input = layers.Input(shape=(51,))

    cnn_output = cnn_model(image_input)
    skeleton_output = dense_model(skeleton_input)

    concat = layers.Concatenate()([cnn_output, skeleton_output])
    dense_out = layers.Dense(128, activation="relu")(concat)
    model_output = layers.Dense(32, activation="softmax")(dense_out)
    return models.Model([image_input, skeleton_input], model_output)

model = MultimodalModel()

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.CategoricalAccuracy(),
             tf.keras.metrics.TopKCategoricalAccuracy(k=5)]
)

Build the model and load pretrained weights to it. The model loading is done this way as there seems to be a bug regarding tf.keras.models.load_model() with the latest version of Tensorflow. A Github issue regarding this can be found [here](https://github.com/tensorflow/tensorflow/issues/63853).

In [9]:

model.predict((np.ones((1,192,192,3)),np.ones((1,51)))) # dummy input to "build" the model
model.load_weights("models/mm_model2_weights_only.weights.h5", skip_mismatch=False) # load the trained weights

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 922ms/step


  trackable.load_own_variables(weights_store.get(inner_path))


Model evaluation

In [10]:
metrics = model.evaluate(test_ds, return_dict=True, verbose=2)
print(f"Top 1 accuracy: {metrics['categorical_accuracy']*100:.2f} % | Top 5 accuracy: {metrics['top_k_categorical_accuracy']*100:.2f} %")

95/95 - 88s - 931ms/step - categorical_accuracy: 0.4702 - loss: 1.9576 - top_k_categorical_accuracy: 0.8926
Top 1 accuracy: 47.02 % | Top 5 accuracy: 89.26 %
