In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import mediapipe as mp
from tensorflow.keras.models import model_from_json
from time import sleep
from mpl_toolkits.mplot3d import Axes3D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Short aliases for the MediaPipe solution modules used throughout the notebook:
# landmark drawing helpers, their default drawing styles, and the Hands solution.
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles
mp_hands = mp.solutions.hands

def mediapipe_detection(image, model):
    """Run a MediaPipe model on a BGR image.

    Args:
        image: Image in OpenCV's BGR channel order.
        model: Object exposing ``process(rgb_image)`` (here a MediaPipe Hands
            instance).

    Returns:
        A ``(image, results)`` tuple: the image converted to RGB and back to
        BGR, and whatever ``model.process`` returned.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The buffer is read-only only while the model looks at it
    # (presumably so MediaPipe can take it by reference - TODO confirm).
    rgb.flags.writeable = False
    results = model.process(rgb)
    rgb.flags.writeable = True

    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr, results

def draw_styled_landmarks(image, results):
    """Draw every detected hand onto ``image`` in place.

    Args:
        image: Image passed through to ``mp_drawing.draw_landmarks``.
        results: Object with a ``multi_hand_landmarks`` attribute; nothing is
            drawn when that attribute is empty or ``None``.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return

    for hand in detected_hands:
        mp_drawing.draw_landmarks(
            image,
            hand,
            mp_hands.HAND_CONNECTIONS,
            mp_drawing_styles.get_default_hand_landmarks_style(),
            mp_drawing_styles.get_default_hand_connections_style(),
        )


def extract_keypoints(results):
    """Flatten the detected hand landmarks into a single 1-D vector.

    Args:
        results: Object with a ``multi_hand_landmarks`` attribute whose items
            expose ``.landmark``, an iterable of points with ``x``/``y``/``z``.

    Returns:
        A 1-D NumPy array holding ``x, y, z`` for every landmark of every
        detected hand, hands concatenated in detection order. When no hand is
        detected the result is ``21 * 3`` zeros. The length therefore depends
        on how many hands were detected.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return np.zeros(21*3)

    per_hand = []
    for hand_landmarks in detected_hands:
        if hand_landmarks:
            coords = np.array(
                [[point.x, point.y, point.z] for point in hand_landmarks.landmark]
            ).flatten()
        else:
            # A falsy entry contributes an all-zero hand.
            coords = np.zeros(21*3)
        per_hand.append(coords)
    return np.concatenate(per_hand)

# Root folder for the exported keypoint arrays (.npy files)
DATA_PATH = 'MP_Data1'

# Class labels, in the order used for label indices
actions = np.array([
    'A', 'Absen', 'akhir', 'apung', 'awal',
    'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
])

# Number of sequences per action, and number of frames per sequence
no_sequences = sequence_length = 30

In [None]:
def collect_keypoints_from_images(image_dir='Image'):
    """Extract hand keypoints from still images and save them as .npy files.

    For every action and every sequence index the image
    ``<image_dir>/<action>/<sequence>.png`` is run through MediaPipe Hands
    ``sequence_length`` times, and each result is written to
    ``DATA_PATH/<action>/<sequence>/<frame_num>.npy``.

    Images that cannot be read are reported and skipped instead of crashing
    inside OpenCV. Pressing ``q`` in the preview window stops the whole
    collection, not just the current sequence.

    Args:
        image_dir: Folder containing one sub-folder of PNG images per action.
    """
    # One output folder per (action, sequence). exist_ok makes re-runs safe
    # without hiding real failures such as permission errors.
    for action in actions:
        for sequence in range(no_sequences):
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

    quit_key = ord('q')

    try:
        with mp_hands.Hands(
                model_complexity=0,
                min_detection_confidence=0.5,
                min_tracking_confidence=0.5) as hands:

            for action in actions:
                for sequence in range(no_sequences):
                    # The source image is the same for every frame of the
                    # sequence, so read it once. mediapipe_detection returns
                    # a converted copy, so `frame` is never drawn on.
                    image_path = '{}/{}/{}.png'.format(image_dir, action, sequence)
                    frame = cv2.imread(image_path)
                    if frame is None:
                        # cv2.imread signals a missing/unreadable file with None.
                        print('Skipping {}: image could not be read'.format(image_path))
                        continue

                    caption = 'Collecting frames for {} Video Number {}'.format(action, sequence)

                    for frame_num in range(sequence_length):
                        # Make detections and draw them on the preview image
                        image, results = mediapipe_detection(frame, hands)
                        draw_styled_landmarks(image, results)

                        quit_requested = False
                        if frame_num == 0:
                            cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                        cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)
                        cv2.putText(image, caption, (15,12),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                        cv2.imshow('OpenCV Feed', image)
                        if frame_num == 0:
                            # Short pause at the start of each sequence; a 'q'
                            # pressed during the pause is honoured as well.
                            quit_requested = cv2.waitKey(200) & 0xFF == quit_key

                        # Export keypoints (np.save appends the .npy extension)
                        keypoints = extract_keypoints(results)
                        npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                        np.save(npy_path, keypoints)

                        # Stop the whole collection, not only the inner loop
                        if quit_requested or cv2.waitKey(10) & 0xFF == quit_key:
                            return
    finally:
        # Close the preview window even if something above raised.
        cv2.destroyAllWindows()


collect_keypoints_from_images()

In [None]:
def visualize_hand_landmarks(action, sequence, frame_num):
    """Show a 3D scatter plot of the hand landmarks stored for one frame.

    Loads ``DATA_PATH/<action>/<sequence>/<frame_num>.npy`` and plots the first
    21 landmarks (and the next 21 if the file holds a second hand) together
    with the MediaPipe hand-connection lines.

    NOTE(review): the saved vector carries no handedness information, so the
    "Right"/"Left" legend labels simply mean "first"/"second" stored hand -
    confirm against how the data was recorded.

    Args:
        action: Name of the action folder.
        sequence: Sequence index within the action.
        frame_num: Frame index within the sequence.
    """
    values_per_hand = 21 * 3

    # Load the keypoints from the .npy file
    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num) + '.npy')
    keypoints = np.load(npy_path)

    rh_3d = keypoints[:values_per_hand].reshape((21, 3))
    # A second hand is only plotted when a complete block of values is present.
    lh_3d = None
    if len(keypoints) >= 2 * values_per_hand:
        lh_3d = keypoints[values_per_hand:2 * values_per_hand].reshape((21, 3))

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    def _plot_hand(points, color, label):
        # Scatter the 21 landmarks, then join them along HAND_CONNECTIONS.
        ax.scatter(points[:, 0], points[:, 1], points[:, 2],
                   marker='o', s=50, c=color, label=label)
        for start_point, end_point in mp_hands.HAND_CONNECTIONS:
            ax.plot([points[start_point, 0], points[end_point, 0]],
                    [points[start_point, 1], points[end_point, 1]],
                    [points[start_point, 2], points[end_point, 2]], c='b')

    _plot_hand(rh_3d, 'r', 'Right Hand Landmarks')
    if lh_3d is not None:
        _plot_hand(lh_3d, 'g', 'Left Hand Landmarks')

    # Set axis labels
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')

    ax.set_title(f'3D Hand Landmarks with Connections - Action: {action}, Sequence: {sequence}, Frame: {frame_num}')

    # The scatter calls set labels; without a legend they are never shown.
    ax.legend()

    plt.show()

# Example: plot the first saved frame of sequence 9 for the 'Absen' action.
# These assignments rebind the notebook-level names `action`, `sequence`
# and `frame_num`.
action = 'Absen'
sequence = 9
frame_num = 0

visualize_hand_landmarks(action, sequence, frame_num)

In [None]:
# Constants
keypoints_per_frame = 63  # 21 keypoints * 3 dimensions
DATA_PATH = 'MP_Data1'
actions = np.array(['A', 'Absen', 'akhir', 'apung', 'awal', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'])
no_sequences = 30
sequence_length = 30


def load_frame_keypoints(filepath):
    """Load one frame's keypoints as exactly ``keypoints_per_frame`` values.

    Longer vectors are truncated to the first ``keypoints_per_frame`` values
    and shorter ones are zero-padded at the end.
    """
    res = np.load(filepath).flatten()
    if res.shape[0] > keypoints_per_frame:
        return res[:keypoints_per_frame]  # Trim if too long
    if res.shape[0] < keypoints_per_frame:
        return np.pad(res, (0, keypoints_per_frame - res.shape[0]), 'constant')  # Pad if too short
    return res


def plot_training_curve(history, metric, title, ylabel):
    """Plot the training and validation curves of one metric from ``history``."""
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


# Load Data
label_map = {label: num for num, label in enumerate(actions)}
sequences, labels = [], []
skipped_sequences = 0
for action in actions:
    for sequence in range(no_sequences):
        frame_paths = [
            os.path.join(DATA_PATH, action, str(sequence), f"{frame_num}.npy")
            for frame_num in range(sequence_length)
        ]
        # A sequence with missing frames would be shorter than the others and
        # break the fixed-shape reshape below, so drop the whole sequence.
        if not all(os.path.exists(path) for path in frame_paths):
            skipped_sequences += 1
            continue
        sequences.append([load_frame_keypoints(path) for path in frame_paths])
        labels.append(label_map[action])

if skipped_sequences:
    print(f"Skipped {skipped_sequences} incomplete sequence(s).")
if not sequences:
    raise FileNotFoundError(f"No complete sequences found under '{DATA_PATH}'.")

# Convert to numpy array and reshape
X = np.array(sequences).reshape(-1, sequence_length, keypoints_per_frame, 1)  # Added extra dimension for channels
y = to_categorical(labels, num_classes=len(actions))

# Train/validation/test split
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

# Print out the shapes of the datasets (a bare expression mid-cell shows nothing)
print(X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape)

# Build the model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(sequence_length, keypoints_per_frame, 1)))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(actions), activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model, validating on the held-out validation split created above
# (previously that split was unused and fit() carved a second validation set
# out of X_train via validation_split).
# NOTE: early stopping is left disabled, as before. To enable it, pass
# callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)].
history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model
eval_result = model.evaluate(X_test, y_test)
print(f"Test Loss: {eval_result[0]}")
print(f"Test Accuracy: {eval_result[1]}")

# Save the model
model.save('final_model.h5')

# Save the model architecture as JSON
model_json = model.to_json()
with open('final_model.json', 'w') as json_file:
    json_file.write(model_json)
print("Model architecture saved as JSON.")

# Plot training & validation accuracy and loss
plot_training_curve(history, 'accuracy', 'Model accuracy', 'Accuracy')
plot_training_curve(history, 'loss', 'Model loss', 'Loss')

In [None]:
# Load the model architecture from JSON, then the weights from the HDF5 file.
# The context manager closes the file even if reading fails.
with open('final_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
model.load_weights('final_model.h5')

# Setup MediaPipe Hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(model_complexity=0, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Class labels; the index of each label is the class index predicted by the model
actions = np.array(['A','Absen','akhir','apung','awal','B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'])
# Function to process a single frame's keypoints
def process_keypoints(frame, hands_model):
    """Detect hands in a BGR frame and return the first hand's keypoints.

    Only the first detected hand is used. This matches the rest of the
    notebook: the training loader truncates every frame to its first 63
    values, and the live loop draws ``hand_landmarks[0]``. The previous
    version looped over all hands and so kept the last one.

    Args:
        frame: Image in BGR channel order.
        hands_model: Object exposing ``process(rgb_image)`` (a MediaPipe Hands
            instance).

    Returns:
        ``(keypoints, detected, multi_hand_landmarks)`` where ``keypoints`` is
        a length-63 array of ``x, y, z`` per landmark (all zeros when nothing
        was detected), ``detected`` is a bool, and ``multi_hand_landmarks`` is
        the detection list from the model, or ``None`` when no hand was found.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands_model.process(rgb_frame)
    keypoints = np.zeros((21 * 3,))

    if not results.multi_hand_landmarks:
        return keypoints, False, None

    first_hand = results.multi_hand_landmarks[0]
    for i, lm in enumerate(first_hand.landmark):
        keypoints[i * 3:i * 3 + 3] = (lm.x, lm.y, lm.z)
    return keypoints, True, results.multi_hand_landmarks

cap = cv2.VideoCapture(0)

# Must match the input shape the model was trained with
frames_per_prediction = 30
values_per_frame = 63

keypoints_sequence = []  # Keypoints of the frames collected for the next prediction
gesture_label = None     # Most recent prediction, kept so it stays on screen

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # A failed read used to `continue`, which spins forever without
            # reaching waitKey, so the loop could not be quit.
            print('Could not read a frame from the camera; stopping.')
            break

        # Process the current frame to get keypoints and hand landmarks
        keypoints, landmarks_detected, hand_landmarks = process_keypoints(frame, hands)

        if landmarks_detected:
            first_hand = hand_landmarks[0]

            # Draw hand landmarks
            mp_drawing.draw_landmarks(frame, first_hand, mp_hands.HAND_CONNECTIONS)

            # Landmark coordinates are scaled by the frame size to get pixels
            h, w = frame.shape[:2]
            # cv2.boundingRect needs int32 (or float32) points; NumPy's
            # default integer type is 64-bit on most platforms.
            bbox_coords = np.array(
                [(int(lm.x * w), int(lm.y * h)) for lm in first_hand.landmark],
                dtype=np.int32,
            )
            bx, by, bw, bh = cv2.boundingRect(bbox_coords)
            cv2.rectangle(frame, (bx, by), (bx + bw, by + bh), (255, 0, 0), 2)

            # Only frames with a detected hand contribute to the sequence
            keypoints_sequence.append(keypoints)

            # Once we have a full sequence, make a prediction
            if len(keypoints_sequence) == frames_per_prediction:
                keypoints_data = np.array(keypoints_sequence).reshape(
                    -1, frames_per_prediction, values_per_frame, 1)
                prediction = model.predict(keypoints_data, verbose=0)
                gesture_label = actions[np.argmax(prediction)]

                # Reset the sequence
                keypoints_sequence = []

        # Draw the latest prediction on every frame; it used to be drawn only
        # on the single frame where the prediction was made.
        if gesture_label is not None:
            cv2.putText(frame, f'Gesture Label: {gesture_label}', (10, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Display the frame
        cv2.imshow('Hand Gesture Recognition', frame)

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera, the MediaPipe graph and the window even on error
    cap.release()
    hands.close()
    cv2.destroyAllWindows()