In [7]:
import cv2
import numpy as np
import tensorflow as tf
import mediapipe as mp
import pandas as pd
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [8]:
# Initialize MediaPipe Holistic
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Define Actions
actions = ["No_action", "No_ball", "clap"]
current_action_index = 0

In [3]:
# OpenCV Video Capture
cap = cv2.VideoCapture(0)

# Extract Keypoints Function
def extract_keypoints(results):
    if results.pose_landmarks:
        pose = np.array([[res.x, res.y, res.z] for res in results.pose_landmarks.landmark])
    else:
        pose = np.zeros((33, 3))  # 33 keypoints with (x, y, z)
    return pose.flatten()

# Create or Append to CSV
data = []
labels = []

data_file = "action_data.csv"
if os.path.exists(data_file):
    df_existing = pd.read_csv(data_file)
    data = df_existing.iloc[:, :-1].values.tolist()
    labels = df_existing.iloc[:, -1].tolist()

# Start capturing
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(image)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

        keypoints = extract_keypoints(results)
        if keypoints.any():
            data.append(keypoints)
            labels.append(actions[current_action_index])
        
        cv2.putText(image, f"Recording: {actions[current_action_index]}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
        cv2.imshow("Webcam Feed", image)
        
        key = cv2.waitKey(10) & 0xFF
        if key == ord('q'):
            break
        elif key == ord(' '):  # Press spacebar to switch action
            current_action_index = (current_action_index + 1) % len(actions)
            print(f"Switched to: {actions[current_action_index]}")

# Save to CSV
df = pd.DataFrame(data)
df["label"] = labels
df.to_csv("action_data.csv", index=False)

cap.release()
cv2.destroyAllWindows()


Switched to: No_ball
Switched to: clap


In [4]:
# Load Data for Training
df = pd.read_csv("action_data.csv")
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

# Encode Labels
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)

# Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# LSTM Model
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(33, 3)),
    LSTM(32, return_sequences=False, activation='relu'),
    Dense(32, activation='relu'),
    Dense(len(actions), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train.reshape(-1, 33, 3), Y_train, epochs=50, validation_data=(X_test.reshape(-1, 33, 3), Y_test))

# Save Model
model.save("lstm_action_model.h5")


Epoch 1/50


  super().__init__(**kwargs)


[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.4772 - loss: 0.9838 - val_accuracy: 0.9893 - val_loss: 0.0501
Epoch 2/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9854 - loss: 0.0654 - val_accuracy: 0.9859 - val_loss: 0.0553
Epoch 3/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9893 - loss: 0.0458 - val_accuracy: 0.9913 - val_loss: 0.0340
Epoch 4/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9864 - loss: 0.0548 - val_accuracy: 0.9953 - val_loss: 0.0241
Epoch 5/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9949 - loss: 0.0233 - val_accuracy: 0.9973 - val_loss: 0.0161
Epoch 6/50
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.9859 - loss: 0.1106 - val_accuracy: 0.3662 - val_loss: 0.9424
Epoch 7/50
[1m187/187[0m [32m━



In [5]:
# Load Model for Prediction
model = tf.keras.models.load_model("lstm_action_model.h5")

# Real-Time Detection
cap = cv2.VideoCapture(0)
sequence = []
sequence_length = 30

with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = holistic.process(image)
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)

        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        
        if len(sequence) > sequence_length:
            sequence.pop(0)

        if len(sequence) == sequence_length:
            input_data = np.expand_dims(sequence, axis=0)
            prediction = model.predict(input_data)
            action = actions[np.argmax(prediction)]
            cv2.putText(image, f"Detected: {action}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        cv2.imshow("Webcam Feed", image)
        
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

cap.release()
cv2.destroyAllWindows()




NameError: name 'extract_keypoints' is not defined

In [9]:
import cv2
import numpy as np
import tensorflow as tf
import mediapipe as mp

# Load Model
model = tf.keras.models.load_model("lstm_action_model.h5")

# Define Actions
actions = ["No_action", "No_ball", "clap"]

# OpenCV Video Capture
cap = cv2.VideoCapture(0)

# Real-Time Detection
sequence = []
sequence_length = 30

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    sequence.append(image)
    
    if len(sequence) > sequence_length:
        sequence.pop(0)

    if len(sequence) == sequence_length:
        input_data = np.expand_dims(sequence, axis=0)
        prediction = model.predict(input_data)
        action = actions[np.argmax(prediction)]
        cv2.putText(image, f"Detected: {action}", (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    cv2.imshow("Webcam Feed", image)
    
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("sequential_1/Cast:0", shape=(1, 30, 480, 640, 3), dtype=float32). Expected shape (None, 33, 3), but input has incompatible shape (1, 30, 480, 640, 3)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 30, 480, 640, 3), dtype=uint8)
  • training=False
  • mask=None