In [26]:
import cv2
import mediapipe as mp
import numpy as np
import time
from tensorflow.keras.applications import MobileNetV3Small
from tensorflow.keras.layers import Dense, Dropout, Flatten, Concatenate, Input, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [27]:
# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
pose = mp_pose.Pose()
mp_drawing = mp.solutions.drawing_utils

def extract_keypoints(image):
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    result = pose.process(image_rgb)
    
    # If keypoints detected
    if result.pose_landmarks:
        keypoints = []
        for landmark in result.pose_landmarks.landmark:
            keypoints.append([landmark.x, landmark.y, landmark.z])
        return np.array(keypoints).flatten()  # Flatten the keypoints array
    return np.zeros(33 * 3)  # If no keypoints, return zero array

In [28]:
# Create dataset directories
pose_classes = ['Adho Mukha Svanasana', 'Phalakasana', 'Utkata Konasana', 'Virabhadrasana II', 'Vrikshasana']
data = []
labels = []
images = []  # To store images for MobileNetV3Small input

In [29]:
import os
print("Current Working Directory:", os.getcwd())

Current Working Directory: C:\Users\hp


In [30]:
dataset_path = r'C:\Users\hp\Downloads\balanced_dataset'
print(os.listdir(dataset_path))

['TEST', 'TRAIN']


In [31]:
test_path = os.path.join(dataset_path, 'TEST')
train_path = os.path.join(dataset_path, 'TRAIN')

In [32]:
print("Contents of TEST directory:", os.listdir(test_path))
print("Contents of TRAIN directory:", os.listdir(train_path))

Contents of TEST directory: ['Adho Mukha Svanasana', 'Phalakasana', 'Utkata Konasana', 'Virabhadrasana II', 'Vrikshasana']
Contents of TRAIN directory: ['Adho Mukha Svanasana', 'Phalakasana', 'Utkata Konasana', 'Virabhadrasana II', 'Vrikshasana']


In [33]:
for pose_class in pose_classes:
    pose_class_path = os.path.join(test_path, pose_class)
    try:
        image_files = os.listdir(pose_class_path)
    except FileNotFoundError:
        print(f"Directory not found: {pose_class_path}")
        continue  # Skip to the next class if directory is not found
    for image_file in image_files:
        image_path = os.path.join(pose_class_path, image_file)
        image = cv2.imread(image_path)
        
        # Resize image to 224x224 for MobileNetV3Small
        image_resized = cv2.resize(image, (224, 224))
        images.append(image_resized)  # Append resized images for MobileNetV3Small

        keypoints = extract_keypoints(image)
        data.append(keypoints)
        labels.append(pose_classes.index(pose_class))

In [34]:
data = np.array(data)
labels = np.array(labels)
images = np.array(images) / 255.0  # Normalize the images

In [35]:
print(f'Data shape: {data.shape}, Labels shape: {labels.shape}, Images shape: {images.shape}')

Data shape: (470, 99), Labels shape: (470,), Images shape: (470, 224, 224, 3)


In [36]:
# Split data into training and test sets
X_train_keypoints, X_test_keypoints, X_train_images, X_test_images, y_train, y_test = train_test_split(
    data, images, labels, test_size=0.2, random_state=42
)

In [37]:
# Load pre-trained MobileNetV3Small without the top layers (for feature extraction)
mobilenetv3 = MobileNetV3Small(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

In [38]:
# Freeze the MobileNetV3Small layers to retain the pre-trained weights
for layer in mobilenetv3.layers:
    layer.trainable = False

In [39]:
# Build the combined model
def build_combined_model(mobilenetv3_model):
    # Input for MobileNetV3Small (image data)
    image_input = mobilenetv3_model.input
    image_features = Flatten()(mobilenetv3_model.output)  # Flatten MobileNetV3Small output
    
    # Input for keypoints
    keypoints_input = Input(shape=(99,))  # 33 keypoints with x, y, z coordinates
    
    # Concatenate image features and keypoints
    combined = Concatenate()([image_features, keypoints_input])
    
    # Fully connected layers with Batch Normalization
    fc1 = Dense(256, activation='relu')(combined)
    fc1 = BatchNormalization()(fc1)
    fc2 = Dense(128, activation='relu')(fc1)
    fc2 = BatchNormalization()(fc2)
    dropout = Dropout(0.5)(fc2)
    
    # Output layer
    output = Dense(len(pose_classes), activation='softmax')(dropout)
    
    # Create the combined model
    model = Model(inputs=[image_input, keypoints_input], outputs=output)
    return model

In [40]:
# Build and compile the model
model = build_combined_model(mobilenetv3)
model.compile(optimizer=Adam(learning_rate=1e-4), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [41]:
# Summary of the model
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 rescaling_1 (Rescaling)        (None, 224, 224, 3)  0           ['input_3[0][0]']                
                                                                                                  
 Conv (Conv2D)                  (None, 112, 112, 16  432         ['rescaling_1[0][0]']            
                                )                                                                 
                                                                                            

In [42]:
# Callbacks for improving training
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)

In [43]:
# Train the model using both image and keypoint inputs
history = model.fit(
    [X_train_images, X_train_keypoints],  # Image data and keypoint data
    y_train,
    validation_data=([X_test_images, X_test_keypoints], y_test),
    epochs=50,  # Increased epochs for better learning
    batch_size=32,
    callbacks=[early_stopping, reduce_lr]
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50


In [44]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate([X_test_images, X_test_keypoints], y_test)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 81.91%


In [45]:
# Real-time pose detection with proper inputs
class_labels = ['Adho Mukha Svanasana', 'Phalakasana', 'Utkata Konasana', 'Virabhadrasana II', 'Vrikshasana']
confidence_threshold = 0.5  # Set a threshold for detection confidence

In [46]:
# Start webcam feed
cap = cv2.VideoCapture(0)  # Change '0' to a file path if using a video

if not cap.isOpened():
    print("Error: Could not open webcam.")
    exit()

# Countdown timer for 10 seconds before capturing the image
countdown_seconds = 10
start_time = time.time()

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Could not read frame.")
        break

    # Flip frame for natural webcam view
    frame = cv2.flip(frame, 1)

    # Calculate remaining countdown time
    elapsed_time = time.time() - start_time
    remaining_time = max(0, countdown_seconds - int(elapsed_time))

    # Display countdown timer on the screen
    if remaining_time > 0:
        cv2.putText(frame, f"Capturing in: {remaining_time}s", 
                    (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 
                    1, (0, 255, 255), 2)
        cv2.imshow('Yoga Pose Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        continue  # Wait until countdown ends

    # Capture the frame after 10 seconds
    captured_frame = frame.copy()

    # Resize frame for model input
    input_frame_image = cv2.resize(captured_frame, (224, 224)) / 255.0
    input_frame_image = np.expand_dims(input_frame_image, axis=0)  # Normalize and add batch dimension

    # Extract keypoints
    input_frame_keypoints = extract_keypoints(captured_frame).reshape(1, -1)

    # Predict pose
    predictions = model.predict([input_frame_image, input_frame_keypoints])
    predicted_class_index = np.argmax(predictions)
    confidence = np.max(predictions)

    # Check confidence and display result
    if confidence > confidence_threshold:
        predicted_class = class_labels[predicted_class_index]
    else:
        predicted_class = "Unknown"

    # Display the prediction on the captured frame
    cv2.putText(captured_frame, f"Pose: {predicted_class} ({confidence:.2f})", 
                (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 
                1, (255, 0, 0), 2)

    cv2.imshow('Yoga Pose Detection', captured_frame)

    # Allow user to exit or restart the detection
    if cv2.waitKey(0) & 0xFF == ord('q'):
        break
    else:
        start_time = time.time()  # Restart countdown

# Release resources
cap.release()
cv2.destroyAllWindows()

