In [3]:
import os
import cv2
import numpy as np
import mediapipe as mp
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard

In [4]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

In [5]:
def mediapipe_detection(image, model):
    """Convert BGR to RGB and make detection"""
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = model.process(image)
    image.flags.writeable = True
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    return image, results


In [6]:
def draw_landmarks(image, results):
    """Draw landmark connections"""
    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
    
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

In [36]:
def extract_keypoints(results):
    """Extract keypoints from MediaPipe results"""
    # Extract face landmarks
    # face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(1404)
    
    # Extract pose landmarks
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(132)
    
    # Extract left hand landmarks
    left_hand = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(63)
    
    # Extract right hand landmarks
    right_hand = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(63)
    return np.concatenate([pose, left_hand, right_hand])


In [20]:
def extract_frames_from_videos(input_folder, output_folder, sequence_length=30):
    """
    Extract frames from videos in input folder and save keypoints
    
    Args:
    input_folder (str): Folder containing subfolders with videos
    output_folder (str): Folder to save extracted keypoints
    sequence_length (int): Number of frames to extract from each video
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Initialize MediaPipe Holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        # Iterate through sign language action folders
        for action in os.listdir(input_folder):
            action_path = os.path.join(input_folder, action)
            
            # Create output action folder
            output_action_path = os.path.join(output_folder, action)
            os.makedirs(output_action_path, exist_ok=True)
            
            # Find video in the action folder
            for video_file in os.listdir(action_path):
                if video_file.endswith(('.mp4', '.avi', '.mov')):
                    video_path = os.path.join(action_path, video_file)
                    
                    # Open video
                    cap = cv2.VideoCapture(video_path)
                    
                    # Print video details
                    print(f"\nProcessing Video:")
                    print(f"Action: {action}")
                    print(f"Video File: {video_file}")
                    
                    # Extract frames at regular intervals
                    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    fps = cap.get(cv2.CAP_PROP_FPS)
                    video_duration = total_frames / fps
                    
                    print(f"Total Frames: {total_frames}")
                    print(f"FPS: {fps}")
                    print(f"Video Duration: {video_duration:.2f} seconds")
                    
                    step = max(1, total_frames // sequence_length)
                    print(f"Extracting {sequence_length} frames at intervals of {step} frames")
                    
                    extracted_frames = 0
                    for frame_num in range(0, total_frames, step):
                        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                        ret, frame = cap.read()
                        
                        if not ret:
                            break
                        
                        # Perform detection
                        image, results = mediapipe_detection(frame, holistic)
                        
                        # Draw action and frame info on image
                        cv2.putText(image, f"Action: {action}", (10, 30), 
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                        cv2.putText(image, f"Video: {video_file}", (10, 60), 
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                        cv2.putText(image, f"Frame: {frame_num}/{total_frames}", (10, 90), 
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                        
                        # Draw landmarks
                        draw_landmarks(image, results)
                        
                        # Extract keypoints
                        keypoints = extract_keypoints(results)
                        
                        # Save keypoints
                        np.save(os.path.join(output_action_path, f'frame_{frame_num}.npy'), keypoints)
                        
                        # Show image
                        cv2.imshow('Frame Extraction', image)
                        
                        extracted_frames += 1
                        
                        # Wait and check for quit
                        if cv2.waitKey(1) & 0xFF == ord('q'):
                            break
                    
                    print(f"Extracted {extracted_frames} frames")
                    
                    cap.release()
                    
                    # Stop after processing first video in each action folder
                    break
    
    # Close all OpenCV windows
    cv2.destroyAllWindows()

In [43]:
def prepare_data_for_lstm(data_path, sequence_length=30, num_features=258):
    """
    Prepare data for LSTM model

    Args:
    data_path (str): Path to extracted keypoints
    sequence_length (int): Number of frames in each sequence
    num_features (int): Number of features per frame

    Returns:
    X (numpy array): Sequences of keypoints
    y (numpy array): Corresponding labels (one-hot encoded)
    """
    X = []
    y = []
    
    # Get action labels (each subfolder represents an action)
    actions = sorted(os.listdir(data_path))  # Sort to maintain order
    
    for action_idx, action in enumerate(actions):
        action_path = os.path.join(data_path, action)
        
        # Get all frames for the action
        frame_files = sorted(os.listdir(action_path))  # Ensure proper frame order
        sequences = []
        
        # Create sequences of `sequence_length` frames
        for i in range(len(frame_files) - sequence_length + 1):
            sequence = []
            for j in range(sequence_length):
                frame_path = os.path.join(action_path, frame_files[i + j])
                frame_data = np.load(frame_path)  # Shape should be (258,)
                
                if frame_data.shape != (num_features,):  # Ensure correct shape
                    print(f"Skipping {frame_path}, incorrect shape: {frame_data.shape}")
                    continue
                
                sequence.append(frame_data)
            
            if len(sequence) == sequence_length:  # Ensure full sequence
                sequences.append(sequence)

        if sequences:  # If valid sequences were found
            X.extend(sequences)
            y.extend([action_idx] * len(sequences))

    # Convert to NumPy arrays
    X = np.array(X, dtype=np.float32)  # Shape: (num_samples, 30, 258)
    y = to_categorical(y, num_classes=len(actions))  # One-hot encode labels
    
    print(f"Final X shape: {X.shape}, Final y shape: {y.shape}")
    return X, y

In [40]:
def create_lstm_model(num_actions, sequence_length=30, num_features=258):
    """
    Create LSTM model for sign language recognition
    
    Args:
    num_actions (int): Number of different sign actions
    sequence_length (int): Number of frames in each sequence
    num_features (int): Number of features per frame
    
    Returns:
    Compiled Keras model
    """
    model = Sequential([
        LSTM(64, return_sequences=True, input_shape=(30, 258)),
        LSTM(128),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(num_actions, activation='softmax')
    ])
    
    model.compile(
        optimizer='Adam', 
        loss='categorical_crossentropy', 
        metrics=['categorical_accuracy']
    )
    
    return model

In [52]:

# Load trained model
model = tf.keras.models.load_model('sign_language.keras')

# Get action labels
actions = os.listdir('np')

# Initialize MediaPipe Holistic model
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    # Open webcam
    cap = cv2.VideoCapture(0)
    
    # Sequence collection
    sequence = []
    sentence = []
    predictions = []
    threshold = 0.5
    
    while True:
        # Read frame
        ret, frame = cap.read()
        
        # Make detection
        image, results = mediapipe_detection(frame, holistic)
        
        # Draw landmarks
        draw_landmarks(image, results)
        
        # Prediction logic
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]
        
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            predictions.append(np.argmax(res))
            
            # Smooth predictions
            if np.unique(predictions[-10:])[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    # Add action to sentence if consistent
                    if len(sentence) > 0:
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])
            
            # Limit sentence length
            if len(sentence) > 5:
                sentence = sentence[-5:]
        
        # Visualize predictions
        cv2.rectangle(image, (0,0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3,30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)
        
        # Show image
        cv2.imshow('Sign Language Recognition', image)
        
        # Break loop
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
    
    # Release resources
    cap.release()
    cv2.destroyAllWindows()


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16m

KeyboardInterrupt: 

In [49]:

def main():
    # Set paths
    input_videos_path = 'croppedcompressed'
    keypoints_path = 'np'
    
    # Extract frames and keypoints from videos
    # extract_frames_from_videos(input_videos_path, keypoints_path)
    
    # Prepare data for training
    X_train, y_train = prepare_data_for_lstm(keypoints_path)
    
    # Split data
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Create and train model
    model = create_lstm_model(len(os.listdir(keypoints_path)))
    model.fit(X_train, y_train, epochs=80)
    
    # Save model
    model.save('sign_language.keras')

if __name__ == "__main__":
    main()


Final X shape: (2256, 30, 258), Final y shape: (2256, 327)
Epoch 1/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - categorical_accuracy: 0.0043 - loss: 5.7603
Epoch 2/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - categorical_accuracy: 0.0151 - loss: 5.4100
Epoch 3/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - categorical_accuracy: 0.0293 - loss: 4.8753
Epoch 4/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - categorical_accuracy: 0.0317 - loss: 4.4085
Epoch 5/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - categorical_accuracy: 0.0495 - loss: 4.0822
Epoch 6/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - categorical_accuracy: 0.0769 - loss: 3.8073
Epoch 7/80
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - categorical_accuracy: 0.0745 - loss: 3.6764
Epoch 8/80
[1m71/71[0m

In [45]:
model.save('sign_language_model.h5')

NameError: name 'model' is not defined

In [54]:
cap.release()
cv2.destroyAllWindows()


In [42]:
import numpy as np

file_path = "np/3G/frame_16.npy"

# Load the .npy file
data = np.load(file_path)

# Print the shape
print("Shape of the .npy file:", data.shape)


Shape of the .npy file: (258,)
