# SignSpeak: Sign Language Data Exploration

This notebook explores sign language datasets and extracts key features for training the sign recognition model.

In [None]:
# Import libraries
import os
import sys
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mediapipe as mp
from tqdm.notebook import tqdm

# Add src directory to path
sys.path.append(os.path.abspath('../src'))

# Import SignSpeak modules
from sign_recognition.detector import SignDetector

## Setting Up MediaPipe

We'll use MediaPipe Holistic for pose, face, and hand landmark detection.

In [None]:
# Create the shared SignDetector instance; process_video() below reads this global
detector = SignDetector()

# Test with a sample image: an all-black 480x640 frame, converted BGR -> RGB for display
sample_image = np.zeros(shape=(480, 640, 3), dtype=np.uint8)
sample_rgb = cv2.cvtColor(sample_image, cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(sample_rgb)
ax.set_title('Sample Black Image')
ax.axis('off')
plt.show()

## Loading and Exploring Sign Language Datasets

In this section, we list publicly available sign language datasets that could be downloaded and explored. Nothing is downloaded here; for the rest of this notebook, assume we have access to a dataset of sign language videos.

In [None]:
# Helper that prints and returns a small catalog of public sign language datasets
def list_available_datasets():
    """Print a summary of known sign language datasets and return the catalog.

    Returns:
        Dict mapping each dataset name to a dict with the string fields
        'description', 'url' and 'size'.
    """
    # One row per dataset: (name, description, url, size)
    rows = (
        (
            "ASLLVD",
            "American Sign Language Lexicon Video Dataset",
            "https://www.bu.edu/asllrp/av/dai-asllvd.html",
            "9,794 videos of 3,000+ signs",
        ),
        (
            "MSASL",
            "Microsoft American Sign Language Dataset",
            "https://www.microsoft.com/en-us/download/details.aspx?id=100121",
            "25,000+ videos of 1,000 signs",
        ),
        (
            "WLASL",
            "Word-Level American Sign Language Dataset",
            "https://github.com/dxli94/WLASL",
            "21,000+ videos of 2,000 signs",
        ),
    )
    datasets = {
        name: {"description": description, "url": url, "size": size}
        for name, description, url, size in rows
    }

    # Collect the report line by line, then emit it in a single print call
    report = ["Available Sign Language Datasets:"]
    for name, info in datasets.items():
        report.extend([
            f"\n{name}:",
            f"  Description: {info['description']}",
            f"  URL: {info['url']}",
            f"  Size: {info['size']}",
        ])
    print("\n".join(report))

    return datasets

# Print the dataset summary and keep the returned catalog dict
available_datasets = list_available_datasets()

## Processing Video Data

Here we'll create a function to process video files and extract landmarks using MediaPipe.

In [None]:
def process_video(video_path, max_frames=None):
    """Process a video file and extract per-frame keypoints with the detector.

    Frames are read sequentially and passed to the notebook-level ``detector``
    (``detector.process_frame``); a progress bar is shown while reading.

    Args:
        video_path: Path to the video file
        max_frames: Maximum number of frames to process (None = all frames)

    Returns:
        A list of keypoint dictionaries, one per frame read (an empty dict
        where the detector returned no keypoints), or None if the video
        could not be opened.
    """
    # Create video capture object
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Could not open video file {video_path}")
        return None

    keypoints_sequence = []
    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Some backends/containers report 0 FPS; avoid a ZeroDivisionError
        # and report a zero duration instead.
        duration = frame_count / fps if fps > 0 else 0.0

        print(f"Video info: {frame_count} frames, {fps:.2f} FPS, {duration:.2f} seconds")

        # Limit frames if specified
        if max_frames is not None and max_frames < frame_count:
            frame_count = max_frames

        # Process frames. The loop bound comes from the reported frame count,
        # so a reported count of 0 means no frames are read.
        for _ in tqdm(range(frame_count)):
            ret, frame = cap.read()
            if not ret:
                # Stream ended earlier than the reported frame count
                break

            # Process frame using our detector
            _, keypoints = detector.process_frame(frame)

            # Keep one entry per frame: an empty dict marks "nothing detected"
            keypoints_sequence.append(keypoints if keypoints is not None else {})
    finally:
        # Always release the capture, even if the detector raises mid-video
        cap.release()

    return keypoints_sequence

# No video file is available in this demo, so only show how process_video would be called
print(
    "To process a video, call process_video(video_path)",
    "For example: keypoints = process_video('path/to/sign_video.mp4', max_frames=100)",
    sep="\n",
)

## Data Visualization

Let's create functions to visualize the extracted keypoints.

In [None]:
def visualize_keypoints(keypoints, figsize=(15, 10)):
    """Visualize keypoints in 3D space.

    Only the parts named 'face', 'pose', 'left_hand' and 'right_hand' are
    drawn, each in its own color. Parts whose values are all zero (or empty)
    are skipped. If nothing contains a non-zero value, a message is printed
    and no figure is created.

    Args:
        keypoints: Dictionary of keypoints from the SignDetector, mapping a
            part name to an array-like indexable as [:, 0], [:, 1], [:, 2]
            for x, y and z.
        figsize: Figure size (width, height)
    """
    # Reduce each part with np.any(): the builtin any() on its own would call
    # bool() on multi-element arrays and raise ValueError.
    has_data = bool(keypoints) and any(np.any(part) for part in keypoints.values())
    if not has_data:
        print("No keypoints to visualize.")
        return

    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111, projection='3d')

    # Plot keypoints by type
    colors = {
        'face': 'green',
        'pose': 'blue',
        'left_hand': 'red',
        'right_hand': 'purple'
    }

    plotted_any = False
    for part_name, part_keypoints in keypoints.items():
        if part_name not in colors or not np.any(part_keypoints):
            continue
        points = np.asarray(part_keypoints)
        ax.scatter(
            points[:, 0],
            points[:, 1],
            points[:, 2],
            c=colors[part_name],
            marker='o',
            label=part_name,
        )
        plotted_any = True

    # Set labels and title
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('Keypoints Visualization')

    # Set axis limits (MediaPipe gives normalized coordinates)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_zlim(-0.2, 0.2)

    # Add a legend only when at least one labeled part was drawn, otherwise
    # matplotlib warns about an empty legend.
    if plotted_any:
        ax.legend()

    plt.show()

# Placeholder data for the visualization demo: every part is ten copies of one (x, y, z) point
part_anchor_points = {
    'face': [0.5, 0.2, 0],
    'pose': [0.5, 0.5, 0],
    'left_hand': [0.3, 0.6, 0],
    'right_hand': [0.7, 0.6, 0],
}
example_keypoints = {
    part: np.array([anchor] * 10)
    for part, anchor in part_anchor_points.items()
}

# Render the placeholder keypoints
visualize_keypoints(example_keypoints)

## Feature Extraction for Model Training

Let's implement functions to extract features for training our sign recognition model.

In [None]:
def extract_features(keypoints_sequence):
    """Extract features from keypoints sequence for model training.

    For every non-empty frame, the flattened 'left_hand', 'right_hand' and
    the first 12 'pose' landmarks are concatenated (in that order) into one
    feature vector. A part that is missing or all-zero is replaced by a zero
    vector (21 * 3 values per hand, 12 * 3 values for pose). Empty frames are
    skipped, so the output can have fewer rows than the input has frames.

    Args:
        keypoints_sequence: List of keypoint dictionaries

    Returns:
        Numpy array of features with one row per non-empty frame, or None if
        keypoints_sequence itself is empty. If every frame is empty, an empty
        2-D array of shape (0, 162) is returned so that ``.shape[1]`` is still
        valid for callers.
    """
    if not keypoints_sequence:
        return None

    # Default widths used when a part is missing from a frame
    hand_width = 21 * 3
    pose_landmarks = 12
    pose_width = pose_landmarks * 3

    features = []

    # Process each frame in the sequence
    for keypoints in keypoints_sequence:
        if not keypoints:  # Skip empty frames
            continue

        frame_features = []

        # Process hands (most important for sign language)
        for hand in ('left_hand', 'right_hand'):
            if hand in keypoints and np.any(keypoints[hand]):
                # Flatten 3D coordinates to 1D array
                frame_features.append(np.asarray(keypoints[hand]).flatten())
            else:
                # If hand not detected, use zeros
                frame_features.append(np.zeros(hand_width))

        # Process pose: keep only the first `pose_landmarks` landmarks.
        # NOTE(review): in MediaPipe's 33-point pose layout, indices 0-10 are
        # facial points and the arms are 11-16, so this slice may not cover
        # the arms - confirm against what SignDetector returns for 'pose'.
        if 'pose' in keypoints and np.any(keypoints['pose']):
            upper_body = np.asarray(keypoints['pose'])[:pose_landmarks].flatten()
            frame_features.append(upper_body)
        else:
            frame_features.append(np.zeros(pose_width))

        # Concatenate all features for this frame
        features.append(np.concatenate(frame_features))

    if not features:
        # Every frame was empty: return a well-formed empty 2-D matrix rather
        # than a 1-D array that breaks `.shape[1]` downstream.
        return np.zeros((0, 2 * hand_width + pose_width))

    # Convert to numpy array
    return np.array(features)

# Demo with the placeholder data: the same frame repeated five times stands in for a short clip
example_sequence = [example_keypoints for _ in range(5)]
features = extract_features(example_sequence)

# Report the dimensions of the resulting feature matrix (skipped when nothing was extracted)
if features is not None:
    print(f"Feature shape: {features.shape}")
    print(f"This represents {features.shape[0]} frames, each with {features.shape[1]} features.")

## Next Steps

After exploring the data and extracting features, the next steps would be:

1. **Data Collection**: Gather or download sign language datasets
2. **Data Preprocessing**: Process videos to extract keypoints
3. **Feature Engineering**: Extract relevant features from keypoints
4. **Model Training**: Train a sign recognition model (e.g., LSTM, GRU, or Transformer)
5. **Model Evaluation**: Evaluate model performance on validation data
6. **Model Integration**: Integrate the trained model into the SignSpeak application

These steps will be covered in separate notebooks.