# Hand Gesture Classification Using MediaPipe & HaGRID


## Project Overview

This project focuses on the classification of hand gestures using landmark data generated by **MediaPipe** from the **HaGRID (Hand Gesture Recognition Image Dataset)**. By utilizing the spatial coordinates of hand keypoints, we aim to train a machine learning model capable of accurately identifying various gestures. For experiment tracking and reproducibility, we will leverage **MLflow** to log parameters, metrics, and artifacts throughout the model development process. This will allow us to systematically evaluate different models and configurations, ultimately selecting the best-performing model for deployment.

## Import Required Packages

In [None]:
import cv2
import numpy as np
import pandas as pd
import seaborn as sns
import mediapipe as mp
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, ParameterGrid, train_test_split

import mlflow
from mlflow.data.pandas_dataset import from_pandas
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Hand Gesture Classification")

import warnings
warnings.filterwarnings('ignore')

## Data Loading & Exploration

In [None]:
# Load the landmark CSV. A forward slash is used so the relative path resolves
# on every OS and no backslash escape sequence is embedded in the literal.
df = pd.read_csv("data/hand_landmarks_data.csv")
df.head()

In [None]:
# Print column dtypes and non-null counts for the loaded frame
df.info()

In [None]:
# Count missing values per column
df.isna().sum()

In [None]:
# Bar chart of the number of samples per gesture label (one bar per label),
# with the tick labels rotated so the label names stay readable
plt.figure(figsize=(17,6))
sns.countplot(data=df, x='label', hue='label')
plt.xticks(rotation=30)
plt.show()

So far, there are no missing values or inconsistent data types. The dataset contains 25,675 rows and is quite imbalanced (the fist and mute labels in particular are under-represented).

## Data Visualization

In [None]:
def plot_hand_landmarks(sample, ax=None):
    """
    Draw the 2-D skeleton of one hand sample.

    Parameters:
    -----------
    sample : pd.Series or dict
        Mapping that provides 'x1'..'x21', 'y1'..'y21' and 'label'
    ax : matplotlib axis, optional
        Axis to draw on. A new 8x8 figure is created when omitted

    Returns:
    --------
    ax : matplotlib axis
        The axis the hand was drawn on
    """
    # Column names are 1-based (x1..x21); the lists below are 0-based
    landmark_ids = range(1, 22)
    xs = [sample[f'x{k}'] for k in landmark_ids]
    ys = [sample[f'y{k}'] for k in landmark_ids]

    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(8, 8))

    # Each chain is a path of landmark indices; consecutive pairs are joined
    # by a segment (five fingers starting at the wrist, then the palm line)
    chains = (
        (0, 1, 2, 3, 4),        # Thumb
        (0, 5, 6, 7, 8),        # Index finger
        (0, 9, 10, 11, 12),     # Middle finger
        (0, 13, 14, 15, 16),    # Ring finger
        (0, 17, 18, 19, 20),    # Pinky
        (5, 9, 13, 17),         # Palm
    )
    for chain in chains:
        for a, b in zip(chain, chain[1:]):
            ax.plot([xs[a], xs[b]], [ys[a], ys[b]],
                    'b-', linewidth=2, alpha=0.6)

    # Landmark points drawn above the segments
    ax.scatter(xs, ys, c='red', s=50, zorder=3)

    # Label every point with its 0-based landmark index
    for number, point in enumerate(zip(xs, ys)):
        ax.annotate(
            str(number), point, fontsize=8, ha='center',
            bbox={'boxstyle': 'circle,pad=0.1', 'facecolor': 'yellow', 'alpha': 0.5},
        )

    # Image coordinates grow downwards, so flip the y-axis
    ax.invert_yaxis()

    ax.set_xlabel('X Coordinate', fontsize=12)
    ax.set_ylabel('Y Coordinate', fontsize=12)
    ax.set_title(f'Hand Gesture: {sample["label"]}', fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.set_aspect('equal')

    return ax

In [None]:
def plot_multiple_gestures(df, n_samples=6, random_state=42):
    """
    Plot multiple hand gesture samples for visual inspection.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing hand landmark data
    n_samples : int
        Number of samples to plot (default: 6). Capped at the number of
        rows available in ``df``
    random_state : int
        Random seed for reproducibility

    Raises:
    -------
    ValueError
        If there is nothing to plot (``n_samples`` < 1 or ``df`` is empty)
    """
    # Never request more rows than the frame holds
    n_samples = min(n_samples, len(df))
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1 and df must not be empty")

    # Sample random gestures
    samples = df.sample(n=n_samples, random_state=random_state)

    # Calculate grid dimensions (three panels per row, rounded up)
    n_cols = 3
    n_rows = (n_samples + n_cols - 1) // n_cols

    # squeeze=False always returns a 2-D array of axes, so flattening works
    # for every grid size, including a single row
    _, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.ravel()

    # Plot each sample
    for ax, (_, sample) in zip(axes, samples.iterrows()):
        plot_hand_landmarks(sample, ax=ax)

    # Hide extra subplots if any
    for ax in axes[n_samples:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
def plot_gestures_by_label(df, labels=None, samples_per_label=3, random_state=42):
    """
    Plot hand gesture samples grouped by label for comparison.

    Each label gets one row of the grid and each sampled gesture one column.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing hand landmark data and a 'label' column
    labels : list, optional
        List of gesture labels to plot. If None, the first 5 unique labels
        found in ``df`` are used (limited for readability)
    samples_per_label : int
        Number of samples to show per label (default: 3)
    random_state : int
        Random seed for reproducibility
    """
    # Get labels to plot
    if labels is None:
        labels = df['label'].unique()[:5]  # Limit to 5 labels for readability

    n_labels = len(labels)

    # squeeze=False keeps `axes` 2-D for every grid shape, including a single
    # row, a single column, or a single panel
    fig, axes = plt.subplots(n_labels, samples_per_label,
                             figsize=(5 * samples_per_label, 5 * n_labels),
                             squeeze=False)

    # Plot samples for each label
    for row_idx, label in enumerate(labels):
        # Select the rows of this label once, then draw at most
        # samples_per_label of them
        label_rows = df[df['label'] == label]
        label_samples = label_rows.sample(
            n=min(samples_per_label, len(label_rows)),
            random_state=random_state
        )

        for col_idx, (_, sample) in enumerate(label_samples.iterrows()):
            plot_hand_landmarks(sample, ax=axes[row_idx, col_idx])

        # Blank out panels left over when a label has too few samples
        for col_idx in range(len(label_samples), samples_per_label):
            axes[row_idx, col_idx].axis('off')

    fig.suptitle('Hand Gestures Grouped by Label', fontsize=16, fontweight='bold', y=1.001)
    plt.tight_layout()
    plt.show()

In [None]:
# Visual sanity check: six random raw (un-normalized) samples
plot_multiple_gestures(df, n_samples=6)

In [None]:
# Compare three raw samples each of the 'call', 'peace' and 'fist' gestures
plot_gestures_by_label(df, labels=['call', 'peace', 'fist'], samples_per_label=3)

## Data Preprocessing

Before applying any operations to the dataset, we split it into 80% training and 20% validation data; the final testing is done in real time on the output video. Splitting first prevents any risk of data leakage.

Since the data is quite imbalanced, it is preferable to use stratified sampling when splitting.

Additionally, the data should be shuffled before splitting since consecutive samples belong to the same gesture class.

In [None]:
# Features are every column except the last; the target is the last column
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.shape, y.shape

In [None]:
# Shuffled 80/20 split, stratified on the label, with a fixed seed so the
# same rows land in each split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

The detected hands have different scales and positions in the image. To overcome this, we recenter the hand landmarks (x, y) so that the wrist point becomes the origin, and then divide them by the distance from the wrist to the middle-finger tip.

We are going to create a custom sklearn transformer class that transforms our data by subtracting the wrist coordinates from all other landmark coordinates and scaling them by the distance to the middle-finger tip.

In [None]:
class LandmarkNormalizer(BaseEstimator, TransformerMixin):
    """
    Custom transformer to normalize MediaPipe hand landmarks.

    - Recenters (x, y) coordinates to the wrist (Landmark 0).
    - Scales (x, y) by the wrist-to-middle-finger-tip distance (Landmark 12).
    - Leaves the z coordinates untouched.

    Each row is read as 21 consecutive (x, y, z) triples, i.e. 63 values.
    The transformer is stateless: ``fit`` estimates nothing.
    """

    def fit(self, X, y=None):
        """Return self unchanged; there are no parameters to estimate."""
        return self

    def transform(self, X):
        """
        Normalize every row of X and return a new float array.

        Parameters:
        -----------
        X : array-like of shape (n_samples, 63)
            Flattened landmarks; the input is never modified

        Returns:
        --------
        numpy array of shape (n_samples, 63)
            Rows with recentered and rescaled x, y values
        """
        # Float copy: protects the caller's data and lets the in-place
        # division below work even if the input holds integers
        landmarks = np.array(X, dtype=float).reshape(-1, 21, 3)
        xy = landmarks[:, :, :2]  # view on the x, y columns of every landmark

        # Recenter: subtract each hand's wrist (0) from all of its landmarks.
        # The wrist is copied first because it is part of the data being changed
        xy -= xy[:, :1, :].copy()

        # Scale: distance from the (now zero) wrist to the mid-finger tip (12)
        scale = np.linalg.norm(xy[:, 12, :], axis=1)
        # A zero distance would turn the row into NaN/inf and break downstream
        # estimators; such degenerate hands are left unscaled instead
        scale[scale == 0] = 1.0
        xy /= scale[:, None, None]

        # Flatten back to one row of 63 values per sample
        return landmarks.reshape(len(landmarks), -1)

Now we are going to test this transformer and create a new dataframe from the normalized data, so that we can visualize the landmarks after normalization and make sure everything is working as expected.

In [None]:
# Normalize the training features; the result is a NumPy array, not a DataFrame
normalizer = LandmarkNormalizer()
X_train_normalized = normalizer.fit_transform(X_train)

In [None]:
# Rebuild a labelled frame from the normalized features for plotting.
# The label is attached as its own column (instead of stacking floats and
# strings into one array) so the coordinate columns keep a numeric dtype.
df_normalized = pd.DataFrame(X_train_normalized, columns=df.columns[:-1])
df_normalized[df.columns[-1]] = y_train.to_numpy()

In [None]:
# Same visual check as before, now on the normalized training samples
plot_multiple_gestures(df_normalized, n_samples=6)

In [None]:
# Per-label comparison on the normalized training samples
plot_gestures_by_label(df_normalized, labels=['call', 'peace', 'fist'], samples_per_label=3)

As you can see, the wrist landmark is at the origin and all other landmarks are scaled correctly without changing the aspect ratio of the hand. This makes our model scale- and translation-invariant.

## Model Training & Evaluation

In this section, we are going to train three different models using k-fold cross validation and grid search for hyperparameter tuning:

1. **K-Nearest Neighbors (KNN)**: A distance-based classifier that predicts based on the majority class of the k nearest neighbors.

2. **Logistic Regression**: A linear model that uses a logistic function to model the probability of each gesture class.

3. **Random Forest**: An ensemble learning method that constructs multiple decision trees and outputs the mode of their predictions.

For each model, we will:
- Perform grid search to find the best hyperparameters
- Use 5-fold cross-validation to evaluate performance during training
- Test the best model on the hold-out test set
- Visualize results using confusion matrices and classification metrics

For tracking our experiments, we will utilize **MLflow** to log parameters, metrics, and artifacts for each run. This will allow us to compare different models and hyperparameter configurations effectively. In the end we will analyze the results and choose the best performing model depending on f1-score and we will register it in the **MLflow Model Registry** and move it to the "production" stage.

In [None]:
# Apply the same normalization to the hold-out split
X_test_normalized = normalizer.transform(X_test)

In [None]:
# Create separate datasets for training and testing to log correctly in MLflow.
# Features and labels are joined column-wise with pandas so every column keeps
# its original dtype; stacking them through NumPy would turn all columns into
# generic objects because floats and string labels share one array.
df_train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
df_test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

train_dataset = from_pandas(df_train, source="data/hand_landmarks_data.csv", name="hand_landmarks_train")
test_dataset = from_pandas(df_test, source="data/hand_landmarks_data.csv", name="hand_landmarks_test")

### K-Nearest Neighbors Classifier

In [None]:
# Hyperparameter grid for KNN; ParameterGrid below yields every combination
param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Counter used only to give each MLflow run a distinct name (KNN_1, KNN_2, ...)
run_no = 1

# One MLflow run per parameter combination
for param in ParameterGrid(param_grid):
    with mlflow.start_run(run_name=f"KNN_{run_no}"):

        knn = KNeighborsClassifier(**param)
        # 5-fold cross-validation on the normalized training split
        scores = cross_val_score(knn, X_train_normalized, y_train, cv=5)

        mean_score = np.mean(scores)

        # Record the combination, its mean CV score and the training dataset
        mlflow.log_params(param)
        mlflow.log_metric("mean_cv_accuracy", mean_score)
        mlflow.log_input(train_dataset, context="training")

        run_no += 1

experiment = mlflow.get_experiment_by_name("Hand Gesture Classification")

# Query the tracking server for the KNN search runs, best mean CV score first.
# NOTE(review): the filter matches on run name only, so runs logged by earlier
# executions of this notebook are presumably included as well - confirm.
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id], 
    order_by=["metrics.mean_cv_accuracy DESC"], 
    filter_string="tags.mlflow.runName LIKE 'KNN_%'"
)

# First row is the top-ranked run; n_neighbors is cast back to int because the
# value read from the runs table is not used as-is
best_run = runs.iloc[0]
best_params = {
    'n_neighbors': int(best_run['params.n_neighbors']),
    'weights': best_run['params.weights'],
    'metric': best_run['params.metric']
}
best_score = best_run['metrics.mean_cv_accuracy']

print(f"Best Params: {best_params}, Best CV Accuracy: {best_score:.4f}")

In [None]:
# Refit KNN with the selected parameters on the full training split, score it
# on the hold-out split, and log everything under a dedicated MLflow run
with mlflow.start_run(run_name="Best_KNN_Model_Evaluation"):
    knn = KNeighborsClassifier(**best_params)
    best_model = knn.fit(X_train_normalized, y_train)
    preds = best_model.predict(X_test_normalized)

    # Dict form of the report so individual metrics can be read by key
    report = classification_report(y_test, preds, output_dict=True)

    # Precision, recall and F1 are the support-weighted averages over classes
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", report['accuracy'])
    mlflow.log_metric("precision", report['weighted avg']['precision'])
    mlflow.log_metric("recall", report['weighted avg']['recall'])
    mlflow.log_metric("f1_score", report['weighted avg']['f1-score'])
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")

    print(f"Accuracy: {report['accuracy']:.4f}")
    print(f"Precision: {report['weighted avg']['precision']:.4f}")
    print(f"Recall: {report['weighted avg']['recall']:.4f}")
    print(f"F1-score: {report['weighted avg']['f1-score']:.4f}")

    # Confusion matrix heatmap, labelled with the fitted model's class names
    cm = confusion_matrix(y_test, preds)
    plt.figure(figsize=(16, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix - KNN')
    # Save before show() so the written image contains the rendered figure
    plt.savefig("confusion_matrix_knn.png")
    plt.show()

    # Attach the saved image to the MLflow run
    mlflow.log_artifact("confusion_matrix_knn.png")

### Logistic Regression Classifier

In [None]:
# Hyperparameter grid for logistic regression; ParameterGrid below yields
# every combination
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2', None],
    'max_iter': [500, 1000]
}

# Counter used only to give each MLflow run a distinct name
run_no = 1

# One MLflow run per parameter combination
for param in ParameterGrid(param_grid):
    with mlflow.start_run(run_name=f"LogisticRegression_{run_no}"):

        lr = LogisticRegression(**param, random_state=42)
        # 5-fold cross-validation on the normalized training split
        scores = cross_val_score(lr, X_train_normalized, y_train, cv=5)

        mean_score = np.mean(scores)

        # Record the combination, its mean CV score and the training dataset
        mlflow.log_params(param)
        mlflow.log_metric("mean_cv_accuracy", mean_score)
        mlflow.log_input(train_dataset, context="training")

        run_no += 1

experiment = mlflow.get_experiment_by_name("Hand Gesture Classification")

# Query the tracking server for the logistic-regression search runs, best mean
# CV score first.
# NOTE(review): the filter matches on run name only, so runs logged by earlier
# executions of this notebook are presumably included as well - confirm.
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id], 
    order_by=["metrics.mean_cv_accuracy DESC"], 
    filter_string="tags.mlflow.runName LIKE 'LogisticRegression_%'"
)

# First row is the top-ranked run. Values are cast back to their Python types,
# and the text 'None' is mapped back to the None object for `penalty`
best_run = runs.iloc[0]
best_params = {
    'C': float(best_run['params.C']),
    'penalty': best_run['params.penalty'] if best_run['params.penalty'] != 'None' else None,
    'max_iter': int(best_run['params.max_iter'])
}
best_score = best_run['metrics.mean_cv_accuracy']

print(f"Best Params: {best_params}, Best CV Accuracy: {best_score:.4f}")

In [None]:
# Refit logistic regression with the selected parameters on the full training
# split, score it on the hold-out split, and log everything under a dedicated
# MLflow run
with mlflow.start_run(run_name="Best_LogisticRegression_Model_Evaluation"):
    lr = LogisticRegression(**best_params, random_state=42)
    best_model = lr.fit(X_train_normalized, y_train)
    preds = best_model.predict(X_test_normalized)

    # Dict form of the report so individual metrics can be read by key
    report = classification_report(y_test, preds, output_dict=True)

    # Precision, recall and F1 are the support-weighted averages over classes
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", report['accuracy'])
    mlflow.log_metric("precision", report['weighted avg']['precision'])
    mlflow.log_metric("recall", report['weighted avg']['recall'])
    mlflow.log_metric("f1_score", report['weighted avg']['f1-score'])
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")

    print(f"Accuracy: {report['accuracy']:.4f}")
    print(f"Precision: {report['weighted avg']['precision']:.4f}")
    print(f"Recall: {report['weighted avg']['recall']:.4f}")
    print(f"F1-score: {report['weighted avg']['f1-score']:.4f}")

    # Confusion matrix heatmap, labelled with the fitted model's class names
    cm = confusion_matrix(y_test, preds)
    plt.figure(figsize=(16, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix - Logistic Regression')
    # Save before show() so the written image contains the rendered figure
    plt.savefig("confusion_matrix_lr.png")
    plt.show()

    # Attach the saved image to the MLflow run
    mlflow.log_artifact("confusion_matrix_lr.png")

### Random Forest Classifier

In [None]:
# Hyperparameter grid for the random forest; ParameterGrid below yields every
# combination
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

# Counter used only to give each MLflow run a distinct name
run_no = 1

# One MLflow run per parameter combination
for param in ParameterGrid(param_grid):
    with mlflow.start_run(run_name=f"RandomForest_{run_no}"):

        rf = RandomForestClassifier(**param, random_state=42)
        # 5-fold cross-validation on the normalized training split
        scores = cross_val_score(rf, X_train_normalized, y_train, cv=5)

        mean_score = np.mean(scores)

        # Record the combination, its mean CV score and the training dataset
        mlflow.log_params(param)
        mlflow.log_metric("mean_cv_accuracy", mean_score)
        mlflow.log_input(train_dataset, context="training")

        run_no += 1

experiment = mlflow.get_experiment_by_name("Hand Gesture Classification")

# Query the tracking server for the random-forest search runs, best mean CV
# score first.
# NOTE(review): the filter matches on run name only, so runs logged by earlier
# executions of this notebook are presumably included as well - confirm.
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id], 
    order_by=["metrics.mean_cv_accuracy DESC"], 
    filter_string="tags.mlflow.runName LIKE 'RandomForest_%'"
)

# First row is the top-ranked run. Values are cast back to int, and the text
# 'None' is mapped back to the None object for `max_depth`
best_run = runs.iloc[0]
best_params = {
    'n_estimators': int(best_run['params.n_estimators']),
    'max_depth': int(best_run['params.max_depth']) if best_run['params.max_depth'] != 'None' else None,
    'min_samples_split': int(best_run['params.min_samples_split'])
}
best_score = best_run['metrics.mean_cv_accuracy']

print(f"Best Params: {best_params}, Best CV Accuracy: {best_score:.4f}")

In [None]:
# Refit the random forest with the selected parameters on the full training
# split, score it on the hold-out split, and log everything under a dedicated
# MLflow run
with mlflow.start_run(run_name="Best_RandomForest_Model_Evaluation"):
    best_model = RandomForestClassifier(**best_params, random_state=42)
    best_model.fit(X_train_normalized, y_train)
    preds = best_model.predict(X_test_normalized)

    # Dict form of the report so individual metrics can be read by key
    report = classification_report(y_test, preds, output_dict=True)

    # Precision, recall and F1 are the support-weighted averages over classes
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", report['accuracy'])
    mlflow.log_metric("precision", report['weighted avg']['precision'])
    mlflow.log_metric("recall", report['weighted avg']['recall'])
    mlflow.log_metric("f1_score", report['weighted avg']['f1-score'])
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(test_dataset, context="testing")

    print(f"Accuracy: {report['accuracy']:.4f}")
    print(f"Precision: {report['weighted avg']['precision']:.4f}")
    print(f"Recall: {report['weighted avg']['recall']:.4f}")
    print(f"F1-score: {report['weighted avg']['f1-score']:.4f}")

    # Confusion matrix heatmap, labelled with the fitted model's class names
    cm = confusion_matrix(y_test, preds)
    plt.figure(figsize=(16, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=best_model.classes_, yticklabels=best_model.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix - Random Forest')
    # Save before show() so the written image contains the rendered figure
    plt.savefig("confusion_matrix_rf.png")
    plt.show()

    # Attach the saved image to the MLflow run
    mlflow.log_artifact("confusion_matrix_rf.png")

## Final Model Pipeline

Now we are going to select the best performing model based on the evaluation metrics among all the models that were run in the experiment. The best model is determined by the highest F1-score achieved during evaluation.

In [None]:
experiment = mlflow.get_experiment_by_name("Hand Gesture Classification")

# Rank the evaluation runs (run names ending in "_Evaluation") by F1-score,
# highest first
runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id], 
    order_by=["metrics.f1_score DESC"], 
    filter_string="tags.mlflow.runName LIKE '%_Evaluation'"
)

best_run = runs.iloc[0]
# Run names look like "Best_<Model>_Model_Evaluation", so the second
# underscore-separated token is the model name
best_model_name = best_run['tags.mlflow.runName'].split('_')[1]
best_f1_score = best_run['metrics.f1_score']
print(f"Best Model: {best_model_name}, Best F1-score: {best_f1_score:.4f}")

Based on the evaluation results, the **Random Forest Classifier** demonstrated the best performance. We'll now construct a complete pipeline that integrates the landmark normalization preprocessing step with the best random forest model, and train it on the entire dataset (combining training and test sets) to maximize the use of available data for the final production model.

In [None]:
# Create a pipeline with the normalizer and the best random forest model.
# NOTE(review): `best_params` is whatever the most recently executed search
# cell assigned; this cell expects the random-forest keys, so it presumably
# must run after the Random Forest search cell - confirm execution order.
final_pipeline = Pipeline([
    ('normalizer', LandmarkNormalizer()),
    ('rf', RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        random_state=42
    ))
])

# Combine training and test data for final training
X_full = pd.concat([X_train, X_test], axis=0)
y_full = pd.concat([y_train, y_test], axis=0)

print(f"Training final model on complete dataset: {X_full.shape[0]} samples")

# Train the pipeline on the full dataset; the pipeline receives raw
# (un-normalized) landmarks because normalization is its first step
final_pipeline.fit(X_full, y_full)

print("\nFinal model training complete!")

## Model Inference

In [None]:
# Short aliases for the MediaPipe hands solution and its drawing helpers;
# the detector itself is created later, where the video loop starts
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

In [None]:
def extract_landmarks(hand_landmarks):
    """
    Flatten one detected hand into a single feature row.

    Parameters:
    -----------
    hand_landmarks : mediapipe hand landmarks
        Object whose ``landmark`` attribute iterates over points exposing
        ``x``, ``y`` and ``z``

    Returns:
    --------
    landmarks_array : numpy array
        Array of shape (1, 3 × number of landmarks), ordered x, y, z per
        landmark (63 values for 21 landmarks)
    """
    coords = [
        value
        for point in hand_landmarks.landmark
        for value in (point.x, point.y, point.z)
    ]
    # Single row so the result can be passed straight to a predict() call
    return np.array(coords).reshape(1, -1)

In [None]:
import os

# Start video capture
cap = cv2.VideoCapture(0)
out = None  # created below; initialised here so the cleanup can test it

try:
    # Fail early with a clear message instead of silently recording nothing
    if not cap.isOpened():
        raise RuntimeError("Could not open the default camera (index 0)")

    # Get video properties for saving
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS)) or 20  # Default to 20 if fps is 0

    # Create output filename with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"output_videos/hand_gesture_recognition_{timestamp}.mp4"

    # The writer does not create missing folders, so make sure the target
    # directory exists before opening the file
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Initialize video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))
    if not out.isOpened():
        raise RuntimeError(f"Could not open video writer for: {output_path}")

    print(f"Video will be saved to: {output_path}")

    # Initialize MediaPipe Hands with confidence thresholds
    with mp_hands.Hands(model_complexity=0, min_detection_confidence=0.5, min_tracking_confidence=0.5, max_num_hands=2) as hands:
        print("Starting real-time hand gesture recognition...")
        print("Press 'q' to quit")

        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                print("Failed to grab frame")
                break

            # Flip the frame horizontally for a selfie-view display
            frame = cv2.flip(frame, 1)

            # Convert the BGR image to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Process the frame with MediaPipe
            results = hands.process(frame_rgb)

            # Draw hand landmarks and predict gestures
            if results.multi_hand_landmarks:
                for hand_idx, hand_landmarks in enumerate(results.multi_hand_landmarks):
                    # Draw hand landmarks on the frame
                    mp_drawing.draw_landmarks(
                        frame,
                        hand_landmarks,
                        mp_hands.HAND_CONNECTIONS,
                        mp_drawing_styles.get_default_hand_landmarks_style(),
                        mp_drawing_styles.get_default_hand_connections_style()
                    )

                    # Extract landmarks
                    landmarks_array = extract_landmarks(hand_landmarks)

                    # Run prediction using the final pipeline (it normalizes
                    # the raw landmarks itself)
                    prediction = final_pipeline.predict(landmarks_array)[0]

                    # Get hand label (Left/Right)
                    hand_label = results.multi_handedness[hand_idx].classification[0].label

                    # Draw prediction text in the upper left corner
                    text = f"{hand_label}: {prediction}"
                    text_x = 10
                    text_y = 70 + (hand_idx * 40)  # Stack multiple hands vertically

                    # Draw background rectangle for better text visibility
                    (text_width, text_height), _ = cv2.getTextSize(
                        text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2
                    )
                    cv2.rectangle(frame, (text_x - 5, text_y - text_height - 5), (text_x + text_width + 5, text_y + 5), (0, 0, 0), -1)

                    # Draw text
                    cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2, cv2.LINE_AA)

            # Display instructions
            cv2.putText(frame, "Press 'q' to quit", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2, cv2.LINE_AA)

            # Write frame to output video
            out.write(frame)

            # Display the frame
            cv2.imshow('Hand Gesture Recognition', frame)

            # Break on 'q' key press
            if cv2.waitKey(5) & 0xFF == ord('q'):
                break
finally:
    # Release resources even if the loop above raised, so the camera is not
    # left locked and the video file is finalised
    cap.release()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

print(f"Video capture ended. Video saved to: {output_path}")