## MediaPipe Notebook

#### Testing the tool already used

In [None]:
import json
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from math import sqrt
import mediapipe as mp
from collections import defaultdict


In [None]:
# Short aliases for the MediaPipe Hands solution and its drawing helpers.
mp_drawing_styles = mp.solutions.drawing_styles
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

# Inputs: the extracted video frames and the per-frame baseline annotations
# that the MediaPipe detections are compared against later in the notebook.
frames_dir = Path("frames/frames2")
baseline_json_dir = Path("outputs/simple_landmarks/2/json")

# Outputs: annotated frames and the landmark JSON are written here.
output_dir = Path("outputs/mediapipe_hands/2")
output_dir.mkdir(parents=True, exist_ok=True)  # create on first run, no-op afterwards


In [None]:
# Baseline keypoint name -> index into the 21 MediaPipe hand landmarks.
# The face keypoints have no counterpart in a hand-only model, so they map
# to None; the finger entries use indices 4, 8, 12, 16 and 20 (the fingertip
# landmarks in the MediaPipe hand landmark model).
mediapipe_hand_to_baseline = dict(
    nose=None,
    left_eye=None,
    right_eye=None,
    thumb=4,
    pointer_finger=8,
    middle_finger=12,
    ring_finger=16,
    pinky_finger=20,
)

# Fixed order in which keypoints are compared and reported; this is the
# insertion order of the mapping above.
key_order = list(mediapipe_hand_to_baseline)


In [None]:
# Run MediaPipe Hands on frame_01.jpg .. frame_06.jpg. For every readable
# frame this stores one detection record in `json_data` (keyed by file name)
# and writes an annotated copy of the frame to `output_dir`.
json_data = defaultdict(list)

with mp_hands.Hands(
    static_image_mode=True,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
) as hands:
    for i in range(1, 7):
        frame_name = f"frame_{i:02d}.jpg"
        frame_path = frames_dir / frame_name

        # Missing and unreadable frames are both skipped without a record.
        img = cv2.imread(str(frame_path)) if frame_path.exists() else None
        if img is None:
            continue

        h, w = img.shape[:2]
        # OpenCV loads BGR; the frame is converted to RGB before detection.
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        results = hands.process(img_rgb)

        # Landmarks are drawn on a copy so the loaded frame stays untouched.
        vis_img = img.copy()

        if not results.multi_hand_landmarks:
            # No hand found: keep an empty record so the frame is still listed.
            record = {
                "handedness": None,
                "keypoints": [],
                "num_landmarks": 0
            }
        else:
            # Only the first detected hand is used, although up to two are
            # requested from the detector.
            hand_landmarks = results.multi_hand_landmarks[0]
            handedness = results.multi_handedness[0].classification[0].label

            mp_drawing.draw_landmarks(
                vis_img,
                hand_landmarks,
                mp_hands.HAND_CONNECTIONS,
                mp_drawing_styles.get_default_hand_landmarks_style(),
                mp_drawing_styles.get_default_hand_connections_style()
            )

            # Flatten to [x, y, v, x, y, v, ...]: landmark coordinates are
            # scaled by the image width/height, and v is a constant 1.0
            # placeholder for visibility.
            keypoints = [
                float(value)
                for lm in hand_landmarks.landmark
                for value in (lm.x * w, lm.y * h, 1.0)
            ]

            record = {
                "handedness": handedness,
                "keypoints": keypoints,
                "num_landmarks": len(hand_landmarks.landmark)
            }

        json_data[frame_name].append(record)

        # The annotated frame is saved under the same file name as the input.
        cv2.imwrite(str(output_dir / frame_name), vis_img)

# Persist all records as one JSON document (plain dict, not defaultdict).
json_path = output_dir / "mediapipe_hands_landmarks.json"
with open(json_path, 'w') as f:
    json.dump(dict(json_data), f, indent=2)



In [None]:
# Comparison with baseline
def euclidean_distance(p1, p2):
    """Return the straight-line distance between two 2-D points.

    Only the first two components (index 0 and 1) of each point are used.
    """
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return sqrt(dx**2 + dy**2)

def draw_keypoints(image, points, color, label_prefix=""):
    """Draw labelled keypoint markers onto ``image``.

    Each point with strictly positive x and y gets a filled circle of
    radius 5 and a text label ``<label_prefix><1-based index>`` placed just
    up and to the right of it. All other points (for example the ``(0, 0)``
    placeholder) are skipped, but still consume an index number.
    """
    for number, (x, y) in enumerate(points, start=1):
        # Written as a negated "both positive" test so that NaN coordinates
        # are skipped as well.
        if not (x > 0 and y > 0):
            continue
        px, py = int(x), int(y)
        cv2.circle(image, (px, py), 5, color, -1)
        cv2.putText(image, f"{label_prefix}{number}", (px + 5, py - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1, cv2.LINE_AA)

# Per-frame mean fingertip error (in pixels); one entry per frame that had
# at least one comparable finger keypoint.
all_frame_errors = []

# Only these keypoints contribute to the mean/max error statistics.
finger_keys = ("thumb", "pointer_finger", "middle_finger", "ring_finger", "pinky_finger")

for i in range(1, 7):
    frame_name = f"frame_{i:02d}.jpg"
    baseline_path = baseline_json_dir / f"frame_{i:02d}.json"

    # A frame is compared only if it has a baseline file and a non-empty
    # MediaPipe detection.
    if not baseline_path.exists():
        continue
    if not json_data.get(frame_name):
        continue

    detection = json_data[frame_name][0]
    keypoints = detection.get("keypoints", [])
    if not keypoints:
        continue

    # Regroup the flat [x, y, v, x, y, v, ...] list into (x, y) pairs.
    hand_points_all = [
        (keypoints[start], keypoints[start + 1])
        for start in range(0, len(keypoints), 3)
    ]

    # Pick the landmark matching each baseline keypoint; keypoints without a
    # usable landmark get the (0, 0) placeholder, which is treated as
    # "not available" below.
    hand_points = []
    for name in key_order:
        idx = mediapipe_hand_to_baseline.get(name)
        has_landmark = idx is not None and idx < len(hand_points_all)
        hand_points.append(hand_points_all[idx] if has_landmark else (0, 0))

    with open(baseline_path, "r") as f:
        baseline_data = json.load(f)
    baseline_points = []
    for name in key_order:
        coords = baseline_data[name]["coordinates"]
        baseline_points.append((coords["x"], coords["y"]))

    img = cv2.imread(str(frames_dir / frame_name))
    if img is None:
        continue
    # Converted to RGB because the image is shown with matplotlib; the
    # colour tuples below are therefore passed to a RGB image.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    draw_keypoints(img, baseline_points, (0, 255, 0), "B")
    draw_keypoints(img, hand_points, (255, 0, 0), "H")

    # One error per keypoint in key_order; NaN marks keypoints for which no
    # MediaPipe point is available.
    errors = []
    for base_pt, hand_pt in zip(baseline_points, hand_points):
        if not (hand_pt[0] > 0 and hand_pt[1] > 0):
            errors.append(np.nan)
            continue
        errors.append(euclidean_distance(base_pt, hand_pt))
        # Connect the baseline point and the detected point.
        cv2.line(img, (int(base_pt[0]), int(base_pt[1])),
                 (int(hand_pt[0]), int(hand_pt[1])), (255, 255, 0), 1)

    hand_errors = [
        err
        for err, name in zip(errors, key_order)
        if name in finger_keys and not np.isnan(err)
    ]
    if hand_errors:
        mean_error, max_error = np.mean(hand_errors), np.max(hand_errors)
        all_frame_errors.append(mean_error)
    else:
        mean_error = max_error = np.nan

    plt.figure(figsize=(8, 8))
    plt.imshow(img)
    plt.title(f"{frame_name}: Mean hand error={mean_error:.2f}px | Max={max_error:.2f}px")
    plt.axis("off")
    plt.show()

    # Print numerical summary
    print(f"\nFrame {i:02d} Results (MediaPipe Hands)")
    for name, dist in zip(key_order, errors):
        if np.isnan(dist):
            print(f"  {name:15s}: N/A")
        else:
            print(f"  {name:15s}: {dist:7.2f}px")
    if not np.isnan(mean_error):
        print(f"  Mean hand error: {mean_error:.2f}px | Max hand error: {max_error:.2f}px")

# Overall summary across the frames that produced a mean error
if all_frame_errors:
    print(f"Average mean error (hand landmarks only): {np.mean(all_frame_errors):.2f}px")
    print(f"Best (lowest) frame mean error: {np.min(all_frame_errors):.2f}px")
    print(f"Worst (highest) frame mean error: {np.max(all_frame_errors):.2f}px")
