In [2]:
import mediapipe as mp
# Environment sanity check: show the installed MediaPipe version and confirm
# that the `solutions.hands` module resolves (one item per output line).
print(mp.__version__, mp.solutions.hands, sep="\n")


0.10.13
<module 'mediapipe.python.solutions.hands' from '/Users/xuan035/miniforge3/envs/random/lib/python3.10/site-packages/mediapipe/python/solutions/hands.py'>


In [1]:
import cv2
import torch
import torch.nn as nn
from PIL import Image
import mediapipe as mp
from torchvision import transforms
from torchvision.models import resnet18
from collections import deque, Counter
import os

# Webcam gesture demo: detects up to two hands, tells left from right, and
# classifies the gesture of each hand independently.

# ---- Configuration ----
# Checkpoint file handed to torch.load; a relative path is resolved against
# the current working directory.
MODEL_PATH = "gesture_resnet18.pt"

# Capture device index handed to cv2.VideoCapture (try 1 if 0 is the wrong camera).
CAMERA_INDEX = 0

# Number of recent accepted predictions kept per hand for majority voting.
SMOOTHING_WINDOW = 8

# Predictions whose top softmax probability is below this value are displayed
# as UNKNOWN_LABEL and are not added to the smoothing history.
CONF_THRESHOLD = 0.70
UNKNOWN_LABEL = "Unknown"

# Load model
# The checkpoint is a dict with three entries read below: "classes" (indexable
# by predicted class index), "img_size" (edge length used for the square
# resize) and "state_dict" (the network parameters).
#
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint cannot run arbitrary code while it is being loaded.
# NOTE(review): this assumes "classes" and "img_size" were saved as plain
# Python values (list/str/int) -- confirm against the training script.
ckpt = torch.load(MODEL_PATH, map_location="cpu", weights_only=True)
CLASSES = ckpt["classes"]
IMG_SIZE = ckpt["img_size"]

# Rebuild the ResNet-18 architecture without pretrained weights, swap the final
# layer for one with a single output per class, then restore the trained
# parameters from the checkpoint.
model = resnet18(weights=None)
model.fc = nn.Linear(model.fc.in_features, len(CLASSES))
model.load_state_dict(ckpt["state_dict"])
model.eval()  # evaluation mode: batch-norm layers use their running statistics

# Preprocessing applied to every hand crop before it reaches the classifier.
# It has to stay identical to the pipeline the model was trained with.
tfm = transforms.Compose(
    [
        # Stretch the crop to the square resolution stored in the checkpoint.
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        # PIL image -> float tensor in channel-first layout.
        transforms.ToTensor(),
        # Per-channel normalisation (these look like the usual ImageNet
        # mean/std values -- verify against the training code).
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# MediaPipe Hands: a single detector/tracker instance reused for every frame.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    max_num_hands=2,               # track up to two hands so each can be labelled
    static_image_mode=False,       # frames are fed in as a continuous stream
    min_tracking_confidence=0.5,
    min_detection_confidence=0.5,
)

# Utils
def crop_from_landmarks(frame, hand_landmarks, pad=40):
    """Cut the padded bounding box of a hand out of ``frame``.

    Args:
        frame: Image array indexed as ``frame[row, col]`` whose ``shape``
            unpacks to ``(height, width, channels)``.
        hand_landmarks: Object whose ``.landmark`` iterable yields points with
            ``x`` / ``y`` attributes expressed as fractions of the frame
            width / height. Values outside ``[0, 1]`` are accepted.
        pad: Margin in pixels added on every side of the landmark extent.

    Returns:
        ``(crop, (x1, y1, x2, y2))`` where ``crop`` is ``frame[y1:y2, x1:x2]``
        and every box coordinate is clamped into the frame. ``crop`` is empty
        (``crop.size == 0``) when there are no landmarks or when the padded
        box lies entirely outside the frame.
    """
    h, w, _ = frame.shape
    points = list(hand_landmarks.landmark)
    if not points:
        # Nothing to bound: hand back an empty crop instead of failing in min().
        return frame[0:0, 0:0], (0, 0, 0, 0)

    xs = [int(p.x * w) for p in points]
    ys = [int(p.y * h) for p in points]

    def _clamp(value, upper):
        # Both box edges must land in [0, upper]. A negative upper edge would
        # otherwise be read by NumPy as an index counted from the end and
        # return almost the whole frame for a hand that is off-screen.
        return max(0, min(value, upper))

    x1 = _clamp(min(xs) - pad, w)
    x2 = _clamp(max(xs) + pad, w)
    y1 = _clamp(min(ys) - pad, h)
    y2 = _clamp(max(ys) + pad, h)

    return frame[y1:y2, x1:x2], (x1, y1, x2, y2)

# Runtime state
# One bounded history of accepted class indices per hand. The label shown on
# screen is the majority vote over that window, which damps frame-to-frame
# flicker in the raw predictions.
pred_hist = {
    "Left": deque(maxlen=SMOOTHING_WINDOW),
    "Right": deque(maxlen=SMOOTHING_WINDOW),
}


def _classify_crop(crop_bgr):
    """Run the classifier on one BGR hand crop; return ``(class_index, confidence)``."""
    pil = Image.fromarray(cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB))
    batch = tfm(pil).unsqueeze(0)  # add the leading batch dimension
    with torch.no_grad():
        probs = torch.softmax(model(batch), dim=1)[0]
    idx = int(torch.argmax(probs))
    return idx, float(probs[idx])


cap = cv2.VideoCapture(CAMERA_INDEX)

# Main loop
# try/finally guarantees the camera and the preview window are released even
# when the loop is left through an exception or a kernel interrupt; otherwise
# the device stays locked until the kernel is restarted.
try:
    if not cap.isOpened():
        raise RuntimeError(
            f"Could not open camera index {CAMERA_INDEX}; "
            "check CAMERA_INDEX and camera permissions."
        )

    while True:
        ok, frame = cap.read()
        if not ok:
            break  # no frame delivered (camera unplugged or stream ended)

        # mirror view
        frame = cv2.flip(frame, 1)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = hands.process(rgb)

        if res.multi_hand_landmarks and res.multi_handedness:
            for hlm, hand_info in zip(res.multi_hand_landmarks, res.multi_handedness):
                handedness = hand_info.classification[0].label  # Left / Right

                # because frame is flipped
                # NOTE(review): MediaPipe documents that its handedness label
                # already assumes a mirrored (selfie) input. Since the frame is
                # flipped before process(), this swap may be inverting correct
                # labels -- verify on camera before relying on Left/Right.
                handedness = "Left" if handedness == "Right" else "Right"

                crop, (x1, y1, x2, y2) = crop_from_landmarks(frame, hlm, pad=50)
                if crop.size == 0:
                    continue

                idx, conf = _classify_crop(crop)

                if conf >= CONF_THRESHOLD:
                    # Only confident predictions enter the vote.
                    pred_hist[handedness].append(idx)
                    smooth_idx = Counter(pred_hist[handedness]).most_common(1)[0][0]
                    label = CLASSES[smooth_idx]
                else:
                    label = UNKNOWN_LABEL

                # draw bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

                # draw label; keep the baseline inside the frame when the box
                # touches the top edge (y1 - 10 would put the text off-screen)
                cv2.putText(
                    frame,
                    f"{handedness}: {label} ({conf:.2f})",
                    (x1, max(y1 - 10, 20)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.8,
                    (255, 255, 255),
                    2,
                )

        else:
            cv2.putText(
                frame,
                "No hand",
                (20, 40),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.1,
                (255, 255, 255),
                2,
            )

        cv2.imshow("Gesture Demo", frame)
        if cv2.waitKey(1) & 0xFF == 27:  # ESC to quit
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1771819940.524504 3493352 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
