In [1]:
# Install the pinned dependencies used by the cells below.
# NOTE(review): a later cell imports tensorflow.keras, which this command does not install — confirm it is provided elsewhere.
# NOTE(review): both opencv-python and opencv-contrib-python are pinned; the opencv-python packaging notes advise installing only one variant per environment — confirm this is intentional.
%pip install mediapipe==0.10.14 numpy==2.1.1 opencv-contrib-python==4.10.0.84 opencv-python==4.10.0.84 scikit-learn==1.5.2 streamlit==1.38.0


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import cv2
import os
import numpy as np
import mediapipe as mp

# Initialize MediaPipe
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Directory where the collected data is saved
# NOTE(review): hard-coded absolute Windows path — adjust when running on another machine.
DATA_PATH = r'D:\Hoc\mediapipe\data\data_sign_language_mediapipe'

# Set the name of the action to record directly here:
action = 'khoe'          
no_sequences = 70         # number of sample videos
sequence_length =   40    # number of frames per video

# Create the folder for this action, plus one sub-folder per sequence index
action_path = os.path.join(DATA_PATH, action)
os.makedirs(action_path, exist_ok=True)
for seq in range(no_sequences):
    os.makedirs(os.path.join(action_path, str(seq)), exist_ok=True)

# Normalize keypoints (pose, face, hands)

def normalize_full_landmarks(pose, face, lh, rh):
    """Center all landmarks on the pose centroid and rescale them.

    Every pose point is multiplied by the reciprocal of its own distance to
    the centroid (distances are clamped to at least 1e-6 to avoid dividing
    by zero). Face and hand points are shifted by the same centroid and
    multiplied by the mean of those reciprocals.

    Args:
        pose: array with one (x, y, z) row per pose landmark.
        face: array of face landmark rows; passed through unchanged when it
            has no rows.
        lh: array of left-hand landmark rows; handled like ``face``.
        rh: array of right-hand landmark rows; handled like ``face``.

    Returns:
        The normalized pose, face, left-hand and right-hand rows stacked
        vertically in that order. When ``pose`` has no rows, an all-zero
        array with as many rows as ``face``, ``lh`` and ``rh`` combined.
    """
    if not pose.shape[0]:
        n_rows = sum(part.shape[0] for part in (face, lh, rh))
        return np.zeros((n_rows, 3))

    # Offsets of each pose point from the pose centroid.
    centroid = np.mean(pose, axis=0)
    offsets = pose - centroid

    # Per-point distance to the centroid, clamped so the reciprocal is finite.
    # NOTE(review): an all-zero pose clamps every distance to 1e-6, so the
    # shared scale below becomes 1e6 for any non-zero face/hand input.
    radii = np.clip(np.linalg.norm(offsets, axis=1), 1e-6, np.inf)
    inv_radii = 1.0 / radii
    shared_scale = np.mean(inv_radii)

    def _rescale(points):
        # Face/hand groups share the centroid and the mean pose scale.
        if points.shape[0] > 0:
            return (points - centroid) * shared_scale
        return points

    return np.vstack([
        offsets * inv_radii[:, None],
        _rescale(face),
        _rescale(lh),
        _rescale(rh),
    ])


# Extract keypoints
def extract_keypoints(results):
    """Turn one frame's holistic results into a flat, normalized vector.

    Each landmark group (pose, face, left hand, right hand) is converted to
    an array of (x, y, z) rows. A group that is missing from ``results`` is
    replaced by zeros with 33, 468, 21 and 21 rows respectively. The groups
    are then passed to ``normalize_full_landmarks`` and the result is
    flattened to one dimension.
    """
    def _coords(landmark_list, count):
        # Detected group -> its coordinates; missing group -> zero placeholder.
        if landmark_list:
            return np.array([[pt.x, pt.y, pt.z] for pt in landmark_list.landmark])
        return np.zeros((count, 3))

    pose = _coords(results.pose_landmarks, 33)
    face = _coords(results.face_landmarks, 468)
    lh = _coords(results.left_hand_landmarks, 21)
    rh = _coords(results.right_hand_landmarks, 21)

    # Normalize all groups together, then flatten
    return normalize_full_landmarks(pose, face, lh, rh).flatten()

# Process a frame and run detection
def mediapipe_detection(img, model):
    """Run ``model.process`` on a BGR frame.

    The frame is converted to RGB and flagged read-only while the model
    processes it, then made writeable again and converted back to BGR.

    Returns:
        A ``(bgr_image, results)`` tuple, where ``results`` is whatever
        ``model.process`` returned.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    rgb.flags.writeable = False
    detection = model.process(rgb)
    rgb.flags.writeable = True

    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr, detection

# Draw landmarks
def draw_landmarks(img, res):
    """Draw every landmark group present in ``res`` onto ``img``.

    Groups are drawn in the order pose, face, left hand, right hand; a group
    that is missing (falsy) is skipped.
    """
    overlays = (
        (res.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (res.face_landmarks, mp_holistic.FACEMESH_TESSELATION),
        (res.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (res.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmarks, connections in overlays:
        if landmarks:
            mp_drawing.draw_landmarks(img, landmarks, connections)

# Open the camera and collect `no_sequences` recordings of `sequence_length`
# frames each, saving one normalized keypoint vector (.npy) per frame.
cap = cv2.VideoCapture(0)
try:
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        # Set when the user presses 'q' or the camera stops delivering frames,
        # so that BOTH loops are left (a bare `break` only exits the inner one).
        stop_requested = False

        for seq in range(no_sequences):
            for frame_num in range(sequence_length):
                ret, frame = cap.read()
                if not ret:
                    # No frame available (camera missing, busy or unplugged):
                    # stop instead of handing None to cv2.cvtColor.
                    print(f'Camera read failed at seq {seq} frame {frame_num}; stopping.')
                    stop_requested = True
                    break

                img, results = mediapipe_detection(frame, holistic)
                draw_landmarks(img, results)

                if frame_num == 0:
                    # First frame of a sequence: show a banner and pause 2 s
                    # so the signer can get ready.
                    cv2.putText(img, f'START {action} seq {seq}', (120,200),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 4)
                    cv2.imshow('Feed', img)
                    cv2.waitKey(2000)
                else:
                    cv2.putText(img, f'{action} seq {seq} frame {frame_num}', (10,30),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 1)
                    cv2.imshow('Feed', img)

                # Save the normalized keypoints of this frame
                keypts = extract_keypoints(results)
                save_path = os.path.join(DATA_PATH, action, str(seq), f"{frame_num}.npy")
                np.save(save_path, keypts)

                # 'q' aborts the whole collection run. The sequence being
                # recorded is left incomplete on disk.
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    stop_requested = True
                    break

            if stop_requested:
                break
finally:
    # Always free the camera and close the preview window, even if an
    # exception (or a kernel interrupt) ends the loop early.
    cap.release()
    cv2.destroyAllWindows()




In [2]:
## Test real-time recognition on a sliding sequence of frames

In [2]:
import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model  # Sử dụng tensorflow.keras.models
import pickle
from collections import Counter

# Paths to the trained model and its label map
# NOTE(review): hard-coded absolute Windows paths — adjust when running on another machine.
MODEL_PATH = r'D:\Hoc\mediapipe\data\code_final\sign_language_LSTM_AUGdata_model.keras'
LABEL_MAP_PATH = r'D:\Hoc\mediapipe\data\code_final\label_map_LSTM_AUGdata_.pkl'

# Load the model
model = load_model(MODEL_PATH)
print("✅ Model đã load thành công!")

# Load the label map
# NOTE(review): pickle.load can execute arbitrary code from the file — only load label maps from a trusted source.
with open(LABEL_MAP_PATH, 'rb') as f:
    label_map = pickle.load(f)

# If the map's keys are strings, invert it so it can be looked up by its
# values; otherwise use the map as-is.
first_key = next(iter(label_map))
if isinstance(first_key, str):
    index_to_label = {v: k for k, v in label_map.items()}
else:
    index_to_label = label_map





# Mediapipe setup
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

# Parameters
SEQUENCE_LENGTH = 40  # number of most recent frames kept and fed to the model
THRESHOLD = 0.9  # minimum confidence for a prediction to be accepted as a label
PREDICTION_HISTORY_SIZE = 5  # maximum number of recent predicted labels kept

sequence = []  # rolling buffer of per-frame keypoint vectors
last_preds = []  # recent predicted labels

# Normalization helper
def normalize_full_landmarks(pose, face, lh, rh):
    """Center all landmarks on the pose centroid and rescale them.

    Every pose point is multiplied by the reciprocal of its own distance to
    the centroid. Face and hand points are shifted by the same centroid and
    multiplied by the mean of those reciprocals.

    Distances are clamped to at least 1e-6 with ``np.clip``. This matches the
    clamp used by the data-collection cell's version of this function, so
    inference features are computed the same way as the recorded training
    features (patching only exact zeros lets distances in (0, 1e-6) through).

    Args:
        pose: array with one (x, y, z) row per pose landmark.
        face: array of face landmark rows; passed through unchanged when it
            has no rows.
        lh: array of left-hand landmark rows; handled like ``face``.
        rh: array of right-hand landmark rows; handled like ``face``.

    Returns:
        The normalized pose, face, left-hand and right-hand rows stacked
        vertically in that order. When ``pose`` has no rows, an all-zero
        array with as many rows as ``face``, ``lh`` and ``rh`` combined.
    """
    if pose.shape[0] == 0:
        total = face.shape[0] + lh.shape[0] + rh.shape[0]
        return np.zeros((total, 3))

    center = np.mean(pose, axis=0)
    dists = np.linalg.norm(pose - center, axis=1)
    # Clamp instead of replacing exact zeros only, so that tiny non-zero
    # distances cannot produce scales above 1e6.
    dists = np.clip(dists, 1e-6, np.inf)
    scales = 1.0 / dists
    p_norm = (pose - center) * scales[:, None]

    # Face and hands share the mean pose scale.
    ms = np.mean(scales)
    f_norm = (face - center) * ms if face.shape[0] > 0 else face
    lh_norm = (lh - center) * ms if lh.shape[0] > 0 else lh
    rh_norm = (rh - center) * ms if rh.shape[0] > 0 else rh

    return np.vstack([p_norm, f_norm, lh_norm, rh_norm])

# Extract keypoints
def extract_keypoints(results):
    """Turn one frame's holistic results into a flat, normalized vector.

    Pose, face, left-hand and right-hand landmarks are each converted to an
    array of (x, y, z) rows. A group missing from ``results`` is replaced by
    zeros with 33, 468, 21 and 21 rows respectively. The groups are passed to
    ``normalize_full_landmarks`` and the result is flattened.
    """
    groups = (
        (results.pose_landmarks, 33),
        (results.face_landmarks, 468),
        (results.left_hand_landmarks, 21),
        (results.right_hand_landmarks, 21),
    )
    pose, face, lh, rh = (
        np.array([[pt.x, pt.y, pt.z] for pt in group.landmark]) if group else np.zeros((size, 3))
        for group, size in groups
    )
    normalized = normalize_full_landmarks(pose, face, lh, rh)
    return normalized.flatten()

# Mediapipe detection
def mediapipe_detection(frame, model):
    """Run ``model.process`` on a BGR frame.

    The frame is converted to RGB and flagged read-only while the model
    processes it, then made writeable again and converted back to BGR.

    Returns:
        A ``(bgr_image, results)`` tuple, where ``results`` is whatever
        ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    rgb_frame.flags.writeable = False
    detection = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, detection

# Real-time capture: classify a sliding window of the last SEQUENCE_LENGTH
# frames and display a label once recent predictions agree.
MIN_STABLE_VOTES = 4  # how many of the recent predictions must agree on a label

cap = cv2.VideoCapture(0)

try:
    with mp_holistic.Holistic(min_detection_confidence=0.5,
                              min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, results = mediapipe_detection(frame, holistic)

            # Draw landmarks
            mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
            mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
            mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
            mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)

            # Extract keypoints and keep only the most recent SEQUENCE_LENGTH frames
            keypoints = extract_keypoints(results)
            sequence.append(keypoints)
            sequence = sequence[-SEQUENCE_LENGTH:]

            # Shown until the window is full
            text = 'Collecting...'
            color = (0, 255, 255)

            if len(sequence) == SEQUENCE_LENGTH:
                # Add a leading batch axis before predicting
                input_seq = np.expand_dims(sequence, axis=0)
                preds = model.predict(input_seq, verbose=0)[0]
                # Plain int so the label lookup does not depend on a NumPy scalar key
                pred_class = int(np.argmax(preds))
                confidence = preds[pred_class]

                # Low-confidence predictions are recorded as 'Unknown'
                if confidence >= THRESHOLD:
                    pred_label = index_to_label[pred_class]
                else:
                    pred_label = 'Unknown'

                last_preds.append(pred_label)
                if len(last_preds) > PREDICTION_HISTORY_SIZE:
                    last_preds.pop(0)

                # Most frequent label among the recent predictions
                stable_label, count = Counter(last_preds).most_common(1)[0]

                if count >= MIN_STABLE_VOTES and stable_label != 'Unknown':
                    text = f'{stable_label} ({confidence*100:.1f}%)'
                    color = (0, 255, 0)  # green
                else:
                    text = 'Unknown'
                    color = (0, 0, 255)  # red

            # Display
            cv2.putText(image, text, (10, 40),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)

            cv2.imshow('Real-Time Sign Detection', image)

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the window, even when prediction
    # raises or the kernel is interrupted mid-loop.
    cap.release()
    cv2.destroyAllWindows()


✅ Model đã load thành công!
