In [1]:
import cv2
import numpy as np
import mediapipe as mp
import math

In [2]:
def _get_duration(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = int(math.ceil(frame_count/fps))
    return duration

In [3]:
# MediaPipe FaceMesh landmark indices (468-point topology) used for lips.
# Mouth opening is measured across several upper/lower points rather than a
# single pair so that per-landmark jitter averages out.

# Upper lip, inner / near-inner band; 13 is the mid point.
UPPER_LIPS = [
    13,
    82,
    81,
    42,
    183,
    78,
]

# Lower lip, inner / near-inner band; 14 is the mid point.
LOWER_LIPS = [
    14,
    87,
    178,
    88,
    95,
]

# Inner mouth corners (left, right); chosen over the 61/291 pair as the more
# stable reference for mouth width.
MOUTH_CORNERS_INNER = (
    78,
    308,
)

In [4]:
def _xy_from_landmarks(landmarks, w, h, idx):
    lm = landmarks[idx]
    return np.array([lm.x * w, lm.y * h], dtype=np.float32)

In [5]:
def _mouth_width(landmarks, w, h):
    """Distance between the two inner mouth corners, plus a 1e-6 offset.

    The corner positions come from ``_xy_from_landmarks`` for the indices in
    ``MOUTH_CORNERS_INNER``; the small offset keeps the result strictly
    positive so it can be used as a divisor.
    """
    left_corner, right_corner = (
        _xy_from_landmarks(landmarks, w, h, MOUTH_CORNERS_INNER[side])
        for side in (0, 1)
    )
    corner_distance = np.linalg.norm(right_corner - left_corner)
    return float(corner_distance + 1e-6)

In [6]:
def _aperture_from_many_pairs(landmarks, w, h):
    """
    Width-normalized mouth opening estimated from several lip points.

    - Collects the UPPER_LIPS and LOWER_LIPS landmark positions.
    - Matches every upper point with the lower point whose x is nearest.
    - Returns the median vertical gap divided by the inner-corner width
      (0.0 when the width is degenerate or there are no upper points).
    """
    mouth_w = _mouth_width(landmarks, w, h)
    if mouth_w <= 1e-6:
        return 0.0

    upper_pts = np.array(
        [_xy_from_landmarks(landmarks, w, h, idx) for idx in UPPER_LIPS]
    )
    lower_pts = np.array(
        [_xy_from_landmarks(landmarks, w, h, idx) for idx in LOWER_LIPS]
    )

    # Nothing to pair: report a closed mouth.
    if len(upper_pts) == 0:
        return 0.0

    # Matching on x avoids depending on index-to-index correspondences
    # between the two landmark lists.
    vertical_gaps = [
        abs(lower_pts[np.argmin(np.abs(lower_pts[:, 0] - pt[0])), 1] - pt[1])
        for pt in upper_pts
    ]

    return float(np.median(vertical_gaps) / mouth_w)

In [7]:
def precheck_video_for_speaking(
    video_path: str,
    sample_fps: float = 5.0,           # sample rate for the precheck
    min_face_fraction: float = 0.2,    # min share of sampled frames that must contain a face
    min_modulation_std: float = 0.015, # min std of the smoothed aperture over time
    adapt_k: float = 0.6,              # how far above baseline (towards p90) counts as "open"
    min_open_fraction: float = 0.20,   # min share of face frames that must be "open"
    max_scan_seconds: float = 45.0,    # only the first N seconds are scanned; None = whole video
):
    """Cheaply check whether a video shows a face with speech-like lip motion.

    Frames are sampled at roughly ``sample_fps`` from the first
    ``max_scan_seconds`` of the video, run through MediaPipe FaceMesh, and the
    width-normalized mouth aperture is collected for every frame with a face.
    The aperture series is smoothed with a 5-sample moving average and judged
    against an adaptive threshold
    ``base + adapt_k * (p90 - base)`` where ``base`` is the 20th percentile.

    Args:
        video_path: Path of the video to inspect.
        sample_fps: Target sampling rate; must be positive.
        min_face_fraction: Minimum fraction of sampled frames with a face.
        min_modulation_std: Minimum standard deviation of the smoothed aperture.
        adapt_k: Position of the "open" threshold between baseline and p90.
        min_open_fraction: Minimum fraction of face frames above the threshold.
        max_scan_seconds: Length of the scanned window in seconds, or ``None``
            to scan until the video ends.

    Returns:
        dict: Always contains ``ok`` (bool) and ``reason`` (str). Once faces
        were counted it also has ``face_fraction``; on a full analysis it
        additionally has ``open_ratio``, ``aperture_std``, ``thr`` and
        ``stats`` (``min``, ``med``, ``p90``, ``base``).
    """
    if sample_fps <= 0:
        # Previously a zero rate crashed with ZeroDivisionError.
        return dict(ok=False, reason=f"sample_fps must be positive, got {sample_fps}")

    sampled, face_frames = 0, 0
    apertures = []

    cap = cv2.VideoCapture(video_path)
    face_mesh = None
    try:
        if not cap.isOpened():
            return dict(ok=False, reason=f"Could not open {video_path}")

        native_fps = cap.get(cv2.CAP_PROP_FPS)
        if not native_fps > 0:  # 0, negative or NaN metadata
            native_fps = 25.0

        # None means "no upper bound": keep reading until the decoder stops.
        total_frames_to_scan = None
        if max_scan_seconds is not None:
            total_frames_to_scan = int(max_scan_seconds * native_fps)
        reported_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        if reported_frames > 0:  # unknown counts (0 / negative / NaN) are ignored
            reported_frames = int(reported_frames)
            total_frames_to_scan = (
                reported_frames
                if total_frames_to_scan is None
                else min(total_frames_to_scan, reported_frames)
            )

        step = max(int(round(native_fps / sample_fps)), 1)

        face_mesh = mp.solutions.face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            refine_landmarks=True,         # << better lip detail
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )

        frame_idx = 0
        while total_frames_to_scan is None or frame_idx < total_frames_to_scan:
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ok, frame = cap.read()
            if not ok:
                break
            h, w = frame.shape[:2]
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            res = face_mesh.process(rgb)
            sampled += 1

            if res.multi_face_landmarks:
                face_frames += 1
                lms = res.multi_face_landmarks[0].landmark
                apertures.append(_aperture_from_many_pairs(lms, w, h))

            frame_idx += step
    finally:
        # Release native resources even when decoding/inference raised.
        cap.release()
        if face_mesh is not None:
            try:
                face_mesh.close()
            except Exception:
                # Best-effort cleanup: a failing close must not mask the result.
                pass

    if sampled == 0:
        return dict(ok=False, reason="No frames sampled")

    face_fraction = face_frames / sampled
    if face_fraction < min_face_fraction:
        return dict(
            ok=False,
            reason=f"Face too infrequent: {face_fraction:.2f} < {min_face_fraction}",
            face_fraction=face_fraction
        )

    if len(apertures) < 5:
        return dict(ok=False, reason="Too few lip samples", face_fraction=face_fraction)

    a = np.array(apertures, dtype=np.float32)
    # Smooth slightly to suppress per-frame jitter. The series is extended with
    # its edge values first: a plain mode="same" convolution zero-pads, which
    # drags the first/last samples towards 0 and fakes modulation on short clips.
    window = 5
    half = window // 2
    a = np.convolve(np.pad(a, half, mode="edge"), np.ones(window) / window, mode="valid")

    # Adaptive threshold: baseline = low quantile; high = 90th quantile
    base = float(np.quantile(a, 0.20))   # ~closed mouth level
    hi   = float(np.quantile(a, 0.90))   # very open
    thr  = base + adapt_k * max(hi - base, 1e-6)

    open_ratio = float(np.mean(a > thr))
    a_std = float(a.std())

    talking_like = (a_std >= min_modulation_std) and (open_ratio >= min_open_fraction)
    return dict(
        ok=talking_like,
        reason=("Face present and speech-like mouth motion" if talking_like
                else f"Not speech-like: open_ratio={open_ratio:.2f} (min {min_open_fraction}), std={a_std:.3f} (min {min_modulation_std})"),
        face_fraction=face_fraction,
        open_ratio=open_ratio,
        aperture_std=a_std,
        thr=thr,
        stats=dict(min=float(a.min()), med=float(np.median(a)), p90=hi, base=base)
    )

In [None]:
# Example run of the speaking precheck on a single local video.
# NOTE(review): this is an absolute, machine-specific path; the cell only works
# where that file exists. Consider deriving it from a configurable data dir.
video_path = "/home/ssever/SilentSpeak/data/input_video/How To Talk To Camera_ The 3 FUNDAMENTALS.mp4"

# Uses the default thresholds of precheck_video_for_speaking.
check = precheck_video_for_speaking(video_path)
# Bare last expression so the notebook displays the returned result dict.
check

I0000 00:00:1757183794.514027   44331 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1757183794.529857   44490 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.1), renderer: llvmpipe (LLVM 19.1.1, 256 bits)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1757183794.533856   44433 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757183794.552621   44435 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1757183794.597387   44433 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
