In [2]:
import cv2, numpy as np
from insightface.app import FaceAnalysis
# Limit OpenCV to a single thread for its internal parallel code.
# NOTE(review): presumably done to avoid CPU oversubscription next to the
# face-analysis inference runtime — confirm the intent.
cv2.setNumThreads(1)

In [17]:
def _most_central_face(faces, frame_shape):
    """Return the face whose bounding-box center is closest to the frame center."""
    height, width = frame_shape[:2]
    center_x, center_y = width / 2, height / 2

    def squared_distance(face):
        x1, y1, x2, y2 = face.bbox[:4]
        return ((x1 + x2) / 2 - center_x) ** 2 + ((y1 + y2) / 2 - center_y) ** 2

    # min() returns the first of equally distant faces (same pick as a stable
    # sort followed by [0]) without reordering the detector's list.
    return min(faces, key=squared_distance)


def estimate_age_gender(video_path, sample_stride=5, ctx_id=0, app=None):
    """
    Estimate a speaker's age and gender from the frames of a video.

    Every ``sample_stride``-th frame is passed to the face analyzer; of the
    faces found in a frame, only the one closest to the image center
    contributes an age and a gender sample.

    Parameters
    - video_path: path of the video file, handed to ``cv2.VideoCapture``.
    - sample_stride: analyze every Nth frame for speed (must be >= 1).
    - ctx_id: device id forwarded to ``FaceAnalysis.prepare``
      (-1 for CPU, 0 for GPU). Ignored when ``app`` is given.
    - app: optional, already prepared analyzer exposing ``get(frame)``.
      Pass one to avoid reloading the models for every video; when omitted a
      'buffalo_l' ``FaceAnalysis`` (detection + genderage) is created.

    Returns
    ``(age_median, age_mean, gender, stats)``:
    - age_median, age_mean: ints, clipped to the range 5..90 and truncated,
      or None when no age sample was collected.
    - gender: 'male' or 'female', whichever was sampled most often (the label
      seen first wins a tie), or None when no gender sample was collected.
      A per-frame gender value of 1 is mapped to 'male', anything else to
      'female'.
    - stats: dict with 'frames_analyzed', 'age_samples', 'gender_samples'.

    Raises
    - ValueError: if ``sample_stride`` is smaller than 1.
    - AssertionError: if the video cannot be opened.
    """
    if sample_stride < 1:
        raise ValueError(f"sample_stride must be >= 1, got {sample_stride!r}")

    ages, genders = [], []

    # Open the video before loading any model so a bad path fails fast.
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Raised explicitly (instead of using `assert`) so the check also
            # runs under `python -O`; type and message are kept for callers.
            raise AssertionError(f"Could not open {video_path}")

        if app is None:
            app = FaceAnalysis(name='buffalo_l', allowed_modules=['detection', 'genderage'])
            app.prepare(ctx_id=ctx_id, det_size=(640, 640))

        frame_idx = 0
        while True:
            ok, frame = cap.read()
            if not ok:
                break

            is_sampled = frame_idx % sample_stride == 0
            frame_idx += 1
            if not is_sampled:
                continue

            faces = app.get(frame)
            if not faces:
                continue

            face = _most_central_face(faces, frame.shape)
            if face.age is not None:
                ages.append(float(face.age))
            if face.gender is not None:
                genders.append('male' if int(face.gender) == 1 else 'female')
    finally:
        # Release the capture even when detection raises part-way through.
        cap.release()

    if ages:
        age_arr = np.asarray(ages, dtype=float)
        # Clip to a plausible range; the median is the more robust of the two.
        age_median = int(np.clip(np.median(age_arr), 5, 90))
        age_mean = int(np.clip(np.mean(age_arr), 5, 90))
    else:
        age_median = age_mean = None

    # dict.fromkeys keeps first-seen order, so ties are resolved the same way
    # on every run (iterating a set of strings is not stable across processes).
    gender = max(dict.fromkeys(genders), key=genders.count) if genders else None

    return age_median, age_mean, gender, {
        "frames_analyzed": max(len(ages), len(genders)),
        "age_samples": len(ages),
        "gender_samples": len(genders)
    }

In [18]:
# Demo clip to analyze.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
video_path = "/home/ssever/SilentSpeak/data/input_video/avhubert_demo_video_8s.mp4"

# Display only the first three returned values (age_median, age_mean, gender),
# dropping the trailing stats dict.
estimate_age_gender(video_path)[:3]

Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutionProvider': {}, 'CUDAExecutionProvider': {'sdpa_kernel': '0', 'use_tf32': '1', 'fuse_conv_bias': '0', 'prefer_nhwc': '0', 'tunable_op_max_tuning_duration_ms': '0', 'enable_skip_layer_norm_strict_mode': '0', 'tunable_op_tuning_enable': '0', 'tunable_op_enable': '0', 'use_ep_level_unified_stream': '0', 'device_id': '0', 'has_user_compute_stream': '0', 'gpu_external_empty_cache': '0', 'cudnn_conv_algo_search': 'EXHAUSTIVE', 'cudnn_conv1d_pad_to_nc1d': '0', 'gpu_mem_limit': '18446744073709551615', 'gpu_external_alloc': '0', 'gpu_external_free': '0', 'arena_extend_strategy': 'kNextPowerOfTwo', 'do_copy_in_default_stream': '1', 'enable_cuda_graph': '0', 'user_compute_stream': '0', 'cudnn_conv_use_max_workspace': '1'}}
model ignore: /home/ssever/.insightface/models/buffalo_l/1k3d68.onnx landmark_3d_68
Applied providers: ['CUDAExecutionProvider', 'CPUExecutionProvider'], with options: {'CPUExecutio

(32, 32, 'male')