In [None]:
# Install the OpenMMLab installer (`mim`, used in the next cell) and the
# third-party packages used by the tracking cell further down.
# NOTE(review): mediapipe and filterpy are installed here but are not imported
# by any cell in this notebook; none of these installs is version-pinned.
!pip install openmim
!pip install git+https://github.com/jin-s13/xtcocoapi
!pip install deepface mediapipe
!pip install facenet_pytorch
!pip install filterpy
!pip install tqdm scikit-learn scikit-image

In [None]:
# Replace the preinstalled PyTorch with a pinned build, then install the
# OpenMMLab stack (mmengine, mmcv, mmdet) and an editable checkout of mmpose.
!pip uninstall torch torchvision torchaudio -y
!pip install torch==2.0.1+cu117 torchvision==0.15.2+cu117 -f https://download.pytorch.org/whl/torch_stable.html
!mim install mmengine
# NOTE(review): torch above is the cu117 build while this wheel index is the
# cu118/torch2.0 one; confirm the CUDA mismatch is intentional.
!pip install mmcv==2.0.1 -f https://download.openmmlab.com/mmcv/dist/cu118/torch2.0/index.html
!mim install mmdet==3.2.0
!git clone https://github.com/open-mmlab/mmpose.git
%cd mmpose
!pip install -e .
# Installed last so that numpy stays below 2.0 after the installs above.
!pip install "numpy<2.0"
# Presumably: restart the runtime here so the reinstalled packages are the ones
# that get imported, then continue with the next cell.
#restart

In [None]:
# The config paths used in the next cell are relative, so the working
# directory has to be the mmpose checkout.
%cd mmpose

/content/mmpose


In [None]:
import mmcv
from mmcv import imread
import mmengine
from mmengine.registry import init_default_scope
import numpy as np
from deepface import DeepFace
from collections import defaultdict
import cv2
import os
from tqdm import tqdm
import torch
import time
from sklearn.metrics.pairwise import cosine_similarity
from skimage.feature import local_binary_pattern
from scipy.optimize import linear_sum_assignment
from facenet_pytorch import MTCNN

from mmpose.apis import inference_topdown
from mmpose.apis import init_model as init_pose_estimator
from mmpose.evaluation.functional import nms
from mmpose.registry import VISUALIZERS
from mmpose.structures import merge_data_samples

from mmdet.apis import inference_detector, init_detector
# All models in this notebook are placed on the first CUDA device.
torch.cuda.set_device(0)
device = 'cuda:0'

# Model initialization

# Detection model: config path is relative to the mmpose checkout, the
# checkpoint is fetched from the URL.
det_config = 'projects/rtmpose/rtmdet/person/rtmdet_m_640-8xb32_coco-person.py'
det_checkpoint = 'https://download.openmmlab.com/mmpose/v1/projects/rtmpose/rtmdet_m_8xb32-100e_coco-obj365-person-235e8209.pth'

# Human-pose-estimation (HPE) model: config path is relative to the mmpose
# checkout, the checkpoint is fetched from the URL.
pose_config = 'configs/wholebody_2d_keypoint/topdown_heatmap/coco-wholebody/td-hm_hrnet-w48_dark-8xb32-210e_coco-wholebody-384x288.py'
pose_checkpoint = 'https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_wholebody_384x288_dark-f5726563_20200918.pth'

# Config override passed to the pose estimator.
# NOTE(review): heatmap output is switched on here, but visualize_frame()
# draws with draw_heatmap=False and nothing else in this notebook reads the
# heatmaps; confirm they are needed.
cfg_options = dict(model=dict(test_cfg=dict(output_heatmaps=True)))

# Initialize models on GPU
detector = init_detector(det_config, det_checkpoint, device=device)
# NOTE(review): presumably redundant, since `device` was already passed above.
detector.to(device)

pose_estimator = init_pose_estimator(
    pose_config,
    pose_checkpoint,
    device=device,
    cfg_options=cfg_options
)
# NOTE(review): presumably redundant, since `device` was already passed above.
pose_estimator.to(device)

# Pose visualizer: adjust keypoint radius and limb line width in the model's
# visualizer config, build it, and give it the pose model's dataset metadata.
pose_estimator.cfg.visualizer.radius = 3
pose_estimator.cfg.visualizer.line_width = 1
visualizer = VISUALIZERS.build(pose_estimator.cfg.visualizer)
visualizer.set_dataset_meta(pose_estimator.dataset_meta)

class PersonTracker:
    def __init__(self, fps=30):
        self.next_person_id = 1
        self.active_tracks = {}  # Currently visible persons
        self.lost_tracks = {}    # Recently lost persons (for re-ID)
        self.track_history = {}  # Complete history for better re-identification

        # Tracking thresholds
        self.iou_threshold = 0.4
        self.appearance_threshold = 0.6
        self.face_threshold = 0.65

        # Frame management
        self.frame_count = 0
        self.fps = fps
        self.max_lost_frames = int(fps * 5)  # 5 seconds
        self.max_retire_frames = int(fps * 120)  # 30 seconds before permanent deletion

        # Feature weights
        self.iou_weight = 0.2
        self.appearance_weight = 0.6
        self.face_weight = 0.2

        # Person validation
        self.min_person_height = 180
        self.min_person_width = 60
        self.max_aspect_ratio = 4.0

        # Stability features
        self.min_track_confidence = 0.6
        self.track_history_length = 10
        self.position_history_length = 20

        # Motion prediction
        self.use_motion_prediction = True
        self.velocity_weight = 0.2

        # ID management
        self.max_reid_attempts = 3  # How many times to try re-identification
        # Track confirmation
        self.min_initial_frames = 5  # Require 5 consecutive detections before confirming track
        self.confirmed_tracks = set()  # Tracks that have passed initialization

    def _extract_features(self, person_img):
        """Enhanced feature extraction with more stable descriptors"""
        try:
            if person_img.size == 0:
                return None

            # Standardize size and convert color spaces
            person_img = cv2.resize(person_img, (128, 256))  # larger size for better features
            hsv = cv2.cvtColor(person_img, cv2.COLOR_BGR2HSV)
            lab = cv2.cvtColor(person_img, cv2.COLOR_BGR2LAB)

            # More comprehensive color histograms
            h_hist = cv2.calcHist([hsv], [0], None, [16], [0, 180])
            s_hist = cv2.calcHist([hsv], [1], None, [16], [0, 256])
            l_hist = cv2.calcHist([lab], [0], None, [16], [0, 256])

            # Normalize histograms
            h_hist = cv2.normalize(h_hist, h_hist).flatten()
            s_hist = cv2.normalize(s_hist, s_hist).flatten()
            l_hist = cv2.normalize(l_hist, l_hist).flatten()

            # Enhanced texture features
            gray = cv2.cvtColor(person_img, cv2.COLOR_BGR2GRAY)

            # Multiple LBP configurations
            lbp1 = local_binary_pattern(gray, 8, 1, method='uniform')
            lbp2 = local_binary_pattern(gray, 16, 2, method='uniform')
            lbp_hist1 = np.histogram(lbp1, bins=16, range=(0, 16))[0]
            lbp_hist2 = np.histogram(lbp2, bins=16, range=(0, 16))[0]

            # HOG features
            hog = cv2.HOGDescriptor((128, 256), (16,16), (8,8), (8,8), 9)
            hog_features = hog.compute(person_img).flatten()

            # Combine features with more weighting on structural features
            return np.concatenate([
                h_hist * 0.5,
                s_hist * 0.5,
                l_hist * 0.5,
                lbp_hist1 * 1.0,
                lbp_hist2 * 1.0,
                hog_features * 1.5
            ])

        except Exception as e:
            print(f"Feature extraction error: {e}")
            return None


    def _get_face_embedding(self, face_img):
        """Extract face embedding with quality checks"""
        try:
            if face_img.size == 0 or face_img.shape[0] < 40 or face_img.shape[1] < 40:
                return None

            # Enhance face quality
            face_img = cv2.resize(face_img, (160, 160))
            gray = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)
            face_img = cv2.equalizeHist(gray)
            face_img = cv2.cvtColor(face_img, cv2.COLOR_GRAY2BGR)

            embedding = DeepFace.represent(face_img, model_name='Facenet', enforce_detection=False)
            return np.array(embedding[0]['embedding'])
        except Exception as e:
            return None

    def _predict_position(self, track):
        """Predict next position based on motion history"""
        if len(track.get('position_history', [])) < 2:
            return track['last_bbox']

        positions = track['position_history']
        recent_positions = positions[-3:]  # uses last 3 positions

        # Calculate average velocity
        velocities = []
        for i in range(1, len(recent_positions)):
            prev_center = self._get_bbox_center(recent_positions[i-1])
            curr_center = self._get_bbox_center(recent_positions[i])
            velocity = (curr_center[0] - prev_center[0], curr_center[1] - prev_center[1])
            velocities.append(velocity)

        if not velocities:
            return track['last_bbox']

        avg_velocity = (
            sum(v[0] for v in velocities) / len(velocities),
            sum(v[1] for v in velocities) / len(velocities)
        )

        # Predict next position
        last_bbox = track['last_bbox']
        last_center = self._get_bbox_center(last_bbox)
        predicted_center = (
            last_center[0] + avg_velocity[0],
            last_center[1] + avg_velocity[1]
        )

        # Create predicted bbox
        w, h = last_bbox[2] - last_bbox[0], last_bbox[3] - last_bbox[1]
        predicted_bbox = [
            predicted_center[0] - w/2,
            predicted_center[1] - h/2,
            predicted_center[0] + w/2,
            predicted_center[1] + h/2
        ]

        return predicted_bbox

    def _get_bbox_center(self, bbox):
        """Get center point of bounding box"""
        return ((bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2)
    def _calculate_similarity(self, detection, track):
        """Enhanced similarity scoring with more features"""
        scores = {}

        # 1. IoU score with predicted position
        if self.use_motion_prediction and len(track.get('position_history', [])) >= 2:
            predicted_bbox = self._predict_position(track)
            iou_score = self._calculate_iou(detection['bbox'], predicted_bbox)
            iou_score_last = self._calculate_iou(detection['bbox'], track['last_bbox'])
            iou_score = max(iou_score, iou_score_last)
        else:
            iou_score = self._calculate_iou(detection['bbox'], track['last_bbox'])
        scores['iou'] = iou_score

        # 2. Appearance score (uses multiple historical features)
        app_score = 0
        if detection['features'] is not None and track['features']:
            # Compare with multiple historical features
            for feat in track['features'][-5:]:  # Last 5 features
                score = self._compare_features(detection['features'], feat)
                app_score = max(app_score, score * 0.9)  # slight decay for older features

            #also compare with the first feature (for long-term consistency)
            if len(track['features']) > 5:
                first_score = self._compare_features(detection['features'], track['features'][0])
                app_score = max(app_score, first_score * 0.7)
        scores['appearance'] = app_score

        # 3. Face score with quality checks
        face_score = 0
        if detection['face_embedding'] is not None and track['face_embeddings']:
            # Compare with all face embeddings, weighted by recency
            for i, face_emb in enumerate(track['face_embeddings'][-3:]):
                score = self._compare_features(detection['face_embedding'], face_emb)
                weight = 0.9 ** (len(track['face_embeddings'][-3:]) - i - 1)  # Recent faces weighted higher
                face_score = max(face_score, score * weight)
        scores['face'] = face_score

        # 4. Motion consistency score
        motion_score = 0
        if len(track.get('position_history', [])) >= 2:
            last_pos = track['position_history'][-1]
            current_pos = detection['bbox']

            # Calculate expected position based on velocity
            if len(track['position_history']) >= 2:
                prev_pos = track['position_history'][-2]
                velocity = (
                    (last_pos[0] - prev_pos[0]),
                    (last_pos[1] - prev_pos[1])
                )
                expected_pos = (
                    last_pos[0] + velocity[0],
                    last_pos[1] + velocity[1],
                    last_pos[2] + velocity[0],
                    last_pos[3] + velocity[1]
                )
                motion_score = self._calculate_iou(current_pos, expected_pos)
        scores['motion'] = motion_score

        # 5. Size consistency score
        size_score = 0
        if len(track.get('position_history', [])) > 0:
            last_bbox = track['last_bbox']
            last_area = (last_bbox[2] - last_bbox[0]) * (last_bbox[3] - last_bbox[1])
            curr_area = (detection['bbox'][2] - detection['bbox'][0]) * (detection['bbox'][3] - detection['bbox'][1])
            size_ratio = min(last_area, curr_area) / (max(last_area, curr_area) + 1e-7)
            size_score = size_ratio
        scores['size'] = size_score

        # Combined score with updated weights
        combined_score = (
            self.iou_weight * scores['iou'] +
            self.appearance_weight * scores['appearance'] +
            self.face_weight * scores['face'] +
            0.15 * scores['motion'] +  # Motion consistency
            0.1 * scores['size']      # Size consistency
        )

        # Apply temporal decay for lost tracks
        if 'last_seen' in track and track['last_seen'] < self.frame_count:
            frames_missing = self.frame_count - track['last_seen']
            decay_factor = max(0.5, 1.0 - (frames_missing / self.max_lost_frames))
            combined_score *= decay_factor

        return combined_score, scores


    def _match_detections_to_tracks(self, detections, track_dict, is_lost_tracks=False):
        """Improved matching with cascaded matching strategy"""
        if not detections or not track_dict:
            return {}

        # First stage: Strong matches only
        strong_matches = {}
        track_ids = list(track_dict.keys())
        cost_matrix = np.zeros((len(detections), len(track_ids)))

        for det_idx, detection in enumerate(detections):
            for track_idx, track_id in enumerate(track_ids):
                track = track_dict[track_id]
                combined_score, score_details = self._calculate_similarity(detection, track)

                # Require strong appearance or face match
                if (score_details['appearance'] > 0.7 or
                    (score_details['face'] > 0.65 and score_details['face'] > 0)):
                    cost_matrix[det_idx, track_idx] = 1.0 - combined_score

        # Hungarian algorithm for strong matches
        det_indices, track_indices = linear_sum_assignment(cost_matrix)
        for det_idx, track_idx in zip(det_indices, track_indices):
            cost = cost_matrix[det_idx, track_idx]
            similarity = 1.0 - cost
            if similarity >= 0.6:  # Higher threshold for strong matches
                strong_matches[det_idx] = track_ids[track_idx]

        # Second stage: Standard matching for remaining
        remaining_detections = [i for i in range(len(detections)) if i not in strong_matches]
        remaining_tracks = [tid for tid in track_ids if tid not in strong_matches.values()]

        standard_matches = {}
        if remaining_detections and remaining_tracks:
            cost_matrix = np.zeros((len(remaining_detections), len(remaining_tracks)))

            for det_idx, orig_det_idx in enumerate(remaining_detections):
                detection = detections[orig_det_idx]
                for track_idx, track_id in enumerate(remaining_tracks):
                    track = track_dict[track_id]
                    combined_score, _ = self._calculate_similarity(detection, track)
                    cost_matrix[det_idx, track_idx] = 1.0 - combined_score

            # Hungarian algorithm for standard matches
            det_indices, track_indices = linear_sum_assignment(cost_matrix)
            for det_idx, track_idx in zip(det_indices, track_indices):
                cost = cost_matrix[det_idx, track_idx]
                similarity = 1.0 - cost
                min_threshold = 0.4 if is_lost_tracks else 0.5
                if similarity >= min_threshold:
                    orig_det_idx = remaining_detections[det_idx]
                    standard_matches[orig_det_idx] = remaining_tracks[track_idx]

        return {**strong_matches, **standard_matches}

    def update(self, frame, pose_results, face_boxes):
        """Main tracking update"""
        self.frame_count += 1

        # Extract detections
        detections = []
        for pose_idx, pose in enumerate(pose_results):
            if hasattr(pose.pred_instances, 'bboxes') and len(pose.pred_instances.bboxes) > 0:
                bbox = pose.pred_instances.bboxes[0]
                if hasattr(bbox, 'cpu'):
                    bbox = bbox.cpu().numpy()
                elif not isinstance(bbox, np.ndarray):
                    bbox = np.array(bbox)

                if not self._validate_detection(bbox):
                    continue

                # Extract person image and features
                x1, y1, x2, y2 = bbox.astype(int)
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(frame.shape[1], x2), min(frame.shape[0], y2)

                if x2 > x1 and y2 > y1:
                    person_img = frame[y1:y2, x1:x2]
                    features = self._extract_features(person_img)

                    # Face matching
                    face_embedding = None
                    best_face_overlap = 0

                    for face in face_boxes:
                        face_x, face_y, face_w, face_h = face['box']
                        face_bbox = [face_x, face_y, face_x + face_w, face_y + face_h]
                        overlap = self._calculate_iou(bbox, face_bbox)

                        if overlap > best_face_overlap and overlap > 0.1:
                            best_face_overlap = overlap
                            face_embedding = self._get_face_embedding(face['img'])

                    keypoints = pose.pred_instances.keypoints[0]
                    if hasattr(keypoints, 'cpu'):
                        keypoints = keypoints.cpu().numpy()

                    detections.append({
                        'pose_idx': pose_idx,
                        'bbox': bbox,
                        'features': features,
                        'face_embedding': face_embedding,
                        'keypoints': keypoints,
                        'person_img': person_img
                    })

        # Match with active tracks first
        active_matches = self._match_detections_to_tracks(detections, self.active_tracks)

        # Match remaining detections with lost tracks (re-identification)
        unmatched_detection_indices = [i for i in range(len(detections)) if i not in active_matches]
        unmatched_detections = [detections[i] for i in unmatched_detection_indices]

        lost_matches = {}
        if unmatched_detections and self.lost_tracks:
            # Try multiple times with different thresholds for better re-ID
            for attempt in range(self.max_reid_attempts):
                temp_threshold = 0.3 + (attempt * 0.05)  # Gradually relax threshold
                temp_matches = self._match_detections_to_tracks(
                    unmatched_detections,
                    self.lost_tracks,
                    is_lost_tracks=True
                )

                # Convert back to original indices
                for unmatched_idx, track_id in temp_matches.items():
                    original_det_idx = unmatched_detection_indices[unmatched_idx]
                    lost_matches[original_det_idx] = track_id

                # Remove matched detections from next attempt
                unmatched_detection_indices = [i for i in unmatched_detection_indices if i not in lost_matches]
                unmatched_detections = [detections[i] for i in unmatched_detection_indices]

                if not unmatched_detections:
                    break

        all_matches = {**active_matches, **lost_matches}

        # Update matched tracks
        for det_idx, track_id in all_matches.items():
            detection = detections[det_idx]

            # Move from lost to active if necessary
            if track_id in self.lost_tracks:
                self.active_tracks[track_id] = self.lost_tracks.pop(track_id)
                print(f"Re-identified person {track_id} after {self.frame_count - self.active_tracks[track_id]['last_seen']} frames")

            track = self.active_tracks[track_id]

            # Update track information
            track['last_bbox'] = detection['bbox']
            track['last_seen'] = self.frame_count
            track['track_length'] += 1

            # Update position history
            if 'position_history' not in track:
                track['position_history'] = []
            track['position_history'].append(detection['bbox'])
            if len(track['position_history']) > self.position_history_length:
                track['position_history'].pop(0)

            # Update features
            if detection['features'] is not None:
                if 'features' not in track:
                    track['features'] = []
                track['features'].append(detection['features'])
                if len(track['features']) > self.track_history_length:
                    track['features'].pop(0)

            # Update face embeddings
            if detection['face_embedding'] is not None:
                if 'face_embeddings' not in track:
                    track['face_embeddings'] = []
                track['face_embeddings'].append(detection['face_embedding'])
                if len(track['face_embeddings']) > 5:  # Keeping recent faces
                    track['face_embeddings'].pop(0)

        # create new tracks for unmatched detections
        for det_idx, detection in enumerate(detections):
            if det_idx not in all_matches:
                person_id = self.next_person_id
                self.next_person_id += 1

                new_track = {
                    'last_bbox': detection['bbox'],
                    'last_seen': self.frame_count,
                    'created_frame': self.frame_count,
                    'track_length': 1,
                    'features': [detection['features']] if detection['features'] is not None else [],
                    'face_embeddings': [detection['face_embedding']] if detection['face_embedding'] is not None else [],
                    'position_history': [detection['bbox']],
                    'unconfirmed_frames': 1  # Track initialization counter
                }

                self.active_tracks[person_id] = new_track
                all_matches[det_idx] = person_id
                print(f"New person detected with ID: {person_id}")

        # Move tracks to lost if they're no longer visible
        currently_visible = set(all_matches.values())
        newly_lost = [track_id for track_id in self.active_tracks.keys()
                     if track_id not in currently_visible]

        for track_id in newly_lost:
            self.lost_tracks[track_id] = self.active_tracks.pop(track_id)
            print(f"Person {track_id} lost from view")

        # Clean up old lost tracks
        self._cleanup_lost_tracks()

        # Returns confirmed assignments
        result_assignments = {}
        for pose_idx, person_id in all_matches.items():
            track = self.active_tracks.get(person_id, None)
            if track:
                if person_id in self.confirmed_tracks:
                    result_assignments[pose_idx] = person_id
                else:
                    track['unconfirmed_frames'] += 1
                    if track['unconfirmed_frames'] >= self.min_initial_frames:
                        self.confirmed_tracks.add(person_id)
                        result_assignments[pose_idx] = person_id

        return result_assignments



    def _cleanup_lost_tracks(self):
        """Remove tracks that have been lost too long"""
        current_frame = self.frame_count
        to_remove = []

        for track_id, track in self.lost_tracks.items():
            frames_lost = current_frame - track['last_seen']

            if frames_lost > self.max_lost_frames:
                to_remove.append(track_id)

        for track_id in to_remove:
            del self.lost_tracks[track_id]
            print(f"Permanently removed person {track_id} from tracking")

    def _validate_detection(self, bbox):
        """Validate detection based on size and aspect ratio"""
        x1, y1, x2, y2 = bbox
        width = x2 - x1
        height = y2 - y1

        return (height >= self.min_person_height and
                width >= self.min_person_width and
                width/height <= self.max_aspect_ratio)

    def _calculate_iou(self, box1, box2):
        """Calculate Intersection over Union"""
        x1_1, y1_1, x2_1, y2_1 = box1
        x1_2, y1_2, x2_2, y2_2 = box2

        x1_i = max(x1_1, x1_2)
        y1_i = max(y1_1, y1_2)
        x2_i = min(x2_1, x2_2)
        y2_i = min(y2_1, y2_2)

        if x2_i <= x1_i or y2_i <= y1_i:
            return 0.0

        intersection = (x2_i - x1_i) * (y2_i - y1_i)
        area1 = (x2_1 - x1_1) * (y2_1 - y1_1)
        area2 = (x2_2 - x1_2) * (y2_2 - y1_2)
        union = area1 + area2 - intersection

        return intersection / union if union > 0 else 0.0

    def _compare_features(self, feat1, feat2):
        """Compare features using cosine similarity"""
        if feat1 is None or feat2 is None:
            return 0.0
        try:
            feat1 = np.array(feat1).reshape(1, -1)
            feat2 = np.array(feat2).reshape(1, -1)
            similarity = cosine_similarity(feat1, feat2)[0][0]
            return max(0, similarity)
        except:
            return 0.0

# Lazily-built MTCNN instance shared by every detect_faces() call, so the
# network is constructed once instead of once per frame.
_FACE_DETECTOR = None


def detect_faces(frame):
    """Detect faces in a BGR frame and return the confident ones.

    Args:
        frame: BGR image (NumPy array).

    Returns:
        List of dicts, one per face with detection probability above 0.9:
        'box' is [x, y, width, height] clamped to the frame, 'img' is the
        corresponding crop of ``frame`` and 'confidence' is the detection
        probability.
    """
    global _FACE_DETECTOR
    if _FACE_DETECTOR is None:
        _FACE_DETECTOR = MTCNN(keep_all=True, device=device)

    # The detector is given an RGB image
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    boxes, probs = _FACE_DETECTOR.detect(frame_rgb)

    face_boxes = []
    if boxes is None:
        return face_boxes

    frame_h, frame_w = frame.shape[:2]
    for box, prob in zip(boxes, probs):
        if prob is None or not prob > 0.9:  # high confidence threshold
            continue

        # Clamp to the frame: a negative coordinate would otherwise be read
        # as "count from the end" by the slice below and crop the wrong region.
        x1, y1, x2, y2 = box.astype(int)
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(frame_w, x2), min(frame_h, y2)
        if x2 <= x1 or y2 <= y1:
            continue

        face_boxes.append({
            'box': [x1, y1, x2-x1, y2-y1],
            'img': frame[y1:y2, x1:x2],
            'confidence': prob
        })
    return face_boxes


def visualize_frame(frame, pose_results, person_assignments, visualizer):
    """Draw skeletons, boxes and ID labels for the tracked persons only.

    Args:
        frame: BGR frame to annotate.
        pose_results: Pose samples for the frame.
        person_assignments: Mapping of index into ``pose_results`` -> person id.
        visualizer: Pose visualizer used to render the keypoints.

    Returns:
        The annotated BGR image, or ``frame`` itself when no pose index
        appears in ``person_assignments``.
    """
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Keep only the poses that were assigned an identity.
    tracked_poses = [
        pose for idx, pose in enumerate(pose_results) if idx in person_assignments
    ]
    if not tracked_poses:
        return frame

    visualizer.add_datasample(
        'result',
        rgb_frame,
        data_sample=merge_data_samples(tracked_poses),
        draw_gt=False,
        draw_heatmap=False,
        draw_bbox=False,
        show=False,
        wait_time=0,
        out_file=None,
        kpt_thr=0.3
    )
    canvas = cv2.cvtColor(visualizer.get_image(), cv2.COLOR_RGB2BGR)

    green, black = (0, 255, 0), (0, 0, 0)
    font = cv2.FONT_HERSHEY_SIMPLEX

    # Box plus a filled label just above its top-left corner.
    for pose_idx, person_id in person_assignments.items():
        if pose_idx >= len(pose_results):
            continue
        instances = pose_results[pose_idx].pred_instances
        if not hasattr(instances, 'bboxes') or len(instances.bboxes) == 0:
            continue

        box = instances.bboxes[0]
        if torch.is_tensor(box):
            box = box.cpu().numpy()
        x1, y1, x2, y2 = box.astype(int)

        cv2.rectangle(canvas, (x1, y1), (x2, y2), green, 2)

        label = f"Person {person_id}"
        label_width = cv2.getTextSize(label, font, 0.7, 2)[0][0]
        cv2.rectangle(canvas, (x1, y1-35), (x1+label_width+10, y1-5), green, -1)
        cv2.putText(canvas, label, (x1+5, y1-15), font, 0.7, black, 2)

    return canvas

def process_video(input_path, output_path, detector, pose_estimator, visualizer, fps=None):
    """Run detection, pose estimation and tracking on a video and write the result.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the annotated video to write (mp4v codec).
        detector: Initialised detection model.
        pose_estimator: Initialised top-down pose model.
        visualizer: Pose visualizer passed on to ``visualize_frame``.
        fps: Frame rate for the output file and the tracker time-outs. When
            None, the rate reported by the input video is used, falling back
            to 15 if the container reports none.

    Returns:
        None. Errors opening the input or output are reported with a message.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        print(f"Error: Could not open video {input_path}")
        return

    writer = None
    try:
        if fps is None:
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not 0 < fps < float('inf'):
                fps = 15

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        if not writer.isOpened():
            print(f"Error: Could not create video writer for {output_path}")
            return

        tracker = PersonTracker(fps=fps)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        torch.cuda.empty_cache()

        # Some containers report no usable frame count; tqdm then runs
        # without a total.
        with tqdm(total=frame_count if frame_count > 0 else None,
                  desc="Processing video") as pbar:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if pbar.n % 100 == 0:
                    torch.cuda.empty_cache()

                # Select the detector's registry scope before detection.
                # NOTE(review): done every frame, presumably because pose
                # inference switches the default scope; confirm.
                scope = detector.cfg.get('default_scope', 'mmdet')
                if scope is not None:
                    init_default_scope(scope)

                with torch.cuda.device(device):
                    detect_result = inference_detector(detector, frame)

                # Keep label-0 boxes scoring above 0.5, then apply NMS.
                pred_instance = detect_result.pred_instances.cpu().numpy()
                bboxes = np.concatenate(
                    (pred_instance.bboxes, pred_instance.scores[:, None]), axis=1)
                bboxes = bboxes[np.logical_and(pred_instance.labels == 0,
                                               pred_instance.scores > 0.5)]
                bboxes = bboxes[nms(bboxes, 0.7)][:, :4]

                face_boxes = detect_faces(frame)

                # With no boxes, inference_topdown falls back to one box
                # covering the whole image (per the MMPose API docs), which
                # would enter the tracker as a frame-sized "person". Skip
                # pose estimation for such frames.
                if len(bboxes) > 0:
                    with torch.cuda.device(device):
                        pose_results = inference_topdown(pose_estimator, frame, bboxes)
                else:
                    pose_results = []

                person_assignments = tracker.update(frame, pose_results, face_boxes)

                vis_frame = visualize_frame(frame, pose_results, person_assignments, visualizer)
                writer.write(vis_frame)
                pbar.update(1)
    finally:
        # Release the capture and writer even if a frame fails part-way, so
        # the output file is finalised and no handle is leaked.
        cap.release()
        if writer is not None:
            writer.release()
        torch.cuda.empty_cache()

# Main: track every .mp4 found directly inside a flat input folder
input_folder = '/content/video'
output_folder = '/content/video_output'

print(f"Using device: {device}")

os.makedirs(output_folder, exist_ok=True)
# Match on the real extension (any case) and sort for a reproducible order.
video_files = sorted(
    f for f in os.listdir(input_folder)
    if os.path.splitext(f)[1].lower() == '.mp4'
)

print(f"Found {len(video_files)} video(s).")

for video_file in video_files:
    input_path = os.path.join(input_folder, video_file)
    # Build the output name from the stem so that only the final extension is
    # replaced, even if ".mp4" also appears elsewhere in the name.
    stem = os.path.splitext(video_file)[0]
    output_path = os.path.join(output_folder, f"{stem}_tracked.mp4")

    print(f"\nProcessing: {video_file}")
    process_video(input_path, output_path, detector, pose_estimator, visualizer)
    print(f"Saved: {output_path}")

print("videos saved to output folder.")

# Output and input BIDS format

In [None]:

import glob
import json

# BIDS-format input: read from derivatives/preprocessed, write to derivatives/tracked
input_folder = '/content/test_data/processed_output/bids-dataset/derivatives/preprocessed'
output_folder = '/content/bids-dataset/derivatives/tracked'

print(f"Using device: {device}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")


os.makedirs(output_folder, exist_ok=True)

# Find all preprocessed video files
video_pattern = os.path.join(input_folder, 'sub-*/ses-*/beh/*.mp4')
video_files = sorted(glob.glob(video_pattern))

print(f"Found {len(video_files)} preprocessed video(s) in BIDS derivatives.")

for video_path in video_files:
    # Example: /content/bids-dataset/derivatives/preprocessed/sub-01/ses-01/beh/sub-01_ses-01_task-play_desc-processed_beh.mp4
    #
    # Subject and session come from the directory levels fixed by the glob
    # pattern (<subject>/<session>/beh/<file>). Scanning every path component
    # for a "sub-" prefix would also hit the file name, which starts with
    # "sub-" too and would overwrite the subject.
    beh_dir = os.path.dirname(video_path)
    session_dir = os.path.dirname(beh_dir)
    session = os.path.basename(session_dir)
    subject = os.path.basename(os.path.dirname(session_dir))

    video_filename = os.path.basename(video_path)

    # Create output directory following BIDS derivatives structure
    output_subject_dir = os.path.join(output_folder, subject, session, 'beh')
    os.makedirs(output_subject_dir, exist_ok=True)

    # Create output filename (replace desc-processed with desc-tracked)
    if 'desc-processed' in video_filename:
        output_filename = video_filename.replace('desc-processed', 'desc-tracked')
    else:
        # No desc-processed entity (e.g. raw data): append desc-tracked
        # before the extension.
        stem = os.path.splitext(video_filename)[0]
        output_filename = f"{stem}_desc-tracked.mp4"
    output_path = os.path.join(output_subject_dir, output_filename)

    print(f"\nProcessing: {subject}/{session}/{video_filename}")
    print(f"Input: {video_path}")
    print(f"Output: {output_path}")

    process_video(video_path, output_path, detector, pose_estimator, visualizer)
    print(f"Saved: {output_path}")

print("All preprocessed videos processed and saved in BIDS derivatives/tracked structure.")

# Create an output_description.json for the derivatives.
# NOTE(review): the BIDS specification names this file
# `dataset_description.json`; confirm whether `output_description.json` is
# intentional before running a BIDS validator on the output.
derivatives_description = {
    "Name": "Person Tracking with Pose Estimation",
    "BIDSVersion": "1.10.0",
    "GeneratedBy": [
        {
            "Name": "Custom Person Tracking Pipeline",
            "Version": "1.0",
            "Description": "Multi-person tracking with pose estimation using MMPose and face recognition"
        }
    ],
    "SourceDatasets": [
        {
            "URL": input_folder,
            "Version": ""
        }
    ]
}

description_path = os.path.join(output_folder, 'output_description.json')
with open(description_path, 'w') as f:
    json.dump(derivatives_description, f, indent=2)

print(f"Created dataset description: {description_path}")