In [None]:
# YOLO for pose estimation; DINOv3 for per-person appearance feature extraction

In [None]:
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub.
# The token is intentionally blank here: supply your own access token before
# running, and avoid committing a real token to the notebook.
# NOTE(review): presumably needed because the DINOv3 checkpoint is gated — confirm.
login(token="")

In [None]:
from transformers import pipeline

# Build the DINOv3 image-feature-extraction pipeline at notebook level.
# NOTE(review): `pipe` is not referenced by the tracking code further down,
# which constructs its own pipeline in load_dinov3_model(); this cell looks
# like a download / smoke test only — confirm before removing.
pipe = pipeline("image-feature-extraction", model="facebook/dinov3-vith16plus-pretrain-lvd1689m")

In [None]:
!pip install ultralytics

# dinov3 only

In [None]:
import cv2
import numpy as np
import os
from ultralytics import YOLO
import pickle
from PIL import Image
from transformers import pipeline
import torch

# ---- Mutable tracker state (reset at the start of every process_video call) ----
tracks = {}           # All live tracks: {track_id: track_info dict}
next_id = 1          # ID that will be assigned to the next newly created track
frame_count = 0      # Number of frames handed to update_tracks so far
appearance_features = {}  # Running appearance embedding per track: {track_id: feature vector}


# ---- Appearance matching parameters ----
APPEARANCE_THRESHOLD = 0.4         # Minimum cosine similarity for a detection/track match
FEATURE_UPDATE_ALPHA = 0.9         # Weight kept on the stored features when blending in a new observation


# ---- Track lifecycle / detection gating ----
MAX_MISSING_FRAMES = 30    # A track is removed once it has been unmatched for this many frames
MIN_CONFIDENCE = 0.5       # Detections must score strictly above this to be kept

# Colors for visualization; a track uses COLORS[track_id % len(COLORS)].
# NOTE(review): tuples are passed straight to cv2 drawing calls, so they are
# presumably in OpenCV's BGR order — confirm.
COLORS = [
    (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0),
    (255, 0, 255), (0, 255, 255), (128, 0, 128), (255, 128, 0)
]

# COCO pose skeleton for drawing: pairs of keypoint numbers to connect.
# The numbers are 1-based; draw_pose subtracts 1 before indexing keypoints.
SKELETON = [
    [16, 14], [14, 12], [17, 15], [15, 13], [12, 13],
    [6, 12], [7, 13], [6, 7], [6, 8], [7, 9],
    [8, 10], [9, 11], [2, 3], [1, 2], [1, 3],
    [2, 4], [3, 5], [4, 6], [5, 7]
]

def load_dinov3_model():
    """Build and return the DINOv3 image-feature-extraction pipeline.

    The pipeline is placed on CUDA device 0 when torch reports a GPU,
    otherwise on the CPU (device -1).
    """
    if torch.cuda.is_available():
        target_device = 0
    else:
        target_device = -1

    return pipeline(
        "image-feature-extraction",
        model="facebook/dinov3-vith16plus-pretrain-lvd1689m",
        device=target_device,
    )

def extract_batch_features(image, detections, dinov3_pipeline):
    """Extract one L2-normalised appearance vector per detection.

    Args:
        image: Frame as an array indexable as image[y, x]; it is converted
            with cv2.COLOR_BGR2RGB, so it is expected to be in BGR order.
        detections: Sequence of boxes whose first four values are
            x1, y1, x2, y2 in pixel coordinates.
        dinov3_pipeline: Callable taking a list of PIL images and returning
            one feature structure per image.

    Returns:
        A list with one flattened 1-D numpy vector per detection, in the
        same order as `detections`. Returns an empty list when there are
        no detections.
    """
    # Nothing to crop: skip touching the image and the (expensive) model call.
    if len(detections) == 0:
        return []

    crops = []
    h, w = image.shape[:2]

    for bbox in detections:
        x1, y1, x2, y2 = map(int, bbox[:4])
        # Clamp the box into the frame and force it to be at least 1x1 so the
        # crop below is never empty.
        x1 = max(0, min(x1, w - 1))
        y1 = max(0, min(y1, h - 1))
        x2 = max(x1 + 1, min(x2, w))
        y2 = max(y1 + 1, min(y2, h))

        crop = image[y1:y2, x1:x2]
        crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
        crops.append(Image.fromarray(crop_rgb).resize((224, 224)))

    # One batched call for all crops of this frame.
    features_list = dinov3_pipeline(crops)

    processed_features = []
    for raw in features_list:
        vector = np.array(raw).flatten()
        norm = np.linalg.norm(vector)
        if norm > 0:
            # Unit length, so later cosine similarities are plain dot products.
            vector = vector / norm
        processed_features.append(vector)

    return processed_features

def calculate_appearance_similarity(features1, features2):
    """Return the cosine similarity of two feature vectors, floored at zero.

    A zero-length vector on either side yields 0.0; negative cosines are
    reported as 0.
    """
    inner = np.dot(features1, features2)
    length_a = np.linalg.norm(features1)
    length_b = np.linalg.norm(features2)

    # Cosine is undefined when either vector has no magnitude.
    if length_a == 0 or length_b == 0:
        return 0.0

    cosine = inner / (length_a * length_b)
    return cosine if cosine > 0 else 0

def filter_detections(detections, keypoints_list):
    """Drop low-quality detections and return the survivors with their keypoints.

    A detection [x1, y1, x2, y2, conf] is kept when its confidence exceeds
    MIN_CONFIDENCE, its area exceeds 2000, its width/height ratio lies
    strictly between 0.2 and 5.0, and it is wider than 50 and taller than 100.
    Detections without a matching keypoint entry get None.

    Returns:
        (kept_detections, kept_keypoints) as two parallel lists.
    """
    kept_boxes = []
    kept_poses = []

    if not detections:
        return kept_boxes, kept_poses

    available_poses = len(keypoints_list)

    for idx, candidate in enumerate(detections):
        x1, y1, x2, y2, conf = candidate

        box_w = x2 - x1
        box_h = y2 - y1
        box_area = box_w * box_h
        shape_ratio = box_w / box_h if box_h > 0 else 0

        passes = (
            conf > MIN_CONFIDENCE
            and box_area > 2000
            and 0.2 < shape_ratio < 5.0
            and box_w > 50
            and box_h > 100
        )
        if not passes:
            continue

        kept_boxes.append(candidate)
        kept_poses.append(keypoints_list[idx] if idx < available_poses else None)

    return kept_boxes, kept_poses

def create_new_track(detection, keypoints, features):
    """Register a brand-new track for a detection and return its ID.

    The track takes the current value of the global `next_id` (which is then
    advanced), is stamped with the current `frame_count`, and its appearance
    features are stored under the same ID.
    """
    global next_id

    assigned_id = next_id
    next_id = assigned_id + 1

    record = {
        'bbox': detection,
        'keypoints': keypoints,
        'missing_frames': 0,
        'last_seen': frame_count,
        'created_at': frame_count,
        'match_type': 'new'
    }
    tracks[assigned_id] = record

    # Keep the embedding alongside the track, keyed by the same ID.
    appearance_features[assigned_id] = features

    return assigned_id

def update_track(track_id, detection, keypoints, new_features, similarity):
    """Refresh an existing track with a newly matched detection.

    Resets the track's missing-frame counter, records the match similarity,
    and blends `new_features` into the stored embedding as a weighted
    average (FEATURE_UPDATE_ALPHA on the old value), re-normalised to unit
    length when the result is non-zero.
    """
    tracks[track_id].update({
        'bbox': detection,
        'keypoints': keypoints,
        'missing_frames': 0,
        'last_seen': frame_count,
        'match_type': 'appearance',
        'last_similarity': similarity,
    })

    # Weighted average of the stored embedding and the new observation.
    previous = appearance_features[track_id]
    blended = FEATURE_UPDATE_ALPHA * previous + (1 - FEATURE_UPDATE_ALPHA) * new_features

    magnitude = np.linalg.norm(blended)
    if magnitude > 0:
        blended = blended / magnitude

    appearance_features[track_id] = blended

def remove_old_tracks():
    """Delete every track whose missing-frame count has reached the limit.

    A track is dropped when its 'missing_frames' is at least
    MAX_MISSING_FRAMES; its stored appearance features are dropped with it.
    """
    # Collect first: the dict must not change size while being iterated.
    expired_ids = [
        tid for tid, info in tracks.items()
        if info['missing_frames'] >= MAX_MISSING_FRAMES
    ]

    for tid in expired_ids:
        del tracks[tid]
        appearance_features.pop(tid, None)

def pure_appearance_matching(detections, keypoints_list, detection_features):
    """Match detections to existing tracks using appearance similarity only.

    Every (detection, active track) pair is scored with
    calculate_appearance_similarity. Pairs that reach APPEARANCE_THRESHOLD
    are then assigned strongest-first, so each detection and each track is
    used at most once and a weak early pairing can no longer take a track
    away from a detection that matches it much better.

    Args:
        detections: Filtered detections for the current frame.
        keypoints_list: Unused here; kept for interface compatibility.
        detection_features: One feature vector per detection, same order.

    Returns:
        List of (detection_idx, track_id, similarity) tuples ordered by
        detection index. Empty when there are no active tracks.
    """
    # Only tracks that have not yet exceeded the missing-frame budget compete.
    active_track_ids = [
        track_id for track_id, track_info in tracks.items()
        if track_info['missing_frames'] < MAX_MISSING_FRAMES
    ]

    if not active_track_ids:
        return []  # No tracks to match

    print(f"  Pure appearance matching: {len(detections)} detections vs {len(active_track_ids)} tracks")

    # Score every admissible pair up front.
    candidates = []
    for det_idx, _detection in enumerate(detections):
        det_features = detection_features[det_idx]
        for track_id in active_track_ids:
            similarity = calculate_appearance_similarity(
                det_features, appearance_features[track_id]
            )
            # A match must reach the threshold and be strictly positive.
            if similarity >= APPEARANCE_THRESHOLD and similarity > 0:
                candidates.append((similarity, det_idx, track_id))

    # Strongest pairs first. The sort is stable, so ties keep detection order
    # and then track order, which makes the result deterministic.
    candidates.sort(key=lambda candidate: candidate[0], reverse=True)

    matches = []
    used_detections = set()
    used_tracks = set()
    for similarity, det_idx, track_id in candidates:
        if det_idx in used_detections or track_id in used_tracks:
            continue
        matches.append((det_idx, track_id, similarity))
        used_detections.add(det_idx)
        used_tracks.add(track_id)

    # Report in detection order.
    matches.sort(key=lambda match: match[0])
    for i, best_track_id, best_similarity in matches:
        print(f"    Match: detection {i} -> track {best_track_id} (similarity: {best_similarity:.3f})")

    print(f"  Found {len(matches)} pure appearance matches")
    return matches

def update_tracks(detections, keypoints_list, image, dinov3_pipeline):
    """Advance the tracker by one frame.

    Filters the raw detections, extracts appearance features for the
    survivors, matches them to existing tracks, creates tracks for unmatched
    detections, ages every track that was not seen this frame, and finally
    removes tracks that have been missing for too long.

    Args:
        detections: Raw detections as [x1, y1, x2, y2, conf] entries.
        keypoints_list: Keypoints per detection (or None), same order.
        image: Current frame, passed on to feature extraction.
        dinov3_pipeline: Feature-extraction pipeline passed on to
            extract_batch_features.

    Returns:
        None. The result is held in the global `tracks` /
        `appearance_features` state.
    """
    global tracks, frame_count

    frame_count += 1

    # Filter detections
    detections, keypoints_list = filter_detections(detections, keypoints_list)

    if not detections:
        # No detections - every existing track goes unseen this frame.
        for track_info in tracks.values():
            track_info['missing_frames'] += 1
        remove_old_tracks()
        return

    # Extract appearance features for all detections
    print(f"Extracting DINOv3 features for {len(detections)} detections...")
    detection_features = extract_batch_features(image, detections, dinov3_pipeline)

    # appearance-based matching
    matches = pure_appearance_matching(detections, keypoints_list, detection_features)

    # Update matched tracks
    matched_detection_indices = set()
    seen_track_ids = set()  # tracks observed in this frame (matched or newly created)

    for detection_idx, track_id, similarity in matches:
        update_track(track_id, detections[detection_idx], keypoints_list[detection_idx],
                     detection_features[detection_idx], similarity)
        matched_detection_indices.add(detection_idx)
        seen_track_ids.add(track_id)

    # Create new tracks for unmatched detections
    new_tracks_count = 0
    for i, detection in enumerate(detections):
        if i in matched_detection_indices:
            continue
        # Record the new ID as seen: a track born in this frame was observed
        # in this frame and must not be aged by the loop below.
        seen_track_ids.add(
            create_new_track(detection, keypoints_list[i], detection_features[i])
        )
        new_tracks_count += 1

    # Age every track that was not observed in this frame.
    for track_id, track_info in tracks.items():
        if track_id not in seen_track_ids:
            track_info['missing_frames'] += 1

    remove_old_tracks()

    # matching summary
    print(f"  Matching summary: {len(matches)} appearance matches, {new_tracks_count} new tracks")

def draw_pose(img, keypoints, color):
    """Draw one person's keypoints and skeleton onto `img` and return it.

    Keypoints are (x, y, conf) triplets; only those with conf above 0.3 are
    drawn, and a skeleton segment is drawn only when both of its endpoints
    exist and clear the same bar. A `keypoints` of None leaves the image
    untouched.
    """
    if keypoints is None:
        return img

    # Joints: a filled dot for every sufficiently confident keypoint.
    for kp_x, kp_y, kp_conf in keypoints:
        if kp_conf > 0.3:
            cv2.circle(img, (int(kp_x), int(kp_y)), 4, color, -1)

    # Limbs: SKELETON stores 1-based keypoint numbers, hence the -1.
    total = len(keypoints)
    for first, second in SKELETON:
        a_idx = first - 1
        b_idx = second - 1
        if a_idx >= total or b_idx >= total:
            continue

        start = keypoints[a_idx]
        end = keypoints[b_idx]
        if not (start[2] > 0.3 and end[2] > 0.3):
            continue

        cv2.line(
            img,
            (int(start[0]), int(start[1])),
            (int(end[0]), int(end[1])),
            color,
            2,
        )

    return img

def draw_tracks(img, tracks):
    """Overlay every recently seen track on `img` and return the image.

    Tracks with at most 3 missing frames are drawn: a bounding box (thinner
    for appearance-matched tracks), an ID label carrying the last similarity
    score or a NEW marker, and the pose skeleton.
    """
    # Decide what is visible before drawing anything.
    visible = [
        (tid, info) for tid, info in tracks.items()
        if info['missing_frames'] <= 3
    ]

    for track_id, track_info in visible:
        color = COLORS[track_id % len(COLORS)]

        left, top, right, bottom = (int(v) for v in track_info['bbox'][:4])

        # Appearance-matched tracks get the thinner outline.
        matched = track_info.get('match_type') == 'appearance'
        cv2.rectangle(img, (left, top), (right, bottom), color, 2 if matched else 3)

        if 'last_similarity' in track_info:
            suffix = f' ({track_info["last_similarity"]:.2f})'
        elif track_info.get('match_type') == 'new':
            suffix = ' (NEW)'
        else:
            suffix = ''
        label = f'ID:{track_id}' + suffix

        cv2.putText(img, label, (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        img = draw_pose(img, track_info['keypoints'], color)

    return img

def _collect_person_detections(result):
    """Convert one YOLO pose result into parallel detection / keypoint lists.

    Args:
        result: A single result object as returned by the YOLO model for
            one frame (the caller passes results[0]).

    Returns:
        (detections, keypoints_list): `detections` holds
        [x1, y1, x2, y2, conf] per person (class 0); `keypoints_list` holds,
        per detection, a list of [x, y, conf] triplets or None when no
        keypoint data is available.
    """
    detections = []
    keypoints_list = []

    if result.boxes is None:
        return detections, keypoints_list

    boxes = result.boxes.xyxy.cpu().numpy()
    scores = result.boxes.conf.cpu().numpy()
    classes = result.boxes.cls.cpu().numpy()

    # Filter for person class (class 0)
    person_mask = classes == 0
    boxes = boxes[person_mask]
    scores = scores[person_mask]

    # Nothing to report; also avoids touching keypoint tensors on empty frames.
    if len(boxes) == 0:
        return detections, keypoints_list

    kpts_xy = None
    kpts_conf = None
    keypoints = result.keypoints
    # NOTE(review): keypoints.conf may be None for models without a
    # per-keypoint confidence channel (see ultralytics Keypoints docs —
    # confirm); in that case detections are kept without pose data.
    if keypoints is not None and keypoints.conf is not None:
        kpts_xy = keypoints.xy.cpu().numpy()[person_mask]
        kpts_conf = keypoints.conf.cpu().numpy()[person_mask]

    for i, (box, score) in enumerate(zip(boxes, scores)):
        x1, y1, x2, y2 = box
        detections.append([x1, y1, x2, y2, score])

        if kpts_xy is None:
            keypoints_list.append(None)
        else:
            # Combine keypoint coordinates with confidence
            keypoints_list.append(
                [[x, y, c] for (x, y), c in zip(kpts_xy[i], kpts_conf[i])]
            )

    return detections, keypoints_list


def process_video(video_path, output_path):
    """Track people through one video and write the annotated result.

    Resets the global tracker state, runs YOLO pose detection and
    appearance-based tracking on every frame, writes the annotated frames to
    `output_path`, and pickles the final tracking state next to it (same
    path with the extension replaced by
    '_pure_appearance_tracking_data.pkl').

    Args:
        video_path: Path of the input video.
        output_path: Path of the annotated output video.

    Returns:
        None. Prints an error and returns early if the input cannot be
        opened. Exceptions raised while processing propagate to the caller
        after the capture and writer have been released.
    """
    print(f"Processing: {video_path}")

    global tracks, next_id, frame_count, appearance_features
    tracks = {}
    next_id = 1
    frame_count = 0
    appearance_features = {}

    # Open the input first so an unreadable file does not cost a model load.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error: Cannot open video {video_path}")
        cap.release()
        return

    writer = None
    try:
        yolo_model = YOLO('yolov8n-pose.pt')
        dinov3_pipeline = load_dinov3_model()

        # Keep the fractional frame rate (e.g. 29.97); truncating it to an int
        # changes the playback speed of the output. Some containers report 0
        # or NaN, which would make the writer unusable, so fall back to 30.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not writer.isOpened():
            # Tracking data is still worth producing, so warn and carry on.
            print(f"Warning: Cannot open video writer for {output_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # YOLO detection
            results = yolo_model(frame, verbose=False)

            # Extract person detections and poses
            detections, keypoints_list = _collect_person_detections(results[0])

            # Update tracking
            update_tracks(detections, keypoints_list, frame, dinov3_pipeline)

            # results
            output_frame = draw_tracks(frame.copy(), tracks)

            # frame info
            active_count = len([t for t in tracks.values() if t['missing_frames'] <= 3])
            matched_count = len([t for t in tracks.values() if t.get('match_type') == 'appearance'])
            new_count = len([t for t in tracks.values() if t.get('match_type') == 'new'])

            info_text = f"Frame: {frame_count}/{total_frames} | Active: {active_count} | Matched: {matched_count} | New: {new_count}"
            cv2.putText(output_frame, info_text, (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)

            writer.write(output_frame)

            if frame_count % 50 == 0:
                print(f"Processed {frame_count}/{total_frames} frames")
    finally:
        # Release the handles even when a frame fails, so the partially
        # written file is closed and the capture is not leaked.
        cap.release()
        if writer is not None:
            writer.release()

    print(f"Video saved to: {output_path}")
    print(f"Total unique people tracked: {next_id - 1}")

    tracking_data = {
        'video_file': os.path.basename(video_path),
        'tracks': tracks,
        'appearance_features': appearance_features,
        'total_frames': frame_count,
        'fps': fps,
        'parameters': {
            'appearance_threshold': APPEARANCE_THRESHOLD,
            'feature_update_alpha': FEATURE_UPDATE_ALPHA
        }
    }

    # Derive the data file from the stem. A plain '.mp4' text replacement
    # returns output_path unchanged for any other extension, which would make
    # the pickle overwrite the video that was just written.
    data_file = os.path.splitext(output_path)[0] + '_pure_appearance_tracking_data.pkl'
    with open(data_file, 'wb') as f:
        pickle.dump(tracking_data, f)
    print(f"Tracking data saved to: {data_file}")

def main(input_folder='/kaggle/input/360p-video/you_video360p',
         output_folder='/kaggle/working/pure_appearance_tracking_output'):
    """Run appearance-only tracking on every video in a folder.

    Args:
        input_folder: Directory scanned (non-recursively) for .mp4, .avi,
            .mov and .mkv files, matched case-insensitively.
        output_folder: Directory that receives the annotated videos; it is
            created if missing.

    Returns:
        None. A missing input folder is reported and ends the run; a failure
        in one video is printed with its traceback and the remaining videos
        are still processed.
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')

    if not os.path.isdir(input_folder):
        print(f"Input folder not found: {input_folder}")
        return

    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on the filesystem.
    video_files = sorted(
        name for name in os.listdir(input_folder)
        if name.lower().endswith(video_extensions)
    )

    print(f"Found {len(video_files)} video(s) to process")
    print(f"Using PURE appearance matching ONLY (threshold ≥{APPEARANCE_THRESHOLD})")

    for video_file in video_files:
        print(f"\n=== Processing: {video_file} ===")

        video_path = os.path.join(input_folder, video_file)
        # Build the name from the stem: replacing the literal text '.mp4' does
        # nothing for .avi/.mov/.mkv/.MP4 inputs and would leave the output
        # with the wrong container extension. The output is always written
        # with the mp4v codec, so it always gets a .mp4 name.
        # NOTE: two inputs that differ only by extension map to the same output.
        stem = os.path.splitext(video_file)[0]
        output_path = os.path.join(output_folder, f"{stem}appearance_tracked.mp4")

        try:
            process_video(video_path, output_path)
        except Exception as e:
            # Top-level boundary: report the failure and move on to the next video.
            print(f"Error processing {video_file}: {e}")
            import traceback
            traceback.print_exc()
            continue

    print("\nAll videos processed ")


# Start the batch run only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()