In [None]:
!pip install --upgrade "scenedetect[opencv]"



In [None]:
!pip install mediapipe==0.10.14 librosa



### **Download Test Video from Google Drive Link**

In [None]:
import os

# Download the video from Google Drive
!gdown --id 1Bw3MkBhJ_TusVyJ7Po23aiTOCMIc0bYx -O /content/input_video.mp4

# Verify if the file exists
if os.path.exists("/content/input_video.mp4"):
  print("Video downloaded successfully!")
else:
  print("Video download failed.")


Downloading...
From (original): https://drive.google.com/uc?id=1Bw3MkBhJ_TusVyJ7Po23aiTOCMIc0bYx
From (redirected): https://drive.google.com/uc?id=1Bw3MkBhJ_TusVyJ7Po23aiTOCMIc0bYx&confirm=t&uuid=28ab91ba-f180-4782-9cfe-333960ea1ff7
To: /content/input_video.mp4
100% 187M/187M [00:03<00:00, 47.1MB/s]
Video downloaded successfully!


### **UTILITY FILES**

In [None]:
import numpy as np
from collections import deque

class VirtualCamera:
    """
    Simulates physical camera movement with momentum and damping for smooth tracking.
    This creates natural, cinematographic movements by simulating camera physics.
    """
    def __init__(self, initial_position=(0, 0), damping=0.85, spring=0.15, mass=1.0):
        # Camera position and movement properties
        self.position = initial_position  # Current center position
        self.velocity = (0, 0)  # Current velocity vector
        self.target = initial_position  # Target position

        # Physics parameters
        self.mass = mass  # Virtual camera "weight"
        self.damping = damping  # Damping factor (higher = smoother but slower)
        self.spring = spring  # Spring factor (higher = faster snap to target)

        # Additional stability controls
        self.min_movement_threshold = 0.5  # Ignore tiny movements below this threshold
        self.position_history = deque(maxlen=30)
        self.target_history = deque(maxlen=30)

        # Shot transition properties
        self.in_transition = False
        self.transition_progress = 0
        self.transition_start_pos = None
        self.transition_target = None
        self.transition_duration = 30  # frames

    def update(self, target_pos, force_immediate=False):
        """
        Update camera position with physics simulation.
        Returns the new camera position as (x, y) integers.

        Args:
            target_pos: The target position to move toward
            force_immediate: If True, jump immediately to target (for scene cuts)
        """
        self.target = target_pos
        self.target_history.append(target_pos)

        # For scene cuts or initialization, jump immediately
        if force_immediate:
            self.position = target_pos
            self.velocity = (0, 0)
            self.position_history.clear()
            self.position_history.append(self.position)
            return (int(self.position[0]), int(self.position[1]))

        # Handle shot transitions with easing
        if self.in_transition:
            self.transition_progress += 1
            if self.transition_progress >= self.transition_duration:
                self.in_transition = False
                self.position = self.transition_target
            else:
                # Cubic easing (smooth acceleration and deceleration)
                t = self.transition_progress / self.transition_duration
                t = t * t * (3 - 2 * t)  # Cubic easing formula

                # Interpolate between start and target positions
                self.position = (
                    self.transition_start_pos[0] + (self.transition_target[0] - self.transition_start_pos[0]) * t,
                    self.transition_start_pos[1] + (self.transition_target[1] - self.transition_start_pos[1]) * t
                )
                self.position_history.append(self.position)
                return (int(self.position[0]), int(self.position[1]))

        # Check if target has been stable for a while and we're approaching it
        if len(self.target_history) > 10:
            recent_targets = list(self.target_history)[-10:]
            target_x_std = np.std([t[0] for t in recent_targets])
            target_y_std = np.std([t[1] for t in recent_targets])

            # If target is very stable and we're close, lock onto it more firmly
            distance_to_target = np.sqrt((self.position[0] - target_pos[0])**2 +
                                        (self.position[1] - target_pos[1])**2)

            if target_x_std < 5 and target_y_std < 5 and distance_to_target < 20:
                # Increase spring force for faster convergence on stable targets
                spring = self.spring * 2
            else:
                spring = self.spring
        else:
            spring = self.spring

        # Calculate spring force toward target (with deadzone for tiny movements)
        dx = self.target[0] - self.position[0]
        dy = self.target[1] - self.position[1]

        # Apply deadzone to reduce jitter
        if abs(dx) < self.min_movement_threshold:
            dx = 0
        if abs(dy) < self.min_movement_threshold:
            dy = 0

        force_x = dx * spring
        force_y = dy * spring

        # Apply force to velocity (F = ma)
        accel_x = force_x / self.mass
        accel_y = force_y / self.mass

        # Update velocity with acceleration and damping
        self.velocity = (
            self.velocity[0] * self.damping + accel_x,
            self.velocity[1] * self.damping + accel_y
        )

        # Update position
        new_x = self.position[0] + self.velocity[0]
        new_y = self.position[1] + self.velocity[1]
        self.position = (new_x, new_y)

        # Add to position history
        self.position_history.append(self.position)

        return (int(self.position[0]), int(self.position[1]))

    def start_transition(self, target_pos, duration=30):
        """Start a smooth transition to a new position (e.g., for scene changes)"""
        self.in_transition = True
        self.transition_progress = 0
        self.transition_start_pos = self.position
        self.transition_target = target_pos
        self.transition_duration = duration

    def get_long_term_stability_factor(self):
        """
        Calculate how stable the camera has been recently.
        Returns a value between 0 (unstable) and 1 (very stable).
        """
        if len(self.position_history) < 10:
            return 0.5  # Default mid-range value when history is short

        recent_positions = list(self.position_history)[-10:]
        x_std = np.std([p[0] for p in recent_positions])
        y_std = np.std([p[1] for p in recent_positions])

        # Normalize the stability factor (lower std = higher stability)
        stability = 1.0 - min(1.0, (x_std + y_std) / 50.0)
        return stability


In [None]:
import cv2
import numpy as np
from collections import Counter

class SceneContentAnalyzer:
    """
    Analyzes scene content to determine the best framing strategy.
    Handles scenes with and without people, identifying key visual elements.
    """
    def __init__(self, face_detector, pose_detector):
        self.face_detector = face_detector
        self.pose_detector = pose_detector

        # For saliency detection (non-human scenes)
        self.saliency = cv2.saliency.StaticSaliencySpectralResidual_create()

        # Content type thresholds
        self.people_presence_threshold = 0.25  # Min ratio of frames that must contain people
        self.min_scene_frames = 5  # Minimum frames to analyze for a scene

    def analyze_scene_frames(self, frames):
        """
        Analyze a collection of frames from a scene to determine content type and best framing.

        Returns:
            dict: Analysis results including content_type and framing strategy
        """
        if len(frames) < self.min_scene_frames:
            # Default to center framing for very short scenes
            return {
                "content_type": "unknown",
                "framing_strategy": "center",
                "people_ratio": 0,
                "has_motion": False
            }

        # Count frames with people
        frames_with_people = 0
        face_positions = []
        pose_positions = []

        # Sample frames if there are many (for efficiency)
        if len(frames) > 20:
            sample_frames = frames[::len(frames)//20]  # Take ~20 frames evenly distributed
        else:
            sample_frames = frames

        for frame in sample_frames:
            # Check for faces
            faces = self.face_detector.detect_faces(frame)
            # Check for people/poses
            people = self.pose_detector.detect_people(frame)

            if faces or people:
                frames_with_people += 1

                # Collect positions for later analysis
                for x, y, w, h, _ in faces:
                    face_positions.append((x + w//2, y + h//2))

                for x, y, _ in people:
                    pose_positions.append((x, y))

        # Calculate ratio of frames containing people
        people_ratio = frames_with_people / len(sample_frames)

        # Check for motion in the scene
        has_motion = self.detect_scene_motion(sample_frames)

        # Determine scene content type and framing strategy
        if people_ratio >= self.people_presence_threshold:
            # People are present in significant portion of the scene
            content_type = "people"

            # Find most common/important person position
            if face_positions or pose_positions:
                all_positions = face_positions + pose_positions
                if all_positions:
                    # Calculate centroid of people positions
                    centroid_x = sum(p[0] for p in all_positions) / len(all_positions)
                    centroid_y = sum(p[1] for p in all_positions) / len(all_positions)
                    framing_strategy = "track_people"
                    center_of_interest = (int(centroid_x), int(centroid_y))
                else:
                    framing_strategy = "center"
                    center_of_interest = self.get_frame_center(frames[0])
            else:
                framing_strategy = "center"
                center_of_interest = self.get_frame_center(frames[0])
        else:
            # Few or no people - analyze visual saliency and motion
            content_type = "no_people"

            if has_motion:
                framing_strategy = "follow_motion"
                center_of_interest = self.find_motion_center(sample_frames)
            else:
                # Use visual saliency for static scenes without people
                framing_strategy = "visual_saliency"
                center_of_interest = self.find_salient_region(sample_frames)

        return {
            "content_type": content_type,
            "framing_strategy": framing_strategy,
            "people_ratio": people_ratio,
            "has_motion": has_motion,
            "center_of_interest": center_of_interest
        }

    def detect_scene_motion(self, frames, threshold=3.0):
        """Detect if there is significant motion in the scene"""
        if len(frames) < 3:
            return False

        # Sample frames for efficiency
        if len(frames) > 6:
            sample_indices = np.linspace(0, len(frames)-1, 6, dtype=int)
            sample_frames = [frames[i] for i in sample_indices]
        else:
            sample_frames = frames

        # Convert to grayscale
        gray_frames = [cv2.cvtColor(f, cv2.COLOR_BGR2GRAY) for f in sample_frames]

        # Calculate motion between consecutive frames
        motion_scores = []
        for i in range(len(gray_frames)-1):
            # Simple motion detection using frame difference
            diff = cv2.absdiff(gray_frames[i], gray_frames[i+1])
            motion_score = np.mean(diff)
            motion_scores.append(motion_score)

        avg_motion = np.mean(motion_scores) if motion_scores else 0
        return avg_motion > threshold

    def find_motion_center(self, frames):
        """Find the center of motion in a sequence of frames"""
        if len(frames) < 3:
            return self.get_frame_center(frames[0])

        # Use optical flow to track motion
        prev_gray = cv2.cvtColor(frames[0], cv2.COLOR_BGR2GRAY)
        motion_points = []

        for i in range(1, min(len(frames), 10)):  # Process up to 10 frames
            gray = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)

            # Calculate optical flow
            flow = cv2.calcOpticalFlowFarneback(
                prev_gray, gray, None, 0.5, 3, 15, 3, 5, 1.2, 0)

            # Get magnitude and angle of flow
            mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])

            # Find points with significant motion
            significant_motion = mag > np.percentile(mag, 90)  # Top 10% of motion
            y_indices, x_indices = np.where(significant_motion)

            if len(y_indices) > 0:
                # Use weighted average based on magnitude
                weights = [mag[y, x] for y, x in zip(y_indices, x_indices)]
                total_weight = sum(weights)
                if total_weight > 0:
                    center_x = sum(x * w for x, w in zip(x_indices, weights)) / total_weight
                    center_y = sum(y * w for y, w in zip(y_indices, weights)) / total_weight
                    motion_points.append((int(center_x), int(center_y)))

            prev_gray = gray

        if motion_points:
            # Calculate weighted average of motion centers
            x_avg = sum(p[0] for p in motion_points) / len(motion_points)
            y_avg = sum(p[1] for p in motion_points) / len(motion_points)
            return (int(x_avg), int(y_avg))
        else:
            return self.get_frame_center(frames[0])

    def find_salient_region(self, frames):
        """Find visually salient region in frames using saliency detection"""
        saliency_maps = []

        # Calculate saliency for each frame
        for frame in frames[:min(5, len(frames))]:  # Use up to 5 frames
            success, saliency_map = self.saliency.computeSaliency(frame)
            if success:
                # Normalize to 0-255 range
                saliency_map = (saliency_map * 255).astype('uint8')
                saliency_maps.append(saliency_map)

        if not saliency_maps:
            return self.get_frame_center(frames[0])

        # Combine saliency maps
        combined_saliency = np.mean(saliency_maps, axis=0)

        # Find center of most salient region
        _, thresh_map = cv2.threshold(
            combined_saliency,
            0.7 * np.max(combined_saliency),
            255,
            cv2.THRESH_BINARY
        )

        # Find connected components
        num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
            thresh_map.astype(np.uint8),
            connectivity=8
        )

        # Find largest component (excluding background at index 0)
        if num_labels > 1:
            max_area = 0
            max_idx = 0
            for i in range(1, num_labels):  # Skip background
                area = stats[i, cv2.CC_STAT_AREA]
                if area > max_area:
                    max_area = area
                    max_idx = i

            return (int(centroids[max_idx][0]), int(centroids[max_idx][1]))
        else:
            # Rule of thirds positioning if no clear salient region
            h, w = frames[0].shape[:2]
            return (int(w * 2/3), int(h * 1/3))  # Upper-right third intersection

    def get_frame_center(self, frame):
        """Get the center point of a frame"""
        h, w = frame.shape[:2]
        return (w // 2, h // 2)


In [None]:
import numpy as np
from collections import deque

class SmoothCropper:
    """
    Creates stable, smooth crop windows with cinematic movement.
    Handles different aspect ratios while maintaining visual stability.
    """
    def __init__(self, target_aspect_ratio=9/16, transition_frames=30):
        self.target_aspect_ratio = target_aspect_ratio
        self.transition_frames = transition_frames

        # Crop window history
        self.crop_history = deque(maxlen=60)  # Store up to 2 seconds at 30fps

        # Current crop state
        self.current_crop = None
        self.target_crop = None

        # Transition state
        self.in_transition = False
        self.transition_progress = 0
        self.transition_start_crop = None
        self.transition_end_crop = None

        # Anti-jitter settings
        self.min_movement_threshold = 5  # Pixels
        self.hysteresis_ratio = 0.2  # Move only if change is significant

        # Content-aware framing options
        self.headroom_ratio = 0.1  # Extra space above detected faces

    def calculate_crop(self, frame, focus_point, content_type="people"):
        """
        Calculate the optimal crop window centered on the focus point.

        Args:
            frame: The video frame
            focus_point: (x, y) center point to focus on
            content_type: Type of content ("people", "motion", etc.)

        Returns:
            dict: Crop parameters (left, right, top, bottom)
        """
        h, w = frame.shape[:2]

        # Calculate target width for given aspect ratio
        target_width = int(h * self.target_aspect_ratio)
        if target_width > w:
            # Handle case where target width exceeds frame width
            target_width = w

        # Adjust focus point based on content type
        adjusted_focus_x, adjusted_focus_y = focus_point

        if content_type == "people":
            # Add headroom for people (shift focus point down slightly)
            headroom = int(h * self.headroom_ratio)
            adjusted_focus_y = min(h - 1, adjusted_focus_y + headroom)

        # Calculate initial crop boundaries centered on adjusted focus point
        half_width = target_width // 2
        left = adjusted_focus_x - half_width
        right = adjusted_focus_x + half_width

        # Adjust if crop goes out of bounds
        if left < 0:
            left = 0
            right = target_width
        elif right > w:
            right = w
            left = max(0, w - target_width)

        # Ensure exact width needed for aspect ratio
        if right - left != target_width:
            right = min(w, left + target_width)

        crop = {
            "left": int(left),
            "right": int(right),
            "top": 0,
            "bottom": h
        }

        return crop

    def apply_smooth_crop(self, frame, focus_point, force_update=False,
                         content_type="people", new_shot=False):
        """
        Apply a smoothed crop to the frame based on focus point.

        Args:
            frame: The video frame
            focus_point: (x, y) center point to focus on
            force_update: Force update even if movement is small
            content_type: Type of content for content-aware framing
            new_shot: True if this is the start of a new shot

        Returns:
            tuple: (cropped_frame, crop_data)
        """
        h, w = frame.shape[:2]

        # Calculate target crop based on current focus point
        target_crop = self.calculate_crop(frame, focus_point, content_type)

        # For first frame or new shot
        if self.current_crop is None or new_shot:
            self.current_crop = target_crop
            self.crop_history.clear()
            self.crop_history.append(self.current_crop)

            # Apply crop
            cropped = frame[:, target_crop["left"]:target_crop["right"]]
            return cropped, target_crop

        # Determine if we need to transition to a new crop
        if self.in_transition:
            # Continue existing transition
            self.transition_progress += 1
            progress_ratio = self.transition_progress / self.transition_frames

            if self.transition_progress >= self.transition_frames:
                # Transition complete
                self.in_transition = False
                self.current_crop = self.transition_end_crop
            else:
                # Apply easing function for smooth transition
                t = progress_ratio
                # Cubic easing
                t = t * t * (3 - 2 * t)

                # Interpolate crop values
                left = int(self.transition_start_crop["left"] +
                          (self.transition_end_crop["left"] - self.transition_start_crop["left"]) * t)
                right = int(self.transition_start_crop["right"] +
                           (self.transition_end_crop["right"] - self.transition_start_crop["right"]) * t)

                self.current_crop = {
                    "left": left,
                    "right": right,
                    "top": 0,
                    "bottom": h
                }
        else:
            # Check if the target crop is significantly different from current crop
            current_center = (self.current_crop["left"] + self.current_crop["right"]) // 2
            target_center = (target_crop["left"] + target_crop["right"]) // 2

            movement = abs(current_center - target_center)

            # Determine if we should start a new transition
            if force_update or movement > self.min_movement_threshold:
                significant_change = movement > self.hysteresis_ratio * (target_crop["right"] - target_crop["left"])

                if significant_change or force_update:
                    # Start new transition
                    self.in_transition = True
                    self.transition_progress = 0
                    self.transition_start_crop = self.current_crop.copy()
                    self.transition_end_crop = target_crop.copy()

        # Add current crop to history
        self.crop_history.append(self.current_crop)

        # Create cropped frame
        cropped = frame[:, self.current_crop["left"]:self.current_crop["right"]]

        return cropped, self.current_crop

    def long_term_stabilization(self):
        """
        Apply long-term stabilization by analyzing the history of crop windows.
        Helps reduce slow drift and maintain more stable framing.

        Should be called periodically during stable scenes.
        """
        if len(self.crop_history) < 15:
            return

        # Extract centers of recent crops
        recent_crops = list(self.crop_history)[-15:]
        centers = [(c["left"] + c["right"]) // 2 for c in recent_crops]

        # Check if centers are relatively stable
        center_std = np.std(centers)

        if center_std < 10:  # Very stable
            # Calculate average center position
            avg_center = int(np.mean(centers))
            crop_width = self.current_crop["right"] - self.current_crop["left"]

            # Create a stabilized crop around this center
            stable_left = avg_center - crop_width // 2
            stable_right = avg_center + crop_width // 2

            # Apply small correction toward stable position
            current_center = (self.current_crop["left"] + self.current_crop["right"]) // 2
            diff = avg_center - current_center

            if abs(diff) > 3:  # Only correct if difference is noticeable
                correction = int(diff * 0.2)  # Apply 20% correction
                self.current_crop["left"] += correction
                self.current_crop["right"] += correction


### **MAIN PIPELINE**

In [None]:
# MAIN
import cv2
import numpy as np
import scenedetect
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
import mediapipe as mp
import json
import os
import subprocess
from tqdm import tqdm
import collections
import time
#from virtual_camera import VirtualCamera
#from scene_analyzer import SceneContentAnalyzer
#from smooth_cropper import SmoothCropper


class ImprovedH2VProcessor:
    def __init__(self):
        # Initialize MediaPipe solutions
        self.mp_face_detection = mp.solutions.face_detection
        self.mp_face_detection_model = self.mp_face_detection.FaceDetection(min_detection_confidence=0.5)
        self.mp_pose = mp.solutions.pose
        self.mp_pose_model = self.mp_pose.Pose(min_detection_confidence=0.5,
                                              min_tracking_confidence=0.5)

        # Initialize advanced components
        self.virtual_camera = VirtualCamera()
        self.scene_analyzer = SceneContentAnalyzer(self, self)  # Pass self as face/pose detector
        self.smooth_cropper = SmoothCropper(target_aspect_ratio=9/16)

        # Scene tracking
        self.current_shot_id = 0
        self.shot_boundaries = []
        self.current_scene_frames = []
        self.current_scene_analysis = None
        self.scenes_without_people = []

        # Tracking data
        self.focus_points_data = []
        self.prev_frame = None

        # Stabilization history
        self.prev_focus_point = None
        self.focus_point_history = collections.deque(maxlen=30)

        # Enhanced stability parameters
        self.temporal_smoothing_window = 15  # Frames for temporal averaging
        self.min_confidence_threshold = 0.7  # Minimum confidence for detection
        self.static_scene_counter = 0  # Counter for static scenes
        self.is_static_scene = False
        self.static_scene_focus_point = None

    def detect_scenes(self, video_path):
        """Split video into scenes using PySceneDetect with higher threshold for stability"""
        video_manager = VideoManager([video_path])
        scene_manager = SceneManager()
        # Higher threshold means fewer scene changes, more stability
        scene_manager.add_detector(ContentDetector(threshold=40))

        video_manager.start()
        scene_manager.detect_scenes(frame_source=video_manager)

        scene_list = scene_manager.get_scene_list()
        # Store scene boundaries for reference during processing
        self.shot_boundaries = [(scene[0].frame_num, scene[1].frame_num) for scene in scene_list]
        return scene_list

    def detect_faces(self, frame):
        """Detect faces in the frame"""
        results = self.mp_face_detection_model.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        faces = []
        if results.detections:
            for detection in results.detections:
                bboxC = detection.location_data.relative_bounding_box
                ih, iw, _ = frame.shape
                x, y, w, h = int(bboxC.xmin * iw), int(bboxC.ymin * ih), \
                             int(bboxC.width * iw), int(bboxC.height * ih)
                # Add confidence score to help with stability
                confidence = detection.score[0]
                faces.append((x, y, w, h, confidence))
        return faces

    def detect_people(self, frame):
        """Detect people using pose estimation"""
        results = self.mp_pose_model.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        people = []
        if results.pose_landmarks:
            h, w, _ = frame.shape
            # Get upper body landmarks for better stability
            landmarks = results.pose_landmarks.landmark

            # Check if key points are visible (nose, shoulders, hips)
            key_points = [0, 11, 12, 23, 24]  # Indices for nose, shoulders, and hips
            visible_confidence = sum(landmarks[i].visibility for i in key_points) / len(key_points)

            if visible_confidence > 0.7:  # Only track if confident
                # Calculate center of mass from upper body landmarks for stability
                upper_body_x = np.mean([landmarks[i].x for i in key_points if landmarks[i].visibility > 0.5]) * w
                upper_body_y = np.mean([landmarks[i].y for i in key_points if landmarks[i].visibility > 0.5]) * h

                people.append((int(upper_body_x), int(upper_body_y), visible_confidence))

        return people

    def detect_motion(self, prev_frame, curr_frame):
        """Detect significant motion between frames using optical flow"""
        if prev_frame is None:
            return None

        # Convert frames to grayscale
        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        curr_gray = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)

        # Calculate sparse optical flow using Lucas-Kanade method
        # This is more stable than dense optical flow
        feature_params = dict(maxCorners=100, qualityLevel=0.3, minDistance=7, blockSize=7)
        prev_points = cv2.goodFeaturesToTrack(prev_gray, mask=None, **feature_params)

        if prev_points is None or len(prev_points) == 0:
            return None

        # Use Lucas-Kanade optical flow
        lk_params = dict(winSize=(15, 15), maxLevel=2,
                       criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))

        next_points, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, prev_points, None, **lk_params)

        if next_points is None:
            return None

        # Select good points
        good_new = next_points[status == 1]
        good_old = prev_points[status == 1]

        if len(good_new) == 0:
            return None

        # Calculate the movement vectors
        movements = good_new - good_old

        # Calculate magnitudes for each point
        magnitudes = np.sqrt(np.sum(movements**2, axis=1))

        # If no significant motion, return None
        if np.max(magnitudes) < 5:  # Threshold for significant motion
            return None

        # Find the point with maximum movement
        max_idx = np.argmax(magnitudes)
        motion_center = (int(good_new[max_idx][0][0]), int(good_new[max_idx][0][1]))

        return motion_center

    def _get_weighted_focus_point(self, frame, is_new_shot=False):
        """
        Calculate focus point using weighted detection results
        with temporal smoothing for stability
        """
        h, w, _ = frame.shape
        frame_center = (w // 2, h // 2)

        # Reset for new shots
        if is_new_shot:
            self.focus_point_history.clear()
            self.prev_focus_point = None
            self.is_static_scene = False
            self.static_scene_counter = 0
            self.static_scene_focus_point = None
            # Return frame center for first frame of a new shot
            return frame_center

        # Detect faces with higher priority
        faces = self.detect_faces(frame)
        faces = [f for f in faces if f[4] > self.min_confidence_threshold]  # Filter by confidence

        # If faces found, use them as primary focus
        if faces:
            # Use the largest/most confident face
            face_centers = [(f[0] + f[2]//2, f[1] + f[3]//2, f[4]) for f in faces]

            # Sort by confidence
            face_centers.sort(key=lambda x: x[2], reverse=True)

            # If multiple faces, use weighted average of top faces
            if len(face_centers) > 1:
                # Use top 2 faces only to avoid jumping between many faces
                top_faces = face_centers[:2]
                total_weight = sum(face[2] for face in top_faces)
                focus_x = sum(face[0] * face[2] for face in top_faces) / total_weight
                focus_y = sum(face[1] * face[2] for face in top_faces) / total_weight
                focus_point = (int(focus_x), int(focus_y))
            else:
                focus_point = (face_centers[0][0], face_centers[0][1])

            # Add to history with high confidence
            focus_confidence = 1.0

        else:
            # No faces, try pose detection
            people = self.detect_people(frame)

            if people:
                # Sort by confidence
                people.sort(key=lambda x: x[2], reverse=True)

                if len(people) > 1:
                    # Weighted average of top 2 people
                    top_people = people[:2]
                    total_weight = sum(person[2] for person in top_people)
                    focus_x = sum(person[0] * person[2] for person in top_people) / total_weight
                    focus_y = sum(person[1] * person[2] for person in top_people) / total_weight
                    focus_point = (int(focus_x), int(focus_y))
                else:
                    focus_point = (people[0][0], people[0][1])

                # Add to history with medium confidence
                focus_confidence = 0.8

            else:
                # No people, try motion detection only if not already in static scene mode
                if not self.is_static_scene and self.prev_frame is not None:
                    motion_center = self.detect_motion(self.prev_frame, frame)

                    if motion_center:
                        focus_point = motion_center
                        # Motion is less reliable, use lower confidence
                        focus_confidence = 0.6
                        # Reset static scene counter when motion detected
                        self.static_scene_counter = 0
                        self.is_static_scene = False
                    else:
                        # No motion detected, increment static counter
                        self.static_scene_counter += 1

                        # If static for multiple frames, switch to static scene mode
                        if self.static_scene_counter > 30:  # ~1 second at 30fps
                            self.is_static_scene = True

                            # Use scene analysis to find compositional focus point
                            if not self.static_scene_focus_point:
                                # Analyze frame for visual saliency
                                self.static_scene_focus_point = self.scene_analyzer.find_salient_region([frame])

                            focus_point = self.static_scene_focus_point
                            focus_confidence = 0.7  # Medium-high confidence once established
                        elif self.prev_focus_point:
                            # Use previous focus point with decay
                            focus_point = self.prev_focus_point
                            focus_confidence = 0.5  # Lower confidence for continued use
                        else:
                            # Fallback to center
                            focus_point = frame_center
                            focus_confidence = 0.3  # Low confidence
                else:
                    # Already in static scene mode or no previous frame
                    if self.is_static_scene and self.static_scene_focus_point:
                        focus_point = self.static_scene_focus_point
                        focus_confidence = 0.7
                    elif self.prev_focus_point:
                        focus_point = self.prev_focus_point
                        focus_confidence = 0.5
                    else:
                        focus_point = frame_center
                        focus_confidence = 0.3

        # Apply temporal smoothing for stability
        if self.prev_focus_point and not is_new_shot:
            # Add confidence-weighted current detection to history
            self.focus_point_history.append((focus_point, focus_confidence))

            # Calculate temporally smoothed position using weighted average
            # Weight more recent frames higher for responsiveness
            if len(self.focus_point_history) >= 3:
                total_weight = 0
                weighted_x = 0
                weighted_y = 0

                # Apply exponential weighting to favor more recent points
                for i, (point, conf) in enumerate(self.focus_point_history):
                    # More recent points get exponentially higher weights
                    recency_weight = np.exp(i / 10)  # Exponential growth factor
                    weight = conf * recency_weight
                    weighted_x += point[0] * weight
                    weighted_y += point[1] * weight
                    total_weight += weight

                smoothed_x = int(weighted_x / total_weight)
                smoothed_y = int(weighted_y / total_weight)
                smoothed_focus = (smoothed_x, smoothed_y)

                # Additional hysteresis: Move only part way to new position
                if self.prev_focus_point:
                    # Calculate distance
                    dist = np.sqrt((smoothed_focus[0] - self.prev_focus_point[0])**2 +
                                 (smoothed_focus[1] - self.prev_focus_point[1])**2)

                    # Apply stronger smoothing for small movements
                    if dist < 20:  # Small movement
                        # Move only 30% of the way to new position for small changes
                        lerp_factor = 0.3
                    elif dist < 50:  # Medium movement
                        # Move 50% of the way
                        lerp_factor = 0.5
                    else:  # Large movement - likely intentional
                        # Move 70% of the way
                        lerp_factor = 0.7

                    final_x = int(self.prev_focus_point[0] + lerp_factor * (smoothed_focus[0] - self.prev_focus_point[0]))
                    final_y = int(self.prev_focus_point[1] + lerp_factor * (smoothed_focus[1] - self.prev_focus_point[1]))
                    smoothed_focus = (final_x, final_y)
            else:
                smoothed_focus = focus_point
        else:
            smoothed_focus = focus_point
            self.focus_point_history.append((focus_point, focus_confidence))

        # Update previous focus point
        self.prev_focus_point = smoothed_focus
        return smoothed_focus

    def process_video(self, input_path, output_path, output_data_path=None):
        """
        Process video using optimized H2V algorithm with improved stability.

        Args:
            input_path: Path to input horizontal video
            output_path: Path to output vertical video
            output_data_path: Path to save focus point data (optional)
        """
        # Detect scenes first
        scenes = self.detect_scenes(input_path)
        print(f"Detected {len(scenes)} scenes")

        # Open input video
        cap = cv2.VideoCapture(input_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Prepare output video
        output_height = height
        output_width = int(height * 9/16)  # 9:16 aspect ratio

        # Use lossless intermediate codec
        fourcc = cv2.VideoWriter_fourcc(*'MJPG')
        temp_output_path = output_path + ".temp.avi"
        out = cv2.VideoWriter(temp_output_path, fourcc, fps, (output_width, output_height))

        # Track current scene
        current_scene_idx = 0
        current_frame_idx = 0
        buffer_frames = []  # For analyzing scene content before processing
        scene_buffer_size = min(30, int(fps))  # Buffer up to 1 second or 30 frames
        is_new_shot = True
        is_people_scene = True

        print("Processing video...")
        progress_bar = tqdm(total=total_frames)

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            current_frame_idx += 1
            progress_bar.update(1)

            # Check if we're entering a new scene
            if (current_scene_idx < len(scenes) and
                current_frame_idx >= scenes[current_scene_idx][1].frame_num):
                current_scene_idx += 1
                is_new_shot = True
                buffer_frames = []

                # Reset stabilization
                self.virtual_camera = VirtualCamera()
                self.focus_point_history.clear()
                self.prev_focus_point = None
                self.is_static_scene = False

                # Print scene transition
                print(f"\nTransitioning to scene {current_scene_idx}")

            # Buffer frames for scene analysis
            if len(buffer_frames) < scene_buffer_size:
                buffer_frames.append(frame.copy())

                # If we have enough frames, analyze scene content
                if len(buffer_frames) == scene_buffer_size:
                    scene_analysis = self.scene_analyzer.analyze_scene_frames(buffer_frames)
                    is_people_scene = scene_analysis["content_type"] == "people"

                    # Skip scenes without people if requested
                    if not is_people_scene:
                        print(f"Scene {current_scene_idx} has no people, marking for potential removal")
                        self.scenes_without_people.append(current_scene_idx)

            # Store previous frame for motion detection
            self.prev_frame = frame.copy()

            # Skip processing for scenes without people if desired
            # This is where you'd implement your second objective
            if not is_people_scene:
                # Option 1: Skip frame entirely (will make video shorter)
                # continue

                # Option 2: Use center crop for scenes without people
                focus_point = (width // 2, height // 2)
                is_new_shot = False
            else:
                # Get focus point with enhanced stability
                focus_point = self._get_weighted_focus_point(frame, is_new_shot)

            # Use virtual camera to smooth movement and reduce jitter
            camera_center = self.virtual_camera.update(focus_point, force_immediate=is_new_shot)

            # Apply smooth cropping
            cropped_frame, crop_data = self.smooth_cropper.apply_smooth_crop(
                frame, camera_center, force_update=is_new_shot,
                content_type="people" if is_people_scene else "other",
                new_shot=is_new_shot
            )

            # Reset new shot flag
            if is_new_shot:
                is_new_shot = False

            # Apply long-term stabilization occasionally during stable scenes
            if current_frame_idx % 30 == 0 and not is_new_shot:
                self.smooth_cropper.long_term_stabilization()

            # Resize to target output resolution if needed
            if cropped_frame.shape[1] != output_width or cropped_frame.shape[0] != output_height:
                cropped_frame = cv2.resize(cropped_frame, (output_width, output_height))

            # Store focus point data
            self.focus_points_data.append({
                "frame": current_frame_idx,
                "scene": current_scene_idx,
                "timestamp": current_frame_idx / fps,
                "focus_point": {"x": camera_center[0], "y": camera_center[1]},
                "crop": {
                    "left": crop_data["left"],
                    "right": crop_data["right"],
                    "top": crop_data["top"],
                    "bottom": crop_data["bottom"]
                }
            })

            # Write frame
            out.write(cropped_frame)

        # Release resources
        cap.release()
        out.release()
        progress_bar.close()

        # Convert to H.264 with appropriate bitrate
        #output_bitrate = "5M"  # Adjust as needed for quality
        #self._convert_to_mp4(temp_output_path, output_path, output_bitrate)

        # Combine the processed video with the original audio using FFmpeg
        self.combine_video_with_audio(input_path, temp_output_path, output_path)

        # Clean up temp file
        if os.path.exists(temp_output_path):
            os.remove(temp_output_path)

        # Save focus point data if requested
        if output_data_path:
            with open(output_data_path, 'w') as f:
                json.dump({
                    "focus_points": self.focus_points_data,
                    "scenes_without_people": self.scenes_without_people
                }, f, indent=2)

        print(f"Processing complete. Output saved to {output_path}")
        if output_data_path:
            print(f"Focus point data saved to {output_data_path}")

        return output_data_path

    def _convert_to_mp4(self, input_path, output_path, bitrate="5M"):
        """Convert video to mp4 with specified bitrate using ffmpeg"""
        cmd = [
            "ffmpeg", "-i", input_path,
            "-c:v", "libx264", "-preset", "slow",
            "-b:v", bitrate, "-maxrate", bitrate, "-bufsize", bitrate,
            "-pix_fmt", "yuv420p", "-y", output_path
        ]
        subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)


    def combine_video_with_audio(self, original_video, processed_video, output_path):
        """Combine the processed video with the original audio using FFmpeg"""
        try:
            print("Adding audio track to the processed video...")
            # Command to extract audio from original video and combine with new video
            ffmpeg_cmd = [
                'ffmpeg',
                '-i', processed_video,  # Input processed video (no audio)
                '-i', original_video,   # Input original video (for audio)
                '-c:v', 'copy',         # Copy video stream without re-encoding
                '-c:a', 'aac',          # Use AAC codec for audio
                '-map', '0:v:0',        # Use video from first input
                '-map', '1:a:0',        # Use audio from second input
                '-shortest',            # Finish encoding when the shortest input stream ends
                '-y',                   # Overwrite output file if it exists
                output_path
            ]

            # Run the FFmpeg command
            subprocess.run(ffmpeg_cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            print(f"Successfully added audio to: {output_path}")

        except subprocess.CalledProcessError as e:
            print(f"Error adding audio: {e}")
            print(f"FFmpeg stderr: {e.stderr.decode() if e.stderr else 'No error output'}")
            # If FFmpeg fails, just use the video without audio
            import shutil
            shutil.copy(processed_video, output_path)
            print(f"Using video without audio: {output_path}")
        except Exception as e:
            print(f"Unexpected error adding audio: {e}")
            # If another error occurs, just use the video without audio
            import shutil
            shutil.copy(processed_video, output_path)
            print(f"Using video without audio: {output_path}")


    def process_batch(self, input_dir, output_dir, threads=1):
        """Process multiple videos in batch mode with optional multithreading"""
        os.makedirs(output_dir, exist_ok=True)

        video_files = [f for f in os.listdir(input_dir)
                      if f.lower().endswith(('.mp4', '.avi', '.mov', '.mkv'))]

        print(f"Found {len(video_files)} videos to process")

        if threads > 1:
            # TODO: Implement parallel processing if needed
            pass
        else:
            for video_file in video_files:
                input_path = os.path.join(input_dir, video_file)
                output_name = f"{os.path.splitext(video_file)[0]}_vertical.mp4"
                output_path = os.path.join(output_dir, output_name)
                output_data_path = os.path.join(output_dir, f"{os.path.splitext(video_file)[0]}_data.json")

                print(f"Processing {video_file}...")
                self.process_video(input_path, output_path, output_data_path)


# Example usage
if __name__ == "__main__":
    processor = ImprovedH2VProcessor()

    input_video = "input_video.mp4"  # Replace with your video path
    output_video = "output_vertical_video.mp4"
    # Export focus points data as JSON
    #tracking_data_path = os.path.splitext(output_video)[0] + "_tracking.json"
    tracking_data = 'output_vertical_video_tracking.json'
    focus_points_tracking_data = processor.process_video(input_video, output_video, tracking_data)
    #print(f"Processing complete. Vertical video saved to: {video_path}")
    print(f"Tracking data saved to: {focus_points_tracking_data }")

ERROR:pyscenedetect:VideoManager is deprecated and will be removed.
INFO:pyscenedetect:Loaded 1 video, framerate: 23.976 FPS, resolution: 1920 x 1080
INFO:pyscenedetect:Detecting scenes...


Detected 150 scenes
Processing video...


  0%|          | 29/11231 [00:04<28:41,  6.51it/s]  

Scene 0 has no people, marking for potential removal

Transitioning to scene 1



  0%|          | 50/11231 [00:25<3:44:14,  1.20s/it]

Scene 1 has no people, marking for potential removal


  1%|          | 90/11231 [00:26<15:06, 12.29it/s]


Transitioning to scene 2


  1%|          | 122/11231 [00:28<10:44, 17.23it/s]


Transitioning to scene 3


  1%|▏         | 154/11231 [00:33<24:06,  7.66it/s]


Transitioning to scene 4


  2%|▏         | 176/11231 [00:36<27:17,  6.75it/s]


Transitioning to scene 5


  2%|▏         | 241/11231 [00:42<09:00, 20.32it/s]


Transitioning to scene 6


  3%|▎         | 331/11231 [00:51<15:45, 11.52it/s]


Transitioning to scene 7


  3%|▎         | 382/11231 [00:56<12:42, 14.22it/s]


Transitioning to scene 8


  4%|▍         | 463/11231 [01:00<04:40, 38.39it/s]


Transitioning to scene 9


  5%|▍         | 530/11231 [01:04<06:13, 28.67it/s]


Transitioning to scene 10


  5%|▍         | 554/11231 [01:08<40:51,  4.35it/s]  


Transitioning to scene 11


  5%|▌         | 581/11231 [01:31<5:07:26,  1.73s/it] 

Scene 11 has no people, marking for potential removal


  5%|▌         | 611/11231 [01:31<27:17,  6.49it/s]


Transitioning to scene 12


  6%|▌         | 694/11231 [01:39<15:26, 11.37it/s]


Transitioning to scene 13


  7%|▋         | 746/11231 [01:45<11:45, 14.85it/s]


Transitioning to scene 14


  7%|▋         | 792/11231 [01:50<08:17, 20.97it/s]


Transitioning to scene 15


  8%|▊         | 952/11231 [02:00<10:15, 16.70it/s]


Transitioning to scene 16


  9%|▉         | 1006/11231 [02:05<11:31, 14.79it/s]


Transitioning to scene 17


 10%|▉         | 1072/11231 [02:13<11:10, 15.15it/s]


Transitioning to scene 18


 11%|█         | 1220/11231 [02:25<11:20, 14.72it/s]


Transitioning to scene 19


 11%|█         | 1255/11231 [02:29<15:41, 10.60it/s]


Transitioning to scene 20


 11%|█▏        | 1271/11231 [02:31<15:54, 10.44it/s]


Transitioning to scene 21


 12%|█▏        | 1299/11231 [02:54<4:31:16,  1.64s/it] 

Scene 21 has no people, marking for potential removal


 12%|█▏        | 1327/11231 [02:54<25:55,  6.37it/s]


Transitioning to scene 22


 12%|█▏        | 1403/11231 [03:02<10:55, 14.98it/s]


Transitioning to scene 23


 13%|█▎        | 1451/11231 [03:07<11:33, 14.11it/s]


Transitioning to scene 24


 13%|█▎        | 1478/11231 [03:10<22:34,  7.20it/s]


Transitioning to scene 25


 13%|█▎        | 1507/11231 [03:12<15:55, 10.18it/s]


Transitioning to scene 26


 14%|█▍        | 1549/11231 [03:17<09:52, 16.34it/s]


Transitioning to scene 27


 14%|█▍        | 1585/11231 [03:20<12:38, 12.73it/s]


Transitioning to scene 28


 15%|█▍        | 1647/11231 [03:23<04:36, 34.63it/s]


Transitioning to scene 29


 15%|█▌        | 1697/11231 [03:29<10:59, 14.45it/s]


Transitioning to scene 30


 16%|█▌        | 1792/11231 [03:35<04:05, 38.48it/s]


Transitioning to scene 31


 17%|█▋        | 1880/11231 [03:41<13:19, 11.70it/s]


Transitioning to scene 32


 17%|█▋        | 1906/11231 [03:45<26:57,  5.76it/s]


Transitioning to scene 33


 17%|█▋        | 1950/11231 [03:51<12:58, 11.92it/s]


Transitioning to scene 34


 19%|█▉        | 2136/11231 [04:00<08:45, 17.32it/s]


Transitioning to scene 35


 20%|██        | 2270/11231 [04:13<06:56, 21.51it/s]


Transitioning to scene 36


 21%|██▏       | 2388/11231 [04:19<04:55, 29.98it/s]


Transitioning to scene 37


 22%|██▏       | 2455/11231 [04:27<15:52,  9.22it/s]


Transitioning to scene 38


 23%|██▎       | 2554/11231 [04:35<08:14, 17.55it/s]


Transitioning to scene 39


 25%|██▍       | 2756/11231 [04:55<11:23, 12.40it/s]


Transitioning to scene 40


 25%|██▌       | 2830/11231 [05:00<06:11, 22.61it/s]


Transitioning to scene 41


 26%|██▌       | 2883/11231 [05:05<06:23, 21.78it/s]


Transitioning to scene 42


 26%|██▋       | 2952/11231 [05:09<10:50, 12.74it/s]


Transitioning to scene 43


 28%|██▊       | 3101/11231 [05:18<06:25, 21.09it/s]


Transitioning to scene 44


 28%|██▊       | 3130/11231 [05:20<09:49, 13.75it/s]


Transitioning to scene 45


 29%|██▉       | 3233/11231 [05:26<04:19, 30.87it/s]


Transitioning to scene 46


 29%|██▉       | 3307/11231 [05:33<06:18, 20.93it/s]


Transitioning to scene 47


 30%|██▉       | 3358/11231 [05:37<04:48, 27.28it/s]


Transitioning to scene 48


 31%|███       | 3445/11231 [05:46<07:00, 18.53it/s]


Transitioning to scene 49


 31%|███       | 3477/11231 [05:49<10:02, 12.86it/s]


Transitioning to scene 50


 32%|███▏      | 3541/11231 [05:55<04:38, 27.65it/s]


Transitioning to scene 51


 32%|███▏      | 3570/11231 [05:57<10:05, 12.65it/s]


Transitioning to scene 52


 32%|███▏      | 3613/11231 [06:05<09:32, 13.31it/s]


Transitioning to scene 53


 33%|███▎      | 3692/11231 [06:12<08:31, 14.73it/s]


Transitioning to scene 54


 33%|███▎      | 3742/11231 [06:17<05:52, 21.24it/s]


Transitioning to scene 55


 34%|███▎      | 3778/11231 [06:21<08:43, 14.24it/s]


Transitioning to scene 56


 34%|███▍      | 3837/11231 [06:24<04:13, 29.15it/s]


Transitioning to scene 57


 35%|███▍      | 3884/11231 [06:29<09:47, 12.51it/s]


Transitioning to scene 58


 35%|███▌      | 3942/11231 [06:37<11:12, 10.83it/s]


Transitioning to scene 59


 36%|███▌      | 3997/11231 [06:44<08:13, 14.65it/s]


Transitioning to scene 60


 36%|███▌      | 4050/11231 [06:51<12:32,  9.55it/s]


Transitioning to scene 61


 38%|███▊      | 4220/11231 [07:05<07:34, 15.41it/s]


Transitioning to scene 62


 38%|███▊      | 4266/11231 [07:11<06:40, 17.38it/s]


Transitioning to scene 63


 39%|███▊      | 4332/11231 [07:18<07:34, 15.19it/s]


Transitioning to scene 64


 39%|███▉      | 4364/11231 [07:20<07:38, 14.96it/s]


Transitioning to scene 65


 39%|███▉      | 4408/11231 [07:28<09:48, 11.59it/s]


Transitioning to scene 66


 39%|███▉      | 4423/11231 [07:28<05:04, 22.36it/s]


Transitioning to scene 67


 40%|███▉      | 4443/11231 [07:30<12:00,  9.43it/s]


Transitioning to scene 68


 40%|████      | 4513/11231 [07:38<07:52, 14.22it/s]


Transitioning to scene 69


 41%|████      | 4599/11231 [07:48<09:10, 12.05it/s]


Transitioning to scene 70


 42%|████▏     | 4723/11231 [08:00<10:08, 10.70it/s]


Transitioning to scene 71


 43%|████▎     | 4781/11231 [08:09<07:28, 14.39it/s]


Transitioning to scene 72


 43%|████▎     | 4808/11231 [08:32<3:33:26,  1.99s/it] 

Scene 72 has no people, marking for potential removal

Transitioning to scene 73


 43%|████▎     | 4839/11231 [08:54<1:41:25,  1.05it/s]

Scene 73 has no people, marking for potential removal

Transitioning to scene 74


 43%|████▎     | 4879/11231 [08:58<21:28,  4.93it/s]


Transitioning to scene 75


 44%|████▎     | 4908/11231 [09:21<3:01:24,  1.72s/it] 

Scene 75 has no people, marking for potential removal


 44%|████▍     | 4958/11231 [09:22<06:13, 16.79it/s]


Transitioning to scene 76


 44%|████▍     | 4990/11231 [09:24<07:21, 14.14it/s]


Transitioning to scene 77


 45%|████▍     | 5018/11231 [09:50<3:14:51,  1.88s/it] 

Scene 77 has no people, marking for potential removal


 45%|████▌     | 5062/11231 [09:51<08:40, 11.86it/s]


Transitioning to scene 78


 45%|████▌     | 5097/11231 [09:54<08:21, 12.24it/s]


Transitioning to scene 79


 46%|████▌     | 5163/11231 [10:03<05:10, 19.52it/s]


Transitioning to scene 80


 47%|████▋     | 5237/11231 [10:09<11:34,  8.63it/s]


Transitioning to scene 81


 48%|████▊     | 5368/11231 [10:21<06:54, 14.13it/s]


Transitioning to scene 82


 48%|████▊     | 5397/11231 [10:24<11:16,  8.62it/s]


Transitioning to scene 83


 49%|████▉     | 5486/11231 [10:32<05:05, 18.78it/s]


Transitioning to scene 84


 49%|████▉     | 5530/11231 [10:36<04:35, 20.71it/s]


Transitioning to scene 85


 51%|█████     | 5713/11231 [10:51<02:55, 31.47it/s]


Transitioning to scene 86


 51%|█████▏    | 5768/11231 [10:55<07:28, 12.17it/s]


Transitioning to scene 87


 52%|█████▏    | 5803/11231 [10:58<06:16, 14.40it/s]


Transitioning to scene 88


 52%|█████▏    | 5823/11231 [11:00<13:33,  6.65it/s]


Transitioning to scene 89


 53%|█████▎    | 5905/11231 [11:08<04:03, 21.90it/s]


Transitioning to scene 90


 53%|█████▎    | 5933/11231 [11:10<06:37, 13.31it/s]


Transitioning to scene 91


 53%|█████▎    | 5991/11231 [11:17<04:29, 19.42it/s]


Transitioning to scene 92


 54%|█████▎    | 6030/11231 [11:20<03:55, 22.04it/s]


Transitioning to scene 93


 54%|█████▍    | 6087/11231 [11:24<05:05, 16.85it/s]


Transitioning to scene 94


 55%|█████▌    | 6180/11231 [11:34<05:35, 15.05it/s]


Transitioning to scene 95


 55%|█████▌    | 6229/11231 [11:39<05:35, 14.91it/s]


Transitioning to scene 96


 57%|█████▋    | 6347/11231 [11:50<04:12, 19.35it/s]


Transitioning to scene 97


 57%|█████▋    | 6389/11231 [11:53<04:28, 18.03it/s]


Transitioning to scene 98


 58%|█████▊    | 6519/11231 [12:05<03:53, 20.17it/s]


Transitioning to scene 99


 58%|█████▊    | 6540/11231 [12:07<11:51,  6.59it/s]


Transitioning to scene 100


 59%|█████▉    | 6651/11231 [12:17<05:33, 13.73it/s]


Transitioning to scene 101


 61%|██████    | 6805/11231 [12:31<05:11, 14.19it/s]


Transitioning to scene 102


 61%|██████▏   | 6880/11231 [12:36<01:54, 38.06it/s]


Transitioning to scene 103


 63%|██████▎   | 7042/11231 [12:47<05:19, 13.13it/s]


Transitioning to scene 104


 63%|██████▎   | 7093/11231 [12:50<05:29, 12.57it/s]


Transitioning to scene 105


 64%|██████▍   | 7161/11231 [12:58<03:47, 17.93it/s]


Transitioning to scene 106


 64%|██████▍   | 7227/11231 [13:01<02:01, 32.99it/s]


Transitioning to scene 107


 66%|██████▌   | 7423/11231 [13:08<01:36, 39.34it/s]


Transitioning to scene 108


 67%|██████▋   | 7517/11231 [13:12<02:29, 24.80it/s]


Transitioning to scene 109


 67%|██████▋   | 7573/11231 [13:15<01:57, 31.13it/s]


Transitioning to scene 110


 68%|██████▊   | 7634/11231 [13:18<01:45, 34.05it/s]


Transitioning to scene 111


 69%|██████▉   | 7783/11231 [13:23<01:30, 37.90it/s]


Transitioning to scene 112


 70%|███████   | 7913/11231 [13:29<01:22, 40.25it/s]


Transitioning to scene 113


 71%|███████   | 7944/11231 [13:31<03:23, 16.11it/s]


Transitioning to scene 114


 73%|███████▎  | 8158/11231 [13:44<01:33, 32.71it/s]


Transitioning to scene 115


 74%|███████▍  | 8291/11231 [13:49<01:11, 41.26it/s]


Transitioning to scene 116


 74%|███████▍  | 8335/11231 [13:52<02:31, 19.14it/s]


Transitioning to scene 117


 75%|███████▍  | 8375/11231 [13:55<02:23, 19.87it/s]


Transitioning to scene 118


 76%|███████▌  | 8483/11231 [14:04<02:34, 17.76it/s]


Transitioning to scene 119


 76%|███████▌  | 8558/11231 [14:09<01:25, 31.15it/s]


Transitioning to scene 120


 76%|███████▋  | 8591/11231 [14:11<02:32, 17.32it/s]


Transitioning to scene 121


 78%|███████▊  | 8763/11231 [14:17<01:30, 27.29it/s]


Transitioning to scene 122


 79%|███████▉  | 8878/11231 [14:22<01:02, 37.81it/s]


Transitioning to scene 123


 80%|████████  | 8994/11231 [14:26<00:57, 38.91it/s]


Transitioning to scene 124


 80%|████████  | 9017/11231 [14:28<03:57,  9.31it/s]


Transitioning to scene 125


 81%|████████  | 9087/11231 [14:38<01:49, 19.58it/s]


Transitioning to scene 126


 82%|████████▏ | 9224/11231 [14:44<02:37, 12.73it/s]


Transitioning to scene 127


 83%|████████▎ | 9349/11231 [14:57<03:47,  8.29it/s]


Transitioning to scene 128


 84%|████████▍ | 9430/11231 [15:05<01:23, 21.45it/s]


Transitioning to scene 129


 85%|████████▍ | 9512/11231 [15:09<01:05, 26.28it/s]


Transitioning to scene 130


 86%|████████▌ | 9653/11231 [15:22<02:02, 12.90it/s]


Transitioning to scene 131


 86%|████████▋ | 9711/11231 [15:25<00:47, 31.75it/s]


Transitioning to scene 132


 88%|████████▊ | 9858/11231 [15:30<00:34, 39.61it/s]


Transitioning to scene 133


 88%|████████▊ | 9916/11231 [15:33<00:36, 35.84it/s]


Transitioning to scene 134


 90%|████████▉ | 10059/11231 [15:46<00:57, 20.46it/s]


Transitioning to scene 135


 90%|█████████ | 10132/11231 [15:50<00:31, 35.00it/s]


Transitioning to scene 136


 91%|█████████ | 10171/11231 [15:53<00:49, 21.48it/s]


Transitioning to scene 137


 91%|█████████ | 10236/11231 [15:57<00:39, 24.91it/s]


Transitioning to scene 138


 92%|█████████▏| 10282/11231 [15:59<00:35, 26.61it/s]


Transitioning to scene 139


 93%|█████████▎| 10467/11231 [16:06<00:19, 38.69it/s]


Transitioning to scene 140


 94%|█████████▎| 10514/11231 [16:09<00:26, 26.92it/s]


Transitioning to scene 141


 95%|█████████▍| 10640/11231 [16:16<00:24, 23.83it/s]


Transitioning to scene 142


 97%|█████████▋| 10851/11231 [16:23<00:12, 30.31it/s]


Transitioning to scene 143


 98%|█████████▊| 10966/11231 [16:34<00:12, 20.58it/s]


Transitioning to scene 144


 98%|█████████▊| 11001/11231 [16:37<00:12, 18.78it/s]


Transitioning to scene 145


 98%|█████████▊| 11053/11231 [16:39<00:05, 30.30it/s]


Transitioning to scene 146


 99%|█████████▉| 11103/11231 [16:45<00:06, 18.44it/s]


Transitioning to scene 147


 99%|█████████▉| 11172/11231 [16:49<00:02, 28.69it/s]


Transitioning to scene 148


100%|█████████▉| 11203/11231 [16:51<00:02, 13.01it/s]


Transitioning to scene 149


100%|██████████| 11231/11231 [16:55<00:00, 11.05it/s]



Transitioning to scene 150
Adding audio track to the processed video...
Successfully added audio to: output_vertical_video.mp4
Processing complete. Output saved to output_vertical_video.mp4
Focus point data saved to output_vertical_video_tracking.json
Tracking data saved to: output_vertical_video_tracking.json


### **Play Portrait Video**

In [None]:
from IPython.display import HTML
from base64 import b64encode

# Replace 'your_video.mp4' with the actual path to your video file
mp4 = open('output_vertical_video.mp4','rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)


Buffered data was truncated after reaching the output size limit.

### **Download Portrait Video and Tracking file**

In [44]:
import os
import cv2
import numpy as np
import scenedetect
from scenedetect import VideoManager, SceneManager
from scenedetect.detectors import ContentDetector
import mediapipe as mp
import json
import subprocess
from tqdm import tqdm
import collections
import shutil
from google.colab import files
from IPython.display import HTML
from base64 import b64encode


# Replace 'your_video.mp4' with the actual path to your video file
mp4 = open('output_vertical_video.mp4','rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

# Download the video file
files.download('output_vertical_video.mp4')

# Assuming your tracking data is in a file named 'tracking_data.json'
# Replace 'tracking_data.json' with the actual file name if different
try:
    files.download('output_vertical_video_tracking.json')
except FileNotFoundError:
    print("tracking_data.json not found. Please make sure the file exists.")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### **Add Portrait Video to Google Drive Folder**

In [53]:
from google.colab import drive
drive.mount('/content/Mydrive')

# Replace 'your_folder_name' with the actual folder name in your Google Drive
# where you want to save the files.
folder_name = 'H2V-Pipeline-portrait-video'
output_folder = f'{folder_name}'

import os

# Create the folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Copy the files to the specified folder in Google Drive.
# Replace 'output_vertical_video.mp4' and 'output_vertical_video_tracking.json'
# with the correct file names, if necessary.
try:
    shutil.copy('output_vertical_video.mp4', output_folder)
    print(f"output_vertical_video.mp4 successfully copied to {output_folder}")
except FileNotFoundError:
    print("output_vertical_video.mp4 not found.")

try:
    shutil.copy('output_vertical_video_tracking.json', output_folder)
    print(f"output_vertical_video_tracking.json successfully copied to {output_folder}")
except FileNotFoundError:
    print("output_vertical_video_tracking.json not found.")


MessageError: Error: credential propagation was unsuccessful

In [50]:
pwd

'/content'