- Use our cropped data
    - Use YOLO/OpenCV/etc. to detect people
    - Use post-processing to keep the crop area's aspect ratio (w/h) at or above 0.6
    - Modify the transformation code (class CenterCrop(RandomCrop)) in the official mmaction
        - 'mmaction2/mmaction/datasets/transforms/processing.py'
        - The original official class has been commented out
        
- The following is a simple code sample; our cropped data is provided in the attachment.

- **If you don't crop the videos (and use the pipeline below), the results are comparable**
    | Model | Test F1_mean (TrainVal) |Test F1_mean (Train) | Val F1_mean | Crop |
    | :-: | :-: | :-: | :-: | :-: | 
    | VideoMAE-Base-K710-Frame-Assisting-I3D-Head |-| 71.11 | 71.87 | No |
    | VideoMAE-Base-K710-Frame-Assisting-I3D-Head |72.81| 71.14 | 72.33 | Yes|
    
```python
# NOTE: `file_client_args` and `num_frames` are not defined in this snippet;
# they must be provided by the surrounding config.

# Training pipeline: sample a single clip of `num_frames` frames, decode it,
# apply the augmentations (random resized crop, colour jitter, random erasing,
# flip with probability 0.5) and pack the result in NCTHW layout.
train_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='UniformSample', clip_len=num_frames, num_clips=1),
    # Alternative sampler, kept for reference:
    # dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=1),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='RandomResizedCrop', area_range=(0.3, 1.0)),
    # keep_ratio=False: the crop is resized to exactly 224x224.
    dict(type='Resize', scale=(224, 224), keep_ratio=False),
    dict(type='ColorJitter'),
    dict(type='RandomErasing', max_area_ratio=0.2),
    dict(type='Flip', flip_ratio=0.5),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
# Validation pipeline: two clips sampled in test mode, no random augmentation,
# a single 224 center crop per frame.
val_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='UniformSample', clip_len=num_frames, num_clips=2, test_mode=True),
    # Alternative sampler, kept for reference:
    # dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=2, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, 256)),
    dict(type='CenterCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
# Test pipeline: same sampling as validation, but the Resize(256) + CenterCrop
# pair is replaced by Resize(224) + ThreeCrop (the center-crop variant is left
# commented out below).
test_pipeline = [
    dict(type='DecordInit', **file_client_args),
    dict(type='UniformSample', clip_len=num_frames, num_clips=2, test_mode=True),
    # Alternative sampler, kept for reference:
    # dict(type='SampleFrames', clip_len=16, frame_interval=4, num_clips=2, test_mode=True),
    dict(type='DecordDecode'),
    # dict(type='Resize', scale=(-1, 256)),
    # dict(type='CenterCrop', crop_size=224),
    dict(type='Resize', scale=(-1, 224)),
    dict(type='ThreeCrop', crop_size=224),
    dict(type='FormatShape', input_format='NCTHW'),
    dict(type='PackActionInputs')
]
```

In [None]:
import cv2
import numpy as np
import os
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor

def find_red_object(frame):
    """Return the outer contours of the red regions in a BGR frame.

    The frame is converted to HSV and thresholded against two hue bands
    (0-6 and 174-180, both with saturation >= 120 and value >= 50). The
    combined mask is dilated three times and eroded twice with a 5x5 kernel
    before the external contours are extracted.

    Args:
        frame: image array, interpreted as BGR by the colour conversion.

    Returns:
        The contours reported by ``cv2.findContours`` for the cleaned mask.
    """
    hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)

    # (lower, upper) HSV bounds for each red band. A wider setting
    # (hue 0-10 / 170-180, saturation >= 70) was used previously.
    red_bands = (
        (np.array([0, 120, 50]), np.array([6, 255, 255])),
        (np.array([174, 120, 50]), np.array([180, 255, 255])),
    )
    low_band, high_band = (
        cv2.inRange(hsv_frame, lower, upper) for lower, upper in red_bands
    )
    red_mask = low_band + high_band

    # Dilate more than we erode so nearby fragments merge into one blob.
    structuring = np.ones((5, 5), np.uint8)
    grown = cv2.dilate(red_mask, structuring, iterations=3)
    cleaned = cv2.erode(grown, structuring, iterations=2)

    outlines, _hierarchy = cv2.findContours(
        cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    return outlines

def determine_crop_region(video_path):
    """Compute one crop rectangle that covers the red regions of a whole video.

    Every frame is scanned with ``find_red_object``; the union of all contour
    bounding boxes is then enlarged by fixed margins (60 px left, 100 px top,
    80 px right, 60 px bottom), clamped to the frame, and widened symmetrically
    when its width/height ratio is below 0.6.

    If no red region is found in any frame (or the clamped box is degenerate),
    the full frame is returned instead of a rectangle built from the +/-inf
    sentinels.

    Args:
        video_path: path of the video to analyse.

    Returns:
        ``(x, y, w, h)`` as Python ints: top-left corner and size of the crop.

    Raises:
        ValueError: if no frame could be read from ``video_path``.
    """
    cap = cv2.VideoCapture(video_path)
    min_x, min_y = np.inf, np.inf
    max_x, max_y = -np.inf, -np.inf
    shape = None
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            shape = frame.shape
            for contour in find_red_object(frame):
                x, y, w, h = cv2.boundingRect(contour)
                min_x = min(min_x, x)
                min_y = min(min_y, y)
                max_x = max(max_x, x + w)
                max_y = max(max_y, y + h)
    finally:
        # Release the capture even if detection raised.
        cap.release()

    if shape is None:
        # Previously this surfaced as an UnboundLocalError on `shape`.
        raise ValueError(f"Could not read any frame from video: {video_path}")

    frame_h, frame_w = shape[:2]
    full_frame = (0, 0, int(frame_w), int(frame_h))

    if not np.isfinite(min_x):
        # No contour was ever found: the sentinels are still +/-inf.
        return full_frame

    # Enlarge the detected box by fixed margins and clamp it to the frame.
    min_x = max(0, min_x - 60)
    min_y = max(0, min_y - 100)
    max_x = min(frame_w - 1, max_x + 80)
    max_y = min(frame_h - 1, max_y + 60)
    w = max_x - min_x
    h = max_y - min_y
    if w <= 0 or h <= 0:
        # Degenerate box (e.g. a one-pixel-high frame); avoids dividing by zero.
        return full_frame

    if w / h < 0.6:
        # Too narrow: pad both sides equally to approach a 0.6 ratio; clamping
        # at the frame border may leave the final ratio below 0.6.
        padding = int((h * 0.6 - w) / 2)
        min_x = max(0, min_x - padding)
        max_x = min(frame_w - 1, max_x + padding)
        w = max_x - min_x

    return int(min_x), int(min_y), int(w), int(h)

def crop_video_to_region(video_path, output_path, crop_region):
    """Write a cropped copy of a video and verify the result.

    Each frame of ``video_path`` is cropped to ``crop_region`` and written to
    ``output_path`` with the ``mp4v`` codec at the source frame rate. The
    output is then checked: it must exist, be at least 1000 bytes, be
    openable, and report at least 90% of the source frame count.

    Args:
        video_path: path of the source video.
        output_path: path of the cropped video to create.
        crop_region: ``(x, y, w, h)`` crop rectangle in pixels.

    Returns:
        True if the cropped video was written and passed verification,
        False otherwise (a message is printed describing the failure).
    """
    # The writer needs plain ints for the frame size; slicing needs ints too.
    x, y, w, h = (int(v) for v in crop_region)

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Failed to open video: {video_path}")
        cap.release()
        return False

    out = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (w, h))
        if not out.isOpened():
            print(f"Failed to open video writer: {output_path}")
            return False

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame[y:y + h, x:x + w])
    finally:
        # Always release both handles, including on exceptions.
        cap.release()
        if out is not None:
            out.release()

    # Verify the output video. A missing file is a failure, not a success.
    if not os.path.exists(output_path):
        print(f"Error: Output video {output_path} was not created.")
        return False

    size = os.path.getsize(output_path)
    if size < 1000:  # Arbitrarily chosen minimum file size (in bytes)
        print(f"Warning: The output file {output_path} is unusually small ({size} bytes)")
        return False

    cap_check = cv2.VideoCapture(output_path)
    try:
        # Require at least 90% of the source frames to be present.
        complete = (
            cap_check.isOpened()
            and cap_check.get(cv2.CAP_PROP_FRAME_COUNT) >= frame_count * 0.9
        )
    finally:
        cap_check.release()

    if not complete:
        print(f"Error: Output video {output_path} is incomplete or corrupted.")
        return False
    return True

def process_video_file(video_file):
    """Detect the crop region of one video and write its cropped copy.

    The video is read from the module-level ``input_dir`` and the cropped
    version is written under the same file name in ``output_dir``.

    Args:
        video_file: file name of the video, relative to ``input_dir``.

    Returns:
        ``[w, h]``: width and height of the crop region that was used.

    Raises:
        RuntimeError: if the cropped video could not be written or verified.
    """
    video_path = os.path.join(input_dir, video_file)
    output_path = os.path.join(output_dir, video_file)
    crop_region = determine_crop_region(video_path)
    if not crop_video_to_region(video_path, output_path, crop_region):
        # Name the offending file so a failure in the worker pool is traceable.
        raise RuntimeError(f"Crop video to region failed: {video_file}")
    return [crop_region[2], crop_region[3]]

# Source videos are read from input_dir; cropped copies go to output_dir.
input_dir = 'autodl-tmp/train'
output_dir = 'autodl-tmp/all_clip'
# exist_ok=True avoids the race between checking for and creating the directory.
os.makedirs(output_dir, exist_ok=True)

video_files = [f for f in os.listdir(input_dir) if f.endswith('.mp4')]

# Process the videos concurrently; executor.map re-raises the first worker
# exception when its results are consumed.
with ThreadPoolExecutor(max_workers=30) as executor:
    results = list(tqdm(executor.map(process_video_file, video_files), total=len(video_files)))

# Each result is [crop_width, crop_height]; report the mean crop size.
results = np.array(results)
if results.size:
    print(results.mean(0))
else:
    # Averaging an empty array would only print nan with a RuntimeWarning.
    print(f"No .mp4 files found in {input_dir}")
print("All videos have been processed and saved to", output_dir)