In [1]:
import os
import cv2
import h5py
import torch
import numpy as np
from tqdm import tqdm
from torchvision import models, transforms

In [15]:
# === Configuration ===
# Directory that is scanned for input videos.
video_folder = "./Videos/"
# Directory that receives one .h5 file per processed video.
output_folder = "./h5file_folder/"
# Prefix prepended to every generated .h5 file name.
dataset_prefix = "custom_dataset"
# Target sampling rate (frames per second) used to derive the frame stride.
fps = 15

In [17]:
# Create the destination directory for the .h5 files up front;
# exist_ok=True makes this cell safe to re-run.
os.makedirs(name=output_folder, exist_ok=True)

In [19]:
# === GoogLeNet Feature Extractor (classifier head removed) ===
# Load the ImageNet-pretrained GoogLeNet and keep every top-level child
# module except the last two, chained in definition order.
# NOTE(review): in torchvision's pretrained GoogLeNet the last two non-None
# children appear to be `dropout` and `fc` (the auxiliary heads are None),
# which would leave `avgpool` inside the extractor -- confirm against the
# printed module list below.
# NOTE(review): wrapping the children in nn.Sequential bypasses
# GoogLeNet.forward(), including any input re-normalisation performed there
# (`transform_input`) -- verify this matches the intended preprocessing.
googlenet = models.googlenet(pretrained=True)
backbone_layers = list(googlenet.children())[:-2]
feature_extractor = torch.nn.Sequential(*backbone_layers)
# Switch to inference mode (fixed BatchNorm statistics, no dropout); the
# returned module is the cell's displayed output.
feature_extractor.eval()



Sequential(
  (0): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (4): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (5): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (bra

In [21]:
# Per-channel mean / std applied after scaling pixels to [0, 1]
# (the standard ImageNet statistics used with torchvision's pretrained models).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Preprocessing applied to each RGB frame (H x W x 3 ndarray) before the CNN:
# ndarray -> PIL image -> 224x224 resize -> float tensor -> normalisation.
preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=channel_mean, std=channel_std),
]
transform = transforms.Compose(preprocessing_steps)

In [25]:
# === Video Processing Loop ===
# For every video in `video_folder`: sample frames at roughly `fps` frames per
# second, run each sampled frame through `feature_extractor`, and write the
# stacked features plus bookkeeping arrays to one .h5 file in `output_folder`.
#
# The save step lives inside the loop body (same cell) so that every video is
# written; previously it sat in a separate, indented cell and never ran.

# File extensions (lower-case) that are treated as videos.
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov', '.mkv')


def _sampling_interval(original_fps, target_fps):
    """Return the frame stride used for sub-sampling; always at least 1.

    Args:
        original_fps: Frame rate reported by the container. OpenCV reports 0
            (or occasionally NaN) when the rate is unknown.
        target_fps: Desired sampling rate in frames per second.

    Returns:
        ``round(original_fps / target_fps)`` clamped to a minimum of 1, or 1
        when either rate is unusable. A stride of 0 would make the
        ``index % stride`` test raise ZeroDivisionError.
    """
    # NaN fails the chained comparison, so it also falls back to 1.
    if not (0 < original_fps < float('inf')) or target_fps <= 0:
        return 1
    return max(1, int(round(original_fps / target_fps)))


def _extract_video_features(video_path, target_fps, description=None):
    """Decode one video and compute a feature vector for each sampled frame.

    Args:
        video_path: Path of the video file to read.
        target_fps: Desired sampling rate in frames per second.
        description: Optional label shown on the progress bar.

    Returns:
        ``(features, picks, n_frames)`` where ``features`` is a float32 array
        with one row per sampled frame, ``picks`` holds the source-frame index
        of each row, and ``n_frames`` is the number of frames decoded.
        Returns ``None`` if the file cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        return None

    sampled_features = []
    sampled_indices = []
    frame_index = 0
    try:
        stride = _sampling_interval(cap.get(cv2.CAP_PROP_FPS), target_fps)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # The frame count is only a container hint; fall back to an
        # open-ended progress bar when it is missing.
        bar_total = total_frames if total_frames > 0 else None
        with tqdm(total=bar_total, desc=description, unit='frame') as progress, torch.no_grad():
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_index % stride == 0:
                    # OpenCV decodes to BGR; the transform expects RGB.
                    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    input_tensor = transform(frame_rgb).unsqueeze(0)
                    feature_map = feature_extractor(input_tensor)  # shape: [1, C, H, W]
                    # Global average over the spatial dims -> one C-vector.
                    feature = feature_map.mean(dim=[2, 3]).squeeze(0)
                    sampled_features.append(feature.numpy())
                    sampled_indices.append(frame_index)
                frame_index += 1
                progress.update(1)
    finally:
        # Release the decoder even if preprocessing or inference fails.
        cap.release()

    if not sampled_features:
        # np.vstack raises ValueError on an empty list.
        return None

    stacked = np.vstack(sampled_features).astype(np.float32)
    return stacked, np.array(sampled_indices), frame_index


# Sorted so that the processing order is deterministic across runs.
for filename in sorted(os.listdir(video_folder)):
    if not filename.lower().endswith(VIDEO_EXTENSIONS):
        continue

    video_path = os.path.join(video_folder, filename)
    video_name = os.path.splitext(filename)[0]
    output_path = os.path.join(output_folder, f'{dataset_prefix}_{video_name}.h5')

    print(f'Processing {video_name}...')
    extracted = _extract_video_features(video_path, fps, description=video_name)
    if extracted is None:
        print(f'Skipped {filename}: could not open the video or no frames were decoded.')
        continue

    features, picks, n_frames = extracted
    user_summary = np.zeros((1, len(picks)))  # Dummy summary

    # NOTE(review): the dataset key is 'n_frame' (singular); confirm this is
    # the name the downstream reader expects before changing it.
    with h5py.File(output_path, 'w') as hdf:
        hdf.create_dataset('features', data=features)
        hdf.create_dataset('picks', data=picks)
        hdf.create_dataset('n_frame', data=n_frames)
        hdf.create_dataset('user_summary', data=user_summary)
        hdf.create_dataset('video_name', data=video_name.encode())

    print(f'Saved: {output_path}')

print("✅ Done: All videos converted.")