In [1]:
import os
import cv2
import h5py
import torch
import numpy as np
from tqdm import tqdm
from torchvision import models, transforms

In [2]:
from generate_summary import generate_summary
from evaluation_metrics import evaluate_summary
from layers.summarizer import PGL_SUM

In [3]:
# === Configuration ===
video_folder = './Videos/'  # Folder containing your input videos
h5_file_path = os.path.join("h5file_folder", "my_data.h5")  # Output .h5 files
dataset_prefix = 'custom_dataset_01'
fps = 15  # Target frame sampling rate

In [4]:
# os.makedirs(h5_file_path, exist_ok=True)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
# 1. Load pretrained GoogLeNet
googlenet = models.googlenet(pretrained=True)
googlenet = googlenet.to(device).eval()



In [7]:
# 2. Modify it to return features from 'avgpool' (1024-dim)
feature_extractor = torch.nn.Sequential(
    googlenet.conv1,
    googlenet.maxpool1,
    googlenet.conv2,
    googlenet.conv3,
    googlenet.maxpool2,
    googlenet.inception3a,
    googlenet.inception3b,
    googlenet.maxpool3,
    googlenet.inception4a,
    googlenet.inception4b,
    googlenet.inception4c,
    googlenet.inception4d,
    googlenet.inception4e,
    googlenet.maxpool4,
    googlenet.inception5a,
    googlenet.inception5b,
    googlenet.avgpool,  # shape: [1, 1024, 1, 1]
    torch.nn.Flatten(),  # shape: [1, 1024]
)

In [17]:
# 3. Image pre-processing transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet means
        std=[0.229, 0.224, 0.225]    # ImageNet stds
    ),
])

In [19]:
from PIL import Image

In [21]:
# 4. Extract features from video
def extract_video_features(video_path, frame_rate=15):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frames = []
    picks = []
    count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if int(count % round(fps // frame_rate)) == 0:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            image = Image.fromarray(frame_rgb)
            input_tensor = transform(image).unsqueeze(0).to(device)

            with torch.no_grad():
                feature = feature_extractor(input_tensor)  # shape: [1, 1024]
                frames.append(feature.squeeze(0).cpu().numpy())  # shape: [1024]
                picks.append(count)
        count += 1

    cap.release()
    return frames, picks


In [23]:
video_path = "./Videos/1585497-hd_1920_1080_30fps.mp4"  # replace with your actual video path
frames, picks = extract_video_features(video_path)
frames_np = np.stack(frames)  # shape: [T, 1024]
frames_tensor = torch.tensor(frames_np, dtype=torch.float32).to(device)


In [12]:
# model = PGL_SUM(input_size=1024, output_size=1024, num_segments=4, heads=8,
#                                 fusion="add", pos_enc="absolute")

### Here i implement the PGL-trained model with best epoch

In [25]:
import torch
from os.path import join

# Assuming you already have PGL_SUM class defined/imported
trained_model = PGL_SUM(
    input_size=1024,
    output_size=1024,
    num_segments=4,
    heads=8,
    fusion="add",
    pos_enc="absolute"
)

In [35]:
# Load the model weights
model_path = "./Model/epoch-199.pkl"       # e.g., './models'
trained_model.load_state_dict(torch.load(model_path, map_location='cpu'))
trained_model.eval()

PGL_SUM(
  (attention): MultiAttention(
    (attention): SelfAttention(
      (Wk): ModuleList(
        (0-7): 8 x Linear(in_features=1024, out_features=128, bias=False)
      )
      (Wq): ModuleList(
        (0-7): 8 x Linear(in_features=1024, out_features=128, bias=False)
      )
      (Wv): ModuleList(
        (0-7): 8 x Linear(in_features=1024, out_features=128, bias=False)
      )
      (out): Linear(in_features=1024, out_features=1024, bias=False)
      (softmax): Softmax(dim=-1)
      (drop): Dropout(p=0.5, inplace=False)
    )
    (local_attention): ModuleList(
      (0-3): 4 x SelfAttention(
        (Wk): ModuleList(
          (0-3): 4 x Linear(in_features=1024, out_features=64, bias=False)
        )
        (Wq): ModuleList(
          (0-3): 4 x Linear(in_features=1024, out_features=64, bias=False)
        )
        (Wv): ModuleList(
          (0-3): 4 x Linear(in_features=1024, out_features=64, bias=False)
        )
        (out): Linear(in_features=256, out_features=1024, 

In [37]:
with torch.no_grad():
    scores, _ = trained_model(frames_tensor)
    print("Video score:", scores)


Video score: tensor([[0.2575, 0.2028, 0.3044,  ..., 0.1520, 0.1577, 0.1427]])


In [13]:
def get_change_points(picks, n_frames, segments=5):
    seg_len = len(picks) // segments
    change_points = []
    frame_per_seg = []
    for i in range(segments):
        start = i * seg_len
        end = (i + 1) * seg_len - 1 if i < segments - 1 else len(picks) - 1
        change_points.append([picks[start], picks[end]])
        frame_per_seg.append(picks[end] - picks[start] + 1)
    return np.array(change_points), np.array(frame_per_seg)

### This is the score i get earlier 

In [14]:
with torch.no_grad():
    scores, _ = model(frames_tensor)

In [15]:
scores

tensor([[0.3929, 0.3536, 0.7351,  ..., 0.4774, 0.4257, 0.3886]])

In [16]:
# H5 Creation
with h5py.File(h5_file_path, 'w') as f:
    for filename in os.listdir(video_folder):
        if not filename.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
            continue

        video_path = os.path.join(video_folder, filename)
        

        cap = cv2.VideoCapture(video_path)
        original_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        sampling_interval = int(round(original_fps / fps))

        frames = []
        picks = []
        count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if count % sampling_interval == 0:
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image = Image.fromarray(frame_rgb)
                input_tensor = transform(image).unsqueeze(0).to(device)
                with torch.no_grad():
                    feature_map = feature_extractor(input_tensor)
                    feature = feature_map.squeeze(0)
                    frames.append(feature.cpu().numpy())
                    picks.append(count)
            count += 1

        cap.release()
        features = np.vstack(frames).astype(np.float32)
        picks = np.array(picks)
        n_frames = total_frames
        n_steps = len(picks)

        # Segments
        change_points, n_frame_per_seg = get_change_points(picks, n_frames)
        
        # Generate user summary
        user_summary = generate_summary([change_points], [scores], [n_frames], [picks])[0] 
        
        # Scores from model
        with torch.no_grad():
            scores, _ = model(frames_tensor)  # [1, seq_len]
            #scores = scores.squeeze(0).cpu().numpy().tolist()
            #summary = generate_summary([change_points], [scores], [n_frames], [picks])[0]
            #f_score = evaluate_summary(summary, user_summary, eval_method)
            #video_fscores.append(f_score)

        

ValueError: could not broadcast input array from shape (1260,) into shape (2,)

In [None]:
        

        # Save everything
        f.create_dataset(video_name + '/features', data=features)
        f.create_dataset(video_name + '/gtscore', data=scores)
        f.create_dataset(video_name + '/user_summary', data=user_summary)
        f.create_dataset(video_name + '/change_points', data=change_points)
        f.create_dataset(video_name + '/n_frame_per_seg', data=n_frame_per_seg)
        f.create_dataset(video_name + '/n_frames', data=n_frames)
        f.create_dataset(video_name + '/picks', data=picks)
        f.create_dataset(video_name + '/n_steps', data=n_steps)
        f.create_dataset(video_name + '/gtsummary', data=user_summary)
        f.create_dataset(video_name + '/video_name', data=np.string_(video_name))

## Perivious Work 

In [None]:
# # === GoogLeNet Feature Extractor (before avgpool & FC) ===
# googlenet = models.googlenet(pretrained=True)
# feature_extractor = torch.nn.Sequential(*list(googlenet.children())[:-2])
# feature_extractor.eval()

In [None]:
# transform = transforms.Compose([
#     transforms.ToPILImage(),
#     transforms.Resize((224, 224)),
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406],
#                          std=[0.229, 0.224, 0.225])
# ])

In [None]:
# # === Video Processing Loop ===
# for filename in os.listdir(video_folder):
#     if not filename.lower().endswith(('.mp4', '.avi', '.mov', '.mkv')):
#         continue

#     video_path = os.path.join(video_folder, filename)
#     video_name = os.path.splitext(filename)[0]
#     output_path = os.path.join(output_folder, f'{dataset_prefix}_{video_name}.h5')

#     print(f'Processing {video_name}...')
#     cap = cv2.VideoCapture(video_path)
#     original_fps = cap.get(cv2.CAP_PROP_FPS)
#     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
#     sampling_interval = int(round(original_fps / fps))

#     frames = []
#     picks = []
#     count = 0

#     while True:
#         ret, frame = cap.read()
#         if not ret:
#             break
#         if count % sampling_interval == 0:
#             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#             input_tensor = transform(frame_rgb).unsqueeze(0)
#             with torch.no_grad():
#                 feature_map = feature_extractor(input_tensor)  # shape: [1, C, H, W]
#                 feature = feature_map.mean(dim=[2, 3]).squeeze(0) 
#                 frames.append(feature.numpy())
#                 picks.append(count)
#         count += 1

#     cap.release()
#     features = np.vstack(frames).astype(np.float32)
#     picks = np.array(picks)
#     n_frames = count
#     user_summary = np.zeros((1, len(picks)))  # Dummy summary

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# # Stack the list of 1D arrays into a 2D array: [seq_len, feature_dim]
# frames_np = np.stack(frames)  # shape: [seq_len, feature_dim]

# # Convert to tensor and add batch dimension: [1, seq_len, feature_dim]
# frames_tensor = torch.tensor(frames_np, dtype=torch.float32).unsqueeze(0).to(device)

In [None]:
# model = PGL_SUM(input_size=1024, output_size=1024, num_segments=4, heads=8,
#                                 fusion="add", pos_enc="absolute")

In [None]:
# with torch.no_grad():
#             scores, _ = model(frames_tensor)  # [1, seq_len]
#             scores = scores.squeeze(0).cpu().numpy().tolist()
#             summary = generate_summary([sb], [scores], [n_frames], [positions])[0]
#             f_score = evaluate_summary(summary, user_summary, eval_method)
#             video_fscores.append(f_score)

In [None]:
#     with h5py.File(output_path, 'w') as hdf:
#         hdf.create_dataset('features', data=features)
#         hdf.create_dataset('picks', data=picks)
#         hdf.create_dataset('n_frame', data=n_frames)
#         hdf.create_dataset('user_summary', data=user_summary)
#         hdf.create_dataset('video_name', data=video_name.encode())

#     print(f'Saved: {output_path}')

# print("✅ Done")