In [1]:
# Cell 1
import torch
from torchvision import transforms
from PIL import Image
import numpy as np
from model import DSN
import torch.nn as nn
import cv2
import time
import os


In [2]:
# Function to extract frames at a specified frame rate and append paths to a list
def extract_frames(video_path, output_folder, frame_rate=2):
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print("Error: Could not open video file.")
        return

    frame_width = int(cap.get(3))  # Get the width of the frames
    frame_height = int(cap.get(4))  # Get the height of the frames

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # You can change the codec as needed
    output_path = os.path.join(output_folder, "output_video.mp4")
    out = cv2.VideoWriter(output_path, fourcc, frame_rate, (frame_width, frame_height))

    start_time = time.time()
    frame_count = 0
    frames = []  # List to store frame paths

    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            break

        elapsed_time = time.time() - start_time
        if elapsed_time >= 1.0 / frame_rate:
            out.write(frame)
            frame_count += 1
            start_time = time.time()

            # Save the frame as an image file
            frame_filename = f"frame_{frame_count:04d}.png"
            frame_path = os.path.join(output_folder, frame_filename)
            cv2.imwrite(frame_path, frame)
            frames.append(frame_path)

    cap.release()
    out.release()
    cv2.destroyAllWindows()

    print(f"Frames extracted: {frame_count}")
    print(f"Frames per second: {frame_rate}")
    print(f"Output video saved to: {output_path}")

    return frames


# Example usage
video_path = "./video/IronMan.mp4"
output_folder = "./frames"
frames = extract_frames(video_path, output_folder, frame_rate=2)

# Now 'extracted_frame_paths' contains a list of file paths for the extracted frames
print("Extracted frame paths:", frames)


Frames extracted: 147
Frames per second: 2
Output video saved to: ./frames\output_video.mp4
Extracted frame paths: ['./frames\\frame_0001.png', './frames\\frame_0002.png', './frames\\frame_0003.png', './frames\\frame_0004.png', './frames\\frame_0005.png', './frames\\frame_0006.png', './frames\\frame_0007.png', './frames\\frame_0008.png', './frames\\frame_0009.png', './frames\\frame_0010.png', './frames\\frame_0011.png', './frames\\frame_0012.png', './frames\\frame_0013.png', './frames\\frame_0014.png', './frames\\frame_0015.png', './frames\\frame_0016.png', './frames\\frame_0017.png', './frames\\frame_0018.png', './frames\\frame_0019.png', './frames\\frame_0020.png', './frames\\frame_0021.png', './frames\\frame_0022.png', './frames\\frame_0023.png', './frames\\frame_0024.png', './frames\\frame_0025.png', './frames\\frame_0026.png', './frames\\frame_0027.png', './frames\\frame_0028.png', './frames\\frame_0029.png', './frames\\frame_0030.png', './frames\\frame_0031.png', './frames\\frame

In [3]:
# Cell 2
def get_features(frames, gpu=True, batch_size=1):
    # Load pre-trained GoogLeNet model
    googlenet = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', weights='GoogLeNet_Weights.DEFAULT')

    # Remove the classification layer (last layer) to obtain features
    googlenet = torch.nn.Sequential(*(list(googlenet.children())[:-1]))

    # Set the model to evaluation mode
    googlenet.eval()

    # Initialize a list to store the features
    features = []

    # Image preprocessing pipeline
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Iterate through frames
    for frame_path in frames:
        # Load and preprocess the frame
        input_image = Image.open(frame_path)
        input_tensor = preprocess(input_image)
        input_batch = input_tensor.unsqueeze(0)  # Add batch dimension

        # Move the input and model to GPU if available
        if gpu:
            input_batch = input_batch.to('cuda')
            googlenet.to('cuda')

        # Perform feature extraction
        with torch.no_grad():
            output = googlenet(input_batch)

        # Append the features to the list
        features.append(output.squeeze().cpu().numpy())

    # Convert the list of features to a NumPy array
    features = np.array(features)

    return features.astype(np.float32)



In [4]:
# Cell 3
def _get_probs(features, gpu=True, mode=0):
    model_cache_key = "keyframes_rl_model_cache_" + str(mode)

    if mode == 1:
        model_path = "pretrained_model/model_1.pth.tar"
    else:
        model_path = "pretrained_model/model_0.pth.tar"
    model = DSN(in_dim=1024, hid_dim=256, num_layers=1, cell="lstm")
    if gpu:
        checkpoint = torch.load(model_path)
    else:
        checkpoint = torch.load(model_path, map_location='cpu')
    model.load_state_dict(checkpoint)
    if gpu:
        model = nn.DataParallel(model).cuda()
    model.eval()

    seq = torch.from_numpy(features).unsqueeze(0)
    if gpu: seq = seq.cuda()
    probs = model(seq)
    probs = probs.data.cpu().squeeze().numpy()
    return probs




In [5]:
print(_get_features(frames))
print(_get_probs(_get_features(frames)))

features = _get_features(frames)
print(features.shape)
print(features[0].shape)
print(_get_probs(features).shape)

Using cache found in C:\Users\Reuben/.cache\torch\hub\pytorch_vision_v0.10.0


[[5.0952245e-04 1.8711932e-02 7.0364801e-03 ... 5.4625672e-04
  2.8798267e-01 1.9751182e-01]
 [1.0839297e-02 1.0916159e-02 6.6831982e-04 ... 4.6322305e-02
  4.0471101e-01 1.5894957e-01]
 [7.7791312e-03 5.4033078e-02 7.6056413e-02 ... 1.7353630e-01
  1.2400125e-01 1.4495048e-01]
 ...
 [2.8588656e-01 2.4446552e-01 6.7060399e-01 ... 1.4680484e-01
  4.4968671e-01 9.6843272e-02]
 [2.2305962e-01 2.5735629e-01 7.4045902e-01 ... 1.6381127e-01
  4.6176362e-01 5.7618324e-02]
 [6.8690598e-02 6.9386996e-02 0.0000000e+00 ... 6.2626995e-02
  3.4741578e-01 2.0790750e-02]]


Using cache found in C:\Users\Reuben/.cache\torch\hub\pytorch_vision_v0.10.0


[0.91974765 0.9454341  0.94769996 0.9487048  0.95064247 0.95125735
 0.9565242  0.95543385 0.96018016 0.9623291  0.95793533 0.96072626
 0.96209913 0.9584297  0.95706403 0.95564145 0.96358347 0.96281374
 0.9595215  0.96077126 0.9604719  0.96107    0.9631843  0.9648292
 0.9646958  0.96339834 0.95486397 0.9596439  0.9600251  0.94932085
 0.96235466 0.9643711  0.96237177 0.95918226 0.9627097  0.96227163
 0.9593105  0.96155834 0.9647484  0.964222   0.9642918  0.9636387
 0.9598568  0.9552381  0.95258725 0.95290464 0.9573713  0.95899355
 0.9637703  0.96517587 0.9582592  0.9601232  0.96382606 0.9655748
 0.9603574  0.954497   0.96217    0.9609844  0.9618536  0.96171314
 0.95540655 0.9560038  0.95973194 0.956878   0.95507306 0.95795316
 0.95752627 0.9582264  0.95794547 0.9604366  0.9658313  0.96419054
 0.9691056  0.97104025 0.9584021  0.96912795 0.970009   0.9699702
 0.96600413 0.9680854  0.96873647 0.9695687  0.9630137  0.9605995
 0.9634725  0.9625286  0.96550447 0.9658135  0.9610188  0.96480966


Using cache found in C:\Users\Reuben/.cache\torch\hub\pytorch_vision_v0.10.0


(147, 1024)
(1024,)
(147,)
