In [37]:
%pip install git+https://github.com/matthias-k/DeepGaze.git
%pip install ego4d

import matplotlib.pyplot as plt
import numpy as np
import scipy
import torch
import cv2
from scipy.ndimage import zoom
from scipy.special import logsumexp

import deepgaze_pytorch

from google.colab import drive

import requests
import os

Collecting git+https://github.com/matthias-k/DeepGaze.git
  Cloning https://github.com/matthias-k/DeepGaze.git to /tmp/pip-req-build-d40abbec
  Running command git clone --filter=blob:none --quiet https://github.com/matthias-k/DeepGaze.git /tmp/pip-req-build-d40abbec
  Resolved https://github.com/matthias-k/DeepGaze.git to commit 874f12e1ee519860f49860638cf7f6375956d45a
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [38]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) instead of failing in `.to(DEVICE)`.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Build DeepGaze III with pretrained=True and move it to the selected device.
model = deepgaze_pytorch.DeepGazeIII(pretrained=True).to(DEVICE)
# The model is only used for prediction below, so put it in evaluation mode.
model.eval()

Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.6.0


DeepGazeIII(
  (features): FeatureExtractor(
    (features): RGBDenseNet201(
      (0): Normalizer()
      (1): DenseNet(
        (features): Sequential(
          (conv0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
          (norm0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu0): ReLU(inplace=True)
          (pool0): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
          (denseblock1): _DenseBlock(
            (denselayer1): _DenseLayer(
              (norm1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (relu1): ReLU(inplace=True)
              (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
              (norm2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (relu2): ReLU(inplace=True)
              (conv2): Conv2d(128, 32, kernel_size=(3, 3), strid

In [44]:
drive.mount('/content/drive')
# make sure you have this mp4 downloaded to the correct place in your drive
video_path = '/content/drive/MyDrive/8301a3fc-aac5-466d-bb02-d0ac7a81ccc6.mp4'

# Center-bias template published with the DeepGaze release; it is fetched once
# and then reused from the working directory.
CENTERBIAS_URL = 'https://github.com/matthias-k/DeepGaze/releases/download/v1.0.0/centerbias_mit1003.npy'
CENTERBIAS_PATH = 'centerbias_mit1003.npy'

if not os.path.exists(CENTERBIAS_PATH):
    # Download from Python rather than via a shell command so that an HTTP
    # error is raised right here instead of surfacing later inside np.load.
    response = requests.get(CENTERBIAS_URL, timeout=60)
    response.raise_for_status()
    # Write under a temporary name and rename afterwards: an interrupted write
    # must never leave a truncated file that the exists() check would accept.
    partial_path = CENTERBIAS_PATH + '.part'
    with open(partial_path, 'wb') as fh:
        fh.write(response.content)
    os.replace(partial_path, CENTERBIAS_PATH)
centerbias_template = np.load(CENTERBIAS_PATH)

def get_resized_centerbias(h, w):
    """Return the center-bias template rescaled to ``h`` rows by ``w`` columns.

    The module-level ``centerbias_template`` is resampled with order-0
    interpolation and then shifted so that the log-sum-exp over all of its
    entries is zero.
    """
    zoom_factors = (
        h / centerbias_template.shape[0],
        w / centerbias_template.shape[1],
    )
    scaled = zoom(centerbias_template, zoom_factors, order=0, mode='nearest')
    # Subtract in place so the array keeps the dtype produced by zoom().
    scaled -= logsumexp(scaled)
    return scaled

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# Open the recording; stop the cell immediately if OpenCV cannot open it.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    print("Error: Could not open video.")
    raise SystemExit

# Query the stream metadata; the integer-valued properties are cast to int.
width, height, frame_count = (
    int(cap.get(prop))
    for prop in (
        cv2.CAP_PROP_FRAME_WIDTH,
        cv2.CAP_PROP_FRAME_HEIGHT,
        cv2.CAP_PROP_FRAME_COUNT,
    )
)
fps = cap.get(cv2.CAP_PROP_FPS)

print(f"Video resolution: {width}x{height}, FPS: {fps}, Frames: {frame_count}")

Video resolution: 1440x1080, FPS: 30.0, Frames: 20693


In [42]:
sample_interval = 200  # keep every 200th frame
max_frames = 5000      # only the first 5000 frames of the video are scanned

sampled_frames = []
frame_indices = []

for frame_idx in range(max_frames):
    ret, frame = cap.read()
    if not ret:
        # No frame was returned (end of stream, decode failure, or the capture
        # was already released by a previous run of this cell). Stop here so
        # that no None entries end up in sampled_frames.
        break
    if frame_idx % sample_interval == 0:
        sampled_frames.append(frame)
        frame_indices.append(frame_idx)

cap.release()
if not sampled_frames:
    print("Warning: no frames were read; re-open the video capture before re-running this cell.")
print("Number of sampled frames:", len(sampled_frames))

Number of sampled frames: 25


In [67]:
from skimage.feature import peak_local_max
def get_top_n_fixations(saliency_map, n=4, min_distance=10):
    """Pick ``n`` fixation locations from the local peaks of a 2-D saliency map.

    Parameters
    ----------
    saliency_map : 2-D array
        Map whose local maxima are used as fixation candidates.
    n : int
        Number of fixations to return.
    min_distance : int
        Forwarded to ``peak_local_max`` as the minimum separation of peaks.

    Returns
    -------
    (fixation_x, fixation_y) : tuple of 1-D integer arrays
        Column (x) and row (y) coordinates. For ``n > 0`` both arrays always
        have length ``n``: if fewer than ``n`` peaks are found, the last found
        peak is repeated; if no peak is found at all (for example on a
        constant map), the location of the global maximum is used.
    """
    # Peaks come back as (row, col) pairs, one per row of the array.
    coordinates = peak_local_max(saliency_map, min_distance=min_distance, num_peaks=n)
    coordinates = np.asarray(coordinates).reshape(-1, 2)

    if len(coordinates) == 0:
        # No local peak detected: fall back to the global maximum.
        best = np.unravel_index(np.argmax(saliency_map), np.shape(saliency_map))
        coordinates = np.array([best])

    missing = n - len(coordinates)
    if missing > 0:
        # Callers index the result with a fixed-length history, so pad up to n
        # by repeating the last available peak.
        coordinates = np.pad(coordinates, ((0, missing), (0, 0)), mode='edge')
    coordinates = coordinates[:n]

    # Convert row/col to x,y where x corresponds to column and y to row:
    fixation_x = coordinates[:, 1]  # columns as x
    fixation_y = coordinates[:, 0]  # rows as y
    return fixation_x, fixation_y

In [73]:
# Seed the fixation history with four hand-picked points (centre, upper-left
# quarter, lower-right quarter, centre) expressed in pixel coordinates.
curr_fixations_x = np.array([width // 2, width // 4, width * 3 // 4, width // 2])
curr_fixations_y = np.array([height // 2, height // 4, height * 3 // 4, height // 2])

# Center-bias tensors keyed by (rows, cols): frames that share a size reuse the
# same tensor instead of re-running the resize for every frame.
centerbias_cache = {}

for idx, frame in enumerate(sampled_frames):
    if frame is None:
        # A failed cap.read() yields None; cv2.cvtColor would raise on it.
        continue

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Local names so the module-level width/height are not rebound here.
    frame_h, frame_w = frame_rgb.shape[:2]

    # Create image tensor: shape [1, 3, H, W]
    frame_chw = frame_rgb.astype(np.float32).transpose(2, 0, 1)
    frame_tensor = torch.tensor(frame_chw)[None].to(DEVICE)

    # Center bias resized to the frame, with an added batch dimension: [1, H, W]
    size_key = (frame_h, frame_w)
    if size_key not in centerbias_cache:
        cb = get_resized_centerbias(frame_h, frame_w)
        centerbias_cache[size_key] = torch.tensor(cb)[None].to(DEVICE)
    cb_tensor = centerbias_cache[size_key]

    # Select the history entries the model consumes and add a batch dimension.
    # Indexing the array with [None] avoids building a tensor from a Python
    # list of ndarrays, which is a slow path in torch.tensor.
    x_hist_tensor = torch.tensor(curr_fixations_x[model.included_fixations][None]).to(DEVICE)
    y_hist_tensor = torch.tensor(curr_fixations_y[model.included_fixations][None]).to(DEVICE)

    # Get the log density prediction from the model
    with torch.no_grad():
        log_density_prediction = model(frame_tensor, cb_tensor, x_hist_tensor, y_hist_tensor)

    # Convert prediction to numpy array
    log_density_map = log_density_prediction.cpu().numpy()[0, 0]

    # The peaks of this frame's prediction become the history for the next
    # frame. Keep the previous history if the wrong number of peaks came back,
    # so the fixed-length indexing above cannot fail on the next iteration.
    next_x, next_y = get_top_n_fixations(log_density_map, n=len(curr_fixations_x))
    if len(next_x) == len(curr_fixations_x):
        curr_fixations_x, curr_fixations_y = next_x, next_y

    # Visualize the results
    fig, (ax_frame, ax_saliency) = plt.subplots(1, 2, figsize=(8, 4))

    ax_frame.imshow(frame_rgb)
    ax_frame.set_title(f"Frame {frame_indices[idx]}")  # using original frame index if available
    ax_frame.axis('off')

    ax_saliency.imshow(log_density_map, cmap='jet')
    ax_saliency.set_title("Log Density Saliency")
    ax_saliency.axis('off')

    plt.show()
    # Release the figure explicitly so figures do not accumulate over the loop.
    plt.close(fig)

Output hidden; open in https://colab.research.google.com to view.