In [26]:
import torch
import torch.nn as nn
import torchvision
import torchvision.models as pretrained
from torchvision import transforms

import requests
from PIL import Image
import pandas as pd
import numpy as np
import imageio
import cv2
import os
from torch.utils.data import Dataset, DataLoader

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cpu


In [4]:
# Side length (pixels) of the square frames produced by load_video's default resize.
IMG_SIZE = 64
# Training hyper-parameters (not referenced by the cells visible in this notebook section).
BATCH_SIZE = 8
EPOCHS = 10

# Maximum number of frames kept per video; shorter videos are zero-padded.
MAX_SEQ_LENGTH = 20
# Length of one flattened frame: height x width x 3 colour channels.
NUM_FEATURES = IMG_SIZE * IMG_SIZE * 3

In [5]:
# Read the split manifests; each row names one video file and its class tag.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

# Show ten random rows as a sanity check (unseeded, so the rows vary between runs).
train_df.sample(10)

Total videos for training: 594
Total videos for testing: 224


Unnamed: 0,video_name,tag
55,v_CricketShot_g15_c07.avi,CricketShot
405,v_ShavingBeard_g15_c04.avi,ShavingBeard
267,v_Punch_g12_c03.avi,Punch
474,v_ShavingBeard_g25_c04.avi,ShavingBeard
271,v_Punch_g12_c07.avi,Punch
129,v_PlayingCello_g09_c06.avi,PlayingCello
494,v_TennisSwing_g10_c05.avi,TennisSwing
269,v_Punch_g12_c05.avi,Punch
8,v_CricketShot_g09_c02.avi,CricketShot
413,v_ShavingBeard_g16_c05.avi,ShavingBeard


In [6]:
# Distinct class tags found in the training manifest.
labels = train_df["tag"].unique()

In [9]:
def crop_center_square(frame):
    """Return the largest centred square region of an image array.

    Args:
        frame: Array whose first two axes are (height, width); any trailing
            axes (e.g. colour channels) are kept as they are.

    Returns:
        A slice of ``frame`` with equal height and width, taken around the
        centre of the longer side.
    """
    height, width = frame.shape[:2]
    side = min(height, width)
    top = height // 2 - side // 2
    left = width // 2 - side // 2
    return frame[top : top + side, left : left + side]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """Decode a video file into an array of square, resized frames.

    Args:
        path: Location of the video file handed to ``cv2.VideoCapture``.
        max_frames: Stop after this many frames. The default ``0`` never
            matches the frame count after an append, so the whole video is read.
        resize: Target size passed to ``cv2.resize`` for every frame.

    Returns:
        ``np.array`` stacking the processed frames in reading order; an empty
        array when no frame could be read.
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        ok, raw = capture.read()
        while ok:
            squared = crop_center_square(raw)
            scaled = cv2.resize(squared, resize)
            # Reverse the channel order (OpenCV decodes frames as BGR, so this yields RGB).
            collected.append(scaled[:, :, [2, 1, 0]])

            if len(collected) == max_frames:
                break
            ok, raw = capture.read()
    finally:
        # Always free the capture handle, even if frame processing raised.
        capture.release()
    return np.array(collected)

In [11]:
def prepare_all_videos(df, root_dir):
    """Load every video listed in ``df`` into fixed-size feature and mask arrays.

    Args:
        df: DataFrame with a ``video_name`` column (file names relative to
            ``root_dir``) and a ``tag`` column (one class label per video).
        root_dir: Directory that contains the video files.

    Returns:
        ``((frame_features, frame_masks), labels)`` where ``frame_features`` is
        a float32 array of shape ``(len(df), MAX_SEQ_LENGTH, NUM_FEATURES)``
        holding the flattened pixels of each kept frame (zeros where padded),
        ``frame_masks`` is a bool array of shape ``(len(df), MAX_SEQ_LENGTH)``
        that is True on real frames and False on padding, and ``labels`` is
        ``df["tag"].values``.
    """
    total = len(df)
    video_names = df["video_name"].values.tolist()
    labels = df["tag"].values

    # Zero-initialised outputs: untouched timesteps stay as padding / masked.
    frame_masks = np.zeros((total, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros((total, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for row, name in enumerate(video_names):
        clip = load_video(os.path.join(root_dir, name))

        # Keep at most MAX_SEQ_LENGTH leading frames of this video.
        kept = min(MAX_SEQ_LENGTH, clip.shape[0])
        if kept:
            # Flatten each kept frame to one feature vector per timestep.
            frame_features[row, :kept] = clip[:kept].reshape(kept, -1)
            frame_masks[row, :kept] = True  # True = real frame, False = padding

    return (frame_features, frame_masks), labels


# Build the padded feature/mask arrays and the label vectors for both splits;
# videos are read from the "train" and "test" directories respectively.
train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

# train_data is the (frame_features, frame_masks) pair returned by prepare_all_videos.
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

Frame features in train set: (594, 20, 12288)
Frame masks in train set: (594, 20)


In [None]:
# Convert the (features, masks) pair to float32 tensors.
# torch.as_tensor accepts NumPy arrays and tensors alike, so re-running this
# cell is harmless; torch.from_numpy raises TypeError once train_data already
# holds tensors from a previous run.
train_data = (
    torch.as_tensor(train_data[0], dtype=torch.float32),
    torch.as_tensor(train_data[1], dtype=torch.float32),
)

In [24]:
# Convert the (features, masks) pair to float32 tensors.
# torch.as_tensor accepts NumPy arrays and tensors alike, so re-running this
# cell is harmless; torch.from_numpy raises TypeError once test_data already
# holds tensors from a previous run.
test_data = (
    torch.as_tensor(test_data[0], dtype=torch.float32),
    torch.as_tensor(test_data[1], dtype=torch.float32),
)

In [31]:
# Instantiate torchvision's ResNet-50 with its pretrained weights.
# NOTE(review): the `pretrained=` flag is reported as deprecated in newer
# torchvision releases in favour of `weights=` — confirm against the installed version.
resnet50 = pretrained.resnet50(pretrained=True)

In [32]:
def get_embedding(module, input, output):
    """Hook callback that hands the layer's output back unchanged.

    Args:
        module: The layer the hook is attached to (unused).
        input: The inputs the layer received (unused).
        output: The value the layer produced.

    Returns:
        ``output`` itself, without modification.
    """
    return output

In [29]:
# Attach get_embedding as a forward hook on the network's `avgpool` layer.
# The returned handle is not stored (it is only displayed as the cell output),
# so the hook cannot be detached later through it.
resnet50.avgpool.register_forward_hook(get_embedding)

<torch.utils.hooks.RemovableHandle at 0x7fe379d76340>

In [33]:
# Image preprocessing pipeline: resize to 224x224, convert to a tensor, then
# normalise each channel with the ImageNet statistics used by torchvision's
# pretrained models.
t = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            # The third-channel std is 0.225; the previous 0.255 was a typo
            # that skewed the normalisation of that channel.
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[
                0.229, 0.224, 0.225]),
        ])

In [35]:
class VideoDataset(Dataset):
    """Map-style dataset over per-video frame features, masks and labels.

    Args:
        data: Pair ``(frame_features, frame_masks)``, each indexed by video
            along its first axis.
        labels: Sequence holding one class label per video.
    """

    def __init__(self, data, labels):
        super().__init__()
        self.data = data
        self.labels = labels
        # Placeholder for per-video CNN embeddings; nothing fills it yet.
        self.embeddings_data = []
        # self.process()

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.data[0])

    def __getitem__(self, index):
        """Return ``((frame_features, frame_mask), label)`` for one video."""
        return (self.data[0][index], self.data[1][index]), self.labels[index]

    # NOTE(review): embedding extraction is disabled. It feeds each video's
    # feature matrix straight into resnet50; presumably this is what produced
    # the "Expected 3D (unbatched) or 4D (batched) input to conv2d" error
    # recorded in this notebook — confirm before re-enabling.
    # def process(self):
    #     for i in range(len(self.data[0])):
    #         self.embeddings_data.append(resnet50(self.data[0][i].to(device)).cpu().detach().numpy())


In [36]:
# Wrap the training features/masks and their labels in a VideoDataset.
data = VideoDataset(train_data, train_labels)

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [20, 12288]

In [None]:
# Number of training videos: length of the frame-features container held by the dataset.
print(len(data.data[0]))