# Classifying YouTube Videos for Humpback Whale Encounters - Keras CNN-RNN

In [3]:
#enable IPython's autoreload extension; mode 2 reloads all imported modules
#before each cell is executed, so edits to local .py files are picked up
%load_ext autoreload
%autoreload 2

In [2]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import pickle
import glob
import cv2
import os

from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns

import wandb

In [4]:
#ngc workspace path (where we keep our data); the frame images are read
#from the 'frames' subdirectory of this path
workspace_path = '/mount/data'

In [None]:
#start wandb session for metric logging
#authenticate this session with Weights & Biases
wandb.login() 

#open a run in the "whale-classification-inception" project
wandb.init(project="whale-classification-inception")

# Inception V3 (CNN-RNN) 

## Hyperparameters

In [3]:
#presumably the side length (in pixels) of the square input frames; the frames
#read from the workspace are 224 x 224 — usage is not shown in this section
IMG_SIZE = 224
#presumably the training batch size and number of training epochs — TODO confirm where the model is trained
BATCH_SIZE = 64
EPOCHS = 10

#presumably the values passed to prepare_all_videos as `max_frames` (frames kept per video)
#and `num_features` (length of the feature vector per frame) — TODO confirm at the call site
MAX_NUM_FRAMES = 500
NUM_FEATURES = 2048

In [11]:
#read all frames in for 1 video (clip 0000) from workspace 'frames' directory
frames = []
for i in range(461):
    #print a progress marker every 100 frames
    if i % 100 == 0:
        print(i)

    #read in .jpg file as array for video clip 0000
    frame_path = workspace_path + f'/frames/clip_0000_frame_{i}.jpg'
    img = cv2.imread(frame_path)

    #cv2.imread does not raise on a missing/unreadable file, it returns None;
    #fail loudly here instead of silently building an object array of the wrong shape
    if img is None:
        raise FileNotFoundError(f'could not read frame: {frame_path}')
    frames.append(img)

#stack the per-frame arrays into one array; the last expression shows its shape
frames = np.array(frames)
frames.shape

(461, 224, 224, 3)

461 frames of size 224 x 224 with 3 color channels (note: `cv2.imread` returns the channels in BGR order, not RGB)

In [None]:
def load_frames(video_title, max_frames):
    '''Read up to `max_frames` frames of one video from the workspace 'frames' directory.

    Frames are read from
    `<workspace_path>/frames/clip_<clip_number>_frame_<i>.jpg` for i = 0, 1, 2, ...
    where <clip_number> is the third '_'-separated token of `video_title`
    with everything after the first '.' removed.

    Reading stops at the first frame file that cannot be read, so a video with
    fewer than `max_frames` frames yields only the frames that exist (instead
    of a ragged array containing None entries).

    Args:
        video_title: title of the video; must contain at least three
            '_'-separated tokens, the third one holding the clip number.
        max_frames: maximum number of frames to read.

    Returns:
        numpy array of the frames with an extra leading batch dimension of
        size 1, i.e. shape (1, num_frames_read, ...frame shape...). The frames
        are returned exactly as cv2.imread produces them (BGR channel order).

    Raises:
        FileNotFoundError: if `max_frames` > 0 and not even the first frame
            of the clip could be read.
    '''

    #get number associated with clip to retrieve respective frames
    clip_number = video_title.split('_')[2].split('.')[0]

    #create list to store each frame
    frames = []

    for i in range(max_frames):

        #read in .jpg file as array for video clip
        img = cv2.imread(workspace_path + f'/frames/clip_{clip_number}_frame_{i}.jpg')

        #cv2.imread returns None (it does not raise) when the file is missing or
        #unreadable; treat that as the end of the video
        if img is None:
            break
        frames.append(img)

    #a clip without a single readable frame is almost certainly a wrong title/path
    if not frames and max_frames > 0:
        raise FileNotFoundError(
            f'no frames found for clip {clip_number} in {workspace_path}/frames'
        )

    #put list of frames in numpy format and return it with an extra batch dimension
    return np.array(frames)[None, ...]

def prepare_all_videos(X, y, max_frames, num_features, feature_extractor):
    '''Extract per-frame features and frame masks for every video in `X`.

    Each video's frames are loaded with `load_frames` and passed through
    `feature_extractor.predict` in a single batched call. Videos with fewer
    than `max_frames` frames are zero-padded and the padded positions are
    marked as masked.

    Args:
        X: table with a 'renamed_title' column holding the video titles
            understood by `load_frames`.
        y: table with a 'relevant' column holding the labels.
        max_frames: number of frame slots per video in the outputs.
        num_features: length of the feature vector produced per frame.
        feature_extractor: object whose `predict` maps a batch of frames
            (num_frames, ...) to num_frames feature vectors of length
            `num_features`.

    Returns:
        ((frame_features, frame_masks), labels) where
        frame_features is float32 of shape (len(X), max_frames, num_features),
        frame_masks is bool of shape (len(X), max_frames) with True = real
        frame (not masked) and False = padding (masked), and
        labels is `y['relevant']` cast to int.
    '''

    num_samples = len(X)
    videos = list(X['renamed_title'].values)

    # `frame_masks` and `frame_features` are what we will feed to our sequence model
    frame_masks = np.zeros(shape=(num_samples, max_frames), dtype="bool")
    frame_features = np.zeros(shape=(num_samples, max_frames, num_features), dtype="float32")

    #for each video
    for index, video_title in enumerate(videos):

        #load_frames returns the frames with a leading batch dimension of size 1; drop it
        video_frames = load_frames(video_title, max_frames)[0]

        #a video may hold fewer than max_frames frames; never index past its end
        length = min(max_frames, video_frames.shape[0])
        if length == 0:
            #nothing to extract: features stay zero and every position stays masked
            continue

        #extract the features of all frames in one batched predict call instead of
        #one call per frame; the reshape tolerates extra singleton dimensions
        features = feature_extractor.predict(video_frames[:length])
        frame_features[index, :length, :] = np.asarray(features).reshape(length, num_features)

        # 1 = not masked, 0 = masked (only the real frames are unmasked)
        frame_masks[index, :length] = True

    labels = y['relevant'].astype(int)
    return (frame_features, frame_masks), labels