# Classifying YouTube Videos for Humpback Whale Encounters - Keras CNN-RNN

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import pickle
import glob
import cv2
import os
import time

from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns

In [4]:
#ngc workspace path (where we keep our data)
workspace_path = '/mount/data'

# Start WandB Session

In [13]:
import wandb

#start wandb session for metric logging
wandb.login() 

wandb.init(project="whale-classification-inception")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmicheller[0m ([33mepg[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Set GPU Context

In [14]:
print("Num GPUs available: ", len(tf.config.list_physical_devices('GPU'))) #1 if we select GPU mode in Colab Notebook, 0 if running on local machine

Num GPUs available:  2


In [15]:
# gpus = tf.config.list_physical_devices('GPU')
gpus = tf.config.list_logical_devices('GPU')

for gpu in gpus:
    print(gpu.name)

/device:GPU:0
/device:GPU:1


2022-07-11 20:11:20.880017: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-11 20:11:46.167770: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14649 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:06:00.0, compute capability: 7.0
2022-07-11 20:11:46.425047: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 14649 MB memory:  -> device: 1, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:0b:00.0, compute capability: 7.0


# Inception V3 (CNN-RNN) 

## Hyperparameters

In [6]:
IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 10

MAX_NUM_FRAMES = 461
NUM_FEATURES = 2048

461 frames of size 224 x 224 with RGB color channels

# Load Frames + Extract Features with CNN

In [7]:
from feature_extraction import load_frames, prepare_all_videos
from cnn import CNN

In [8]:
ConvNet = CNN(IMG_SIZE)
feature_extractor = ConvNet.InceptionV3()
feature_extractor

<keras.engine.functional.Functional at 0x7fd548e907c0>

In [9]:
#load dataset in
data = pd.read_csv(workspace_path + '/downloaded_videos.csv')
y = data.pop('relevant')
X = data

In [24]:
#begin keeping track of time to extract frames
start = time.time()

#use single GPU to extract frames
with tf.device('/device:GPU:0'):
    (frame_features, frame_masks), labels = prepare_all_videos(X[0:5], y[0:5], MAX_NUM_FRAMES, NUM_FEATURES, feature_extractor)
    
stop = time.time()

print(f"Time to extract frames with single GPU: {stop - start}s")

Time to extract frames with single GPU: 369.6837012767792s


In [25]:
start = time.time()

#extract frames without gpu
(frame_features, frame_masks), labels = prepare_all_videos(X[0:5], y[0:5], MAX_NUM_FRAMES, NUM_FEATURES, feature_extractor)
    
stop = time.time()
print(f"Time to extract frames without GPU: {stop - start}s")

Time to extract frames without GPU: 354.47875595092773s


In [10]:
#begin keeping track of time to extract ALL frames using a single GPU
start = time.time()

with tf.device('/device:GPU:0'):
    (frame_features, frame_masks), labels = prepare_all_videos(X, y, MAX_NUM_FRAMES, NUM_FEATURES, feature_extractor)
    
stop = time.time()

print(f"Time to extract frames with single GPU: {stop - start}s")

video_0000.mp4


2022-07-09 03:24:34.155966: I tensorflow/stream_executor/cuda/cuda_dnn.cc:379] Loaded cuDNN version 8400


video_0134.mp4
video_0248.mp4
video_0357.mp4
Time to extract frames with single GPU: 46774.75740671158s


In [14]:
#took 12 hours to extract features from frames with the GPU context set above
(stop-start)/60/60

12.992988168530994

In [16]:
print('Frame features shape: ', frame_features.shape)
print('Frame masks shape: ', frame_masks.shape)
print('Number of Labels: ', len(labels))

Frame features shape:  (364, 461, 2048)
Frame masks shape:  (364, 461)
Number of Labels:  364


# Training RNN Sequence Model

In [48]:
from rnn import RNN

In [49]:
rnn_model = RNN()

In [50]:
#training RNN with 5 fold cross validation

skfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
fold = 0

test_acc_per_fold       = dict()
test_loss_per_fold      = dict()
fold_train_test_indices = dict() #{'fold_model_name': [fold_train_index_list, fold_test_index_list]}

for train_index, test_index in skfold.split(X, y):
    
    print(f'Fold {fold} \n')
    
    #index data accordingly
    train_features, train_masks, train_labels = frame_features[train_index], frame_masks[train_index], np.array(labels)[train_index]
    test_features, test_masks, test_labels = frame_features[test_index], frame_masks[test_index], np.array(labels)[test_index]
    
    #reshape label arrays as horizontal arrays
    train_labels = np.reshape(train_labels, (train_labels.shape[0], 1))
    test_labels = np.reshape(test_labels, (test_labels.shape[0], 1))
    
    #create and compile model
    rnn_model.build_model(MAX_NUM_FRAMES, NUM_FEATURES)
    rnn_model.compile_model(loss="sparse_categorical_crossentropy", optimizer="adam", metrics="accuracy")
    
    #train and evaluate the model
    rnn_model.fit(train_features, train_masks, train_labels, f'rnn_model_{fold}')
    loss, accuracy = rnn_model.evaluate(test_features, test_masks, test_labels)
    
    #store the test accuracies and loss for each fold model
    test_acc_per_fold[fold]       = accuracy
    test_loss_per_fold[fold]      = loss
    fold_train_test_indices[fold] = [train_index, test_index]
    
    fold += 1
    

Fold 0 

Fold 1 

Fold 2 

Fold 3 

Fold 4 



# Configuring Feature Extraction with Batches of Frames

In [20]:
def load_frames(video_title, max_frames):
    '''read all frames in for 1 video from workspace 'frames' directory'''

    #get number associated with clip to retrieve respective frames
    clip_number = video_title.split('_')[1].split('.')[0]

    #create list to store each frame
    frames = []

    for i in range(max_frames):

        #read in .jpg file as array for video clip 
        img = cv2.imread(workspace_path + f'/frames/clip_{clip_number}_frame_{i}.jpg')
        frames.append(img)

    #put list of frames in numpy format
    frames = np.array(frames)

    #return frames with an extra batch dimension
    return frames[None, ...]

In [17]:
#load dataset in
data = pd.read_csv(workspace_path + '/downloaded_videos.csv')
y = data.pop('relevant')
X = data

In [30]:
print('Batch of frames for video 0000 shape: ', batch.shape)
j=0

#this is just one image
batch[None, j, :].shape

Batch of frames for video 0000 shape:  (461, 224, 224, 3)


(1, 224, 224, 3)

In [25]:
max_frames = 461
num_features = 2048

num_samples = len(X)
videos = list(X['renamed_title'].values)

# `frame masks` and `frame_features are what we will feed to our sequence model
frame_masks = np.zeros(shape=(num_samples, max_frames), dtype="bool")
frame_features = np.zeros(shape=(num_samples, max_frames, num_features) , dtype="float32")

for index, video_title in enumerate(videos[0:1]):
    print(index, video_title)


    #Gather all the video's frames and add a batch dimension (frames has shape frames[None, ...])
    frames = load_frames(video_title, max_frames)
    print(frames.shape)

    #initialize placeholders to store the masks and features of the current video
    temp_frame_mask = np.zeros(shape=(1, max_frames), dtype="bool")  
    temp_frame_features = np.zeros(shape=(1, max_frames, num_features), dtype="float32")

    for i, batch in enumerate(frames):
        print('i:', i, 'batch.shape: ', batch.shape)
        #extract features from the frames of the current video
        for j in range(max_frames):
            print(f'j: {j}')
            curr_frame = batch[None, j, :]
            temp_frame_features[i, j, :] = feature_extractor.predict(curr_frame) #get frame features from current (single) frame

#         #create mask for current video 
#         #1 = not masked, 0 = masked
#         temp_frame_mask[i, :max_frames] = 1 

#     frame_features[index, ] = temp_frame_features.squeeze()
#     frame_masks[index, ] = temp_frame_mask.squeeze()



0 video_0000.mp4
(1, 461, 224, 224, 3)
i: 0 batch.shape:  (461, 224, 224, 3)
j: 0
j: 1
j: 2
j: 3
j: 4
j: 5
j: 6
j: 7
j: 8
j: 9
j: 10
j: 11
j: 12
j: 13
j: 14
j: 15
j: 16
j: 17
j: 18
j: 19
j: 20
j: 21
j: 22
j: 23
j: 24
j: 25
j: 26
j: 27
j: 28
j: 29
j: 30
j: 31
j: 32
j: 33
j: 34
j: 35
j: 36
j: 37
j: 38
j: 39
j: 40
j: 41
j: 42
j: 43
j: 44
j: 45
j: 46
j: 47
j: 48
j: 49
j: 50
j: 51
j: 52
j: 53
j: 54
j: 55
j: 56
j: 57
j: 58
j: 59
j: 60
j: 61
j: 62
j: 63
j: 64
j: 65
j: 66
j: 67
j: 68
j: 69
j: 70
j: 71
j: 72
j: 73
j: 74
j: 75
j: 76
j: 77
j: 78
j: 79
j: 80
j: 81
j: 82
j: 83
j: 84
j: 85
j: 86
j: 87
j: 88
j: 89
j: 90
j: 91
j: 92
j: 93
j: 94
j: 95
j: 96
j: 97
j: 98
j: 99
j: 100
j: 101
j: 102
j: 103
j: 104
j: 105
j: 106
j: 107
j: 108
j: 109
j: 110
j: 111
j: 112
j: 113
j: 114
j: 115
j: 116
j: 117
j: 118
j: 119
j: 120
j: 121
j: 122
j: 123
j: 124
j: 125
j: 126
j: 127
j: 128
j: 129
j: 130
j: 131
j: 132
j: 133
j: 134
j: 135
j: 136
j: 137
j: 138
j: 139
j: 140
j: 141
j: 142
j: 143
j: 144
j: 145
j: 146
j: 1