# Classifying YouTube Videos for Humpback Whale Encounters - Keras CNN-RNN

In [1]:
# Reload imported modules before each cell runs so that edits to the local
# helper modules (feature_extraction, cnn, rnn) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import pickle
import glob
import cv2
import os
import time

from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns

In [3]:
# Root directory of the mounted NGC workspace; the dataset CSV is read from here.
workspace_path = "/mount/data"

# Start WandB Session

In [62]:
import wandb

# Authenticate against Weights & Biases; the recorded run prompted for an API key
# interactively, so this cell cannot run unattended unless credentials are
# already available to wandb.
#start wandb session for metric logging
wandb.login() 

# Open a run in the "whale-classification-inception" project.
# NOTE(review): the returned run object is not kept; presumably the training
# code logs through the global wandb run — confirm in rnn.py.
wandb.init(project="whale-classification-inception")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize


[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmicheller[0m ([33mepg[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Set GPU Context

In [4]:
# Count the physical GPUs TensorFlow can see (0 means no GPU is visible to TensorFlow).
physical_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs available: ", len(physical_gpus))

Num GPUs available:  2


In [5]:
# Logical GPU devices; their names are the strings accepted by tf.device(...).
gpus = tf.config.list_logical_devices('GPU')

for logical_gpu in gpus:
    print(logical_gpu.name)

/device:GPU:0
/device:GPU:1


2022-07-09 03:23:51.034265: I tensorflow/core/platform/cpu_feature_guard.cc:152] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-09 03:23:52.819185: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14649 MB memory:  -> device: 0, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:07:00.0, compute capability: 7.0
2022-07-09 03:23:52.832561: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 14649 MB memory:  -> device: 1, name: Tesla V100-SXM2-16GB-N, pci bus id: 0000:0a:00.0, compute capability: 7.0


# Inception V3 (CNN-RNN) 

## Hyperparameters

In [6]:
# Side length (in pixels) handed to the CNN wrapper.
IMG_SIZE = 224

# Per-video sequence dimensions: number of frame slots per video and the
# length of the feature vector stored for each frame.
MAX_NUM_FRAMES = 461
NUM_FEATURES = 2048

# Training settings.
# NOTE(review): not referenced in this notebook's visible cells; presumably
# consumed by the training code — confirm.
BATCH_SIZE = 64
EPOCHS = 10

Each video is represented by up to 461 frames of size 224 x 224 with RGB color channels; each frame is encoded as a 2048-dimensional feature vector.

# Load Frames + Extract Features with CNN

In [7]:
from feature_extraction import load_frames, prepare_all_videos
from cnn import CNN

In [8]:
# Build the frame-level feature extractor from the project's CNN helper.
# NOTE(review): presumably an ImageNet-pretrained InceptionV3 without its
# classification head — confirm in cnn.py.
ConvNet = CNN(IMG_SIZE)
feature_extractor = ConvNet.InceptionV3()
# Display the object to confirm a Keras functional model was created.
feature_extractor

<keras.engine.functional.Functional at 0x7fd548e907c0>

In [9]:
# Read the table of downloaded videos and separate the target column from it.
videos_csv = f"{workspace_path}/downloaded_videos.csv"
data = pd.read_csv(videos_csv)

# pop() removes 'relevant' from the frame in place, so X (the same object as
# `data`) keeps only the remaining columns while y holds the labels.
X = data
y = X.pop('relevant')

In [24]:
# Benchmark: time feature extraction for the first 5 videos on a single GPU.
#
# The results go into benchmark-only names on purpose. This cell previously
# assigned frame_features / frame_masks / labels / start / stop, so running it
# after the full extraction replaced the full-dataset features (and their
# timing) with a 5-video subset.
bench_start = time.perf_counter()  # monotonic clock, intended for measuring intervals

#use single GPU to extract frames
with tf.device('/device:GPU:0'):
    (bench_features, bench_masks), bench_labels = prepare_all_videos(
        X[0:5], y[0:5], MAX_NUM_FRAMES, NUM_FEATURES, feature_extractor
    )

bench_elapsed = time.perf_counter() - bench_start

print(f"Time to extract frames with single GPU: {bench_elapsed}s")

Time to extract frames with single GPU: 369.6837012767792s


In [25]:
# Benchmark: time feature extraction for the first 5 videos WITHOUT a GPU.
#
# TensorFlow places ops on GPU:0 by default whenever a GPU is visible, so the
# call has to be pinned to the CPU explicitly; without the device scope this
# cell measured the GPU again.
# NOTE(review): assumes prepare_all_videos does not set its own device scope
# internally — confirm in feature_extraction.py.
#
# Results go into benchmark-only names so this cell cannot overwrite the
# full-dataset frame_features / frame_masks / labels or the start / stop
# timestamps of the full extraction.
bench_start = time.perf_counter()  # monotonic clock, intended for measuring intervals

#extract frames without gpu
with tf.device('/CPU:0'):
    (bench_features, bench_masks), bench_labels = prepare_all_videos(
        X[0:5], y[0:5], MAX_NUM_FRAMES, NUM_FEATURES, feature_extractor
    )

bench_elapsed = time.perf_counter() - bench_start
print(f"Time to extract frames without GPU: {bench_elapsed}s")

Time to extract frames without GPU: 354.47875595092773s


In [10]:
# Extract features for every video on the first GPU and record the wall-clock
# time. The recorded run of this cell took roughly 13 hours.
# `start` and `stop` are read again by the next cell to report the duration in hours.
start = time.time()

with tf.device('/device:GPU:0'):
    (frame_features, frame_masks), labels = prepare_all_videos(
        X,
        y,
        MAX_NUM_FRAMES,
        NUM_FEATURES,
        feature_extractor,
    )

stop = time.time()

print(f"Time to extract frames with single GPU: {stop - start}s")

video_0000.mp4


2022-07-09 03:24:34.155966: I tensorflow/stream_executor/cuda/cuda_dnn.cc:379] Loaded cuDNN version 8400


video_0134.mp4
video_0248.mp4
video_0357.mp4
Time to extract frames with single GPU: 46774.75740671158s


In [14]:
# Elapsed time of the full feature extraction, converted from seconds to hours.
# The recorded run took ~13 hours with the GPU context set above.
(stop-start)/60/60

12.992988168530994

In [16]:
# Sanity check: features, masks and labels should all agree on the number of videos.
for caption, value in (
    ('Frame features shape: ', frame_features.shape),
    ('Frame masks shape: ', frame_masks.shape),
    ('Number of Labels: ', len(labels)),
):
    print(caption, value)

Frame features shape:  (364, 461, 2048)
Frame masks shape:  (364, 461)
Number of Labels:  364


# Training RNN Sequence Model

In [48]:
from rnn import RNN

In [49]:
# Wrapper object from the local rnn module. The network itself is created
# later, once per fold, through rnn_model.build_model(...).
rnn_model = RNN()

In [50]:
# Train and evaluate the RNN with stratified 5-fold cross-validation.
#
# Produces three dicts keyed by fold number (0-4):
#   test_acc_per_fold       -> accuracy on the fold's held-out videos
#   test_loss_per_fold      -> loss on the fold's held-out videos
#   fold_train_test_indices -> [train_index, test_index] used for the fold

# The split indices are generated from X / y but applied to the arrays returned
# by prepare_all_videos, so all of them must describe the same number of
# videos. Fail loudly instead of training on misaligned data.
_lengths = {
    'X': len(X),
    'y': len(y),
    'frame_features': len(frame_features),
    'frame_masks': len(frame_masks),
    'labels': len(labels),
}
if len(set(_lengths.values())) != 1:
    raise ValueError(f"Inputs to cross-validation have mismatched lengths: {_lengths}")

skfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)

# Convert the labels once rather than twice per fold.
labels_array = np.array(labels)

test_acc_per_fold       = dict()
test_loss_per_fold      = dict()
fold_train_test_indices = dict() #{fold_number: [fold_train_index_array, fold_test_index_array]}

for fold, (train_index, test_index) in enumerate(skfold.split(X, y)):

    print(f'Fold {fold} \n')

    #index data accordingly
    train_features, train_masks = frame_features[train_index], frame_masks[train_index]
    test_features, test_masks = frame_features[test_index], frame_masks[test_index]

    #reshape label arrays into column vectors of shape (n_samples, 1)
    train_labels = np.reshape(labels_array[train_index], (len(train_index), 1))
    test_labels = np.reshape(labels_array[test_index], (len(test_index), 1))

    #create and compile model
    # NOTE(review): build_model is called again for every fold, presumably so
    # each fold starts from freshly initialised weights — confirm in rnn.py.
    rnn_model.build_model(MAX_NUM_FRAMES, NUM_FEATURES)
    rnn_model.compile_model(loss="sparse_categorical_crossentropy", optimizer="adam", metrics="accuracy")

    #train and evaluate the model
    rnn_model.fit(train_features, train_masks, train_labels, f'rnn_model_{fold}')
    loss, accuracy = rnn_model.evaluate(test_features, test_masks, test_labels)

    #store the test accuracies and loss for each fold model
    test_acc_per_fold[fold]       = accuracy
    test_loss_per_fold[fold]      = loss
    fold_train_test_indices[fold] = [train_index, test_index]
    

Fold 0 

Fold 1 

Fold 2 

Fold 3 

Fold 4 



In [51]:
# Held-out accuracy of each fold's model, keyed by fold number.
test_acc_per_fold

{0: 0.7123287916183472,
 1: 0.7397260069847107,
 2: 0.7534246444702148,
 3: 0.7534246444702148,
 4: 0.8194444179534912}

In [52]:
# Held-out loss of each fold's model, keyed by fold number.
test_loss_per_fold

{0: 0.603164792060852,
 1: 0.6413319706916809,
 2: 0.5668148994445801,
 3: 0.5481430292129517,
 4: 0.5037070512771606}