# Classifying YouTube Videos for Humpback Whale Encounters - Keras CNN-RNN

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import pickle
import glob
import cv2
import os
import time

from sklearn import metrics
from sklearn.model_selection import KFold, StratifiedKFold
import seaborn as sns

In [3]:
# NGC workspace path (where we keep our data).
# Set the WHALE_WORKSPACE_PATH environment variable to point the notebook at a
# different data directory; when it is unset the original mount point is used.
workspace_path = os.environ.get('WHALE_WORKSPACE_PATH', '/mount/data')

# Start WandB Session

In [4]:
import wandb

# Log in to Weights & Biases, then start a run in the
# "whale-classification-inception" project for metric logging.
wandb.login() 

wandb.init(project="whale-classification-inception")

[34m[1mwandb[0m: Currently logged in as: [33mmicheller[0m ([33mepg[0m). Use [1m`wandb login --relogin`[0m to force relogin


# Set GPU Context

In [5]:
# Report how many physical GPUs TensorFlow can see. The count depends on the
# runtime: the original note expected 1 with a Colab GPU runtime and 0 on a
# local machine without a GPU.
print("Num GPUs available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs available:  2


In [7]:
# List the logical GPU devices and print each device's name.
# (tf.config.list_physical_devices('GPU') is the physical-device alternative.)
gpus = tf.config.list_logical_devices('GPU')

for gpu in gpus:
    print(gpu.name)

/device:GPU:0
/device:GPU:1


# Inception V3 (CNN-RNN) 

## Hyperparameters

In [8]:
# Image size handed to the CNN feature extractor wrapper (CNN(IMG_SIZE)).
IMG_SIZE = 224
# BATCH_SIZE and EPOCHS are not referenced by the cells visible in this part of
# the notebook; presumably they are consumed by the training code -- TODO confirm.
BATCH_SIZE = 64
EPOCHS = 10

# Passed to prepare_all_videos and RNN.build_model: frame count per video and
# feature-vector length per frame.
MAX_NUM_FRAMES = 461
NUM_FEATURES = 2048

Each video is represented by at most 461 frames (`MAX_NUM_FRAMES`), and each frame is a 224 x 224 image (`IMG_SIZE`) with three RGB color channels.

# Load Frames + Extract Features with CNN

In [9]:
from feature_extraction import load_frames, prepare_all_videos
from cnn import CNN

In [10]:
#create CNN Feature Extractor
ConvNet = CNN(IMG_SIZE)
feature_extractor = ConvNet.InceptionV3()
feature_extractor

<keras.engine.functional.Functional at 0x7fa8d05d2790>

In [12]:
# Read the table of downloaded videos from the workspace.
videos_csv_path = workspace_path + '/downloaded_videos.csv'
data = pd.read_csv(videos_csv_path)

# X is the same DataFrame object as `data`; popping 'relevant' removes the
# target column from it in place and returns that column as y.
X = data
y = X.pop('relevant')

In [13]:
# Time the extraction of ALL frame features on a single GPU.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments during this multi-hour run.
# `start` / `stop` are only meaningful as a difference (stop - start).
start = time.perf_counter()

# Pin the work to the first GPU so the timing reflects single-GPU extraction.
with tf.device('/device:GPU:0'):
    (frame_features, frame_masks), labels = prepare_all_videos(X, y, MAX_NUM_FRAMES, NUM_FEATURES, feature_extractor)

stop = time.perf_counter()

print(f"Time to extract frames with single GPU: {stop - start}s")

video_0000.mp4


2022-07-11 21:43:32.420158: I tensorflow/stream_executor/cuda/cuda_dnn.cc:379] Loaded cuDNN version 8400


video_0134.mp4
video_0248.mp4
video_0357.mp4
Time to extract frames with single GPU: 18351.396564483643s


In [14]:
# Convert the elapsed extraction time from seconds to hours. In the recorded
# run this was about 5.1 hours with the GPU context set above.
(stop-start)/60/60

5.097610156801012

In [15]:
# Sanity-check the extracted arrays. In the recorded run the features were
# (364, 461, 2048) and the masks (364, 461), which is consistent with
# (videos, MAX_NUM_FRAMES, NUM_FEATURES) and (videos, MAX_NUM_FRAMES).
print('Frame features shape: ', frame_features.shape)
print('Frame masks shape: ', frame_masks.shape)
print('Number of Labels: ', len(labels))

Frame features shape:  (364, 461, 2048)
Frame masks shape:  (364, 461)
Number of Labels:  364


# Training RNN Sequence Model

In [16]:
from rnn import RNN

In [17]:
# Instantiate the project's RNN wrapper. The model itself is built and compiled
# later, once per fold, via build_model / compile_model.
rnn_model = RNN()

In [18]:
# Train and evaluate the RNN with stratified 5-fold cross-validation.

skfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)
fold = 0

test_acc_per_fold       = dict() # {fold number: test accuracy}
test_loss_per_fold      = dict() # {fold number: test loss}
fold_train_test_indices = dict() # {fold number: [train_index, test_index]}

# Convert the labels to an array once; it does not change between folds.
labels_array = np.asarray(labels)

# The split indices are generated from X / y but are used to index the
# extracted feature arrays, so all of them must describe the same samples.
# Fail loudly instead of silently training on misaligned rows.
if not (len(frame_features) == len(frame_masks) == len(labels_array) == len(y)):
    raise ValueError(
        "Sample count mismatch: "
        f"{len(frame_features)} feature rows, {len(frame_masks)} mask rows, "
        f"{len(labels_array)} labels, {len(y)} rows in y"
    )

for train_index, test_index in skfold.split(X, y):

    print(f'Fold {fold} \n')

    # Select this fold's training and test samples.
    train_features, train_masks = frame_features[train_index], frame_masks[train_index]
    test_features, test_masks = frame_features[test_index], frame_masks[test_index]

    # Reshape the labels into column vectors of shape (n_samples, 1).
    train_labels = labels_array[train_index].reshape(-1, 1)
    test_labels = labels_array[test_index].reshape(-1, 1)

    # Build and compile the model for this fold.
    # NOTE(review): the same RNN instance is reused for every fold; this assumes
    # build_model creates a fresh, untrained model on each call -- confirm in rnn.py.
    rnn_model.build_model(MAX_NUM_FRAMES, NUM_FEATURES)
    rnn_model.compile_model(loss="sparse_categorical_crossentropy", optimizer="adam", metrics="accuracy")

    # Train on the training split, then evaluate on the held-out split.
    # The last argument to fit is a per-fold model name.
    rnn_model.fit(train_features, train_masks, train_labels, f'rnn_model_{fold}')
    loss, accuracy = rnn_model.evaluate(test_features, test_masks, test_labels)

    # Store the test accuracy, test loss and split indices for this fold.
    test_acc_per_fold[fold]       = accuracy
    test_loss_per_fold[fold]      = loss
    fold_train_test_indices[fold] = [train_index, test_index]

    fold += 1
    

Fold 0 

Fold 1 

Fold 2 

Fold 3 

Fold 4 

