In [None]:
# Install the tensorflow_docs package from GitHub; the import cell below uses
# `tensorflow_docs.vis.embed` from it.
!pip install -q git+https://github.com/tensorflow/docs

In [None]:
# Download the dataset archive and unpack it into the current working directory.
# Presumably this provides the train.csv / test.csv files and the train/ and test/
# video folders that later cells read — verify against the archive contents.
!wget -q https://git.io/JGc31 -O ucf101_top5.tar.gz
!tar xf ucf101_top5.tar.gz

In [None]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

from tqdm import tqdm

## Hyperparameters

In [None]:
# Side length (pixels) that every frame is resized to; also the height/width of
# the feature extractor's input.
IMG_SIZE = 224
# NOTE(review): BATCH_SIZE is not referenced by any cell visible in this notebook
# (the `fit` call in run_training does not pass a batch size) — confirm intent.
BATCH_SIZE = 64
# Number of epochs passed to `fit` in run_training.
EPOCHS = 10

# Number of frame slots kept per video; shorter videos are zero-padded and masked.
MAX_SEQ_LENGTH = 20
# Length of the per-frame feature vector stored for the sequence model; it has to
# match the output width of the feature extractor.
NUM_FEATURES = 2048

## Data preparation

In [None]:
# Load the train/test index files. Later cells read the `video_name` and `tag`
# columns from these frames.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f'Total videos for training: {len(train_df)}')
print(f'Total videos for test: {len(test_df)}')

# Show a few random rows as a sanity check (no seed is set, so rows vary per run).
train_df.sample(5)

In [None]:
def crop_center_square(frame):
  """Return the largest centered square region of `frame`.

  Args:
    frame: array whose first two axes are (height, width); any trailing axes
      (e.g. channels) are carried through untouched.

  Returns:
    A slice of `frame` whose height and width both equal the shorter of the
    two input dimensions.
  """
  height, width = frame.shape[:2]
  side = min(height, width)
  # Offsets are computed with integer division on each term separately, so the
  # crop window sits on the same pixels regardless of odd/even dimensions.
  top = height // 2 - side // 2
  left = width // 2 - side // 2
  return frame[top : top + side, left : left + side]

In [None]:
def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
  """Decode a video file into a stack of square, resized frames.

  Args:
    path: location of the video file handed to `cv2.VideoCapture`.
    max_frames: stop after this many frames have been collected. With the
      default of 0 the limit never triggers and the whole video is read.
    resize: size tuple passed to `cv2.resize` for every frame.

  Returns:
    `np.array` built from the list of processed frames. If no frame could be
    read the list is empty and so is the returned array.
  """
  capture = cv2.VideoCapture(path)
  collected = []
  try:
    ok, image = capture.read()
    while ok:
      image = cv2.resize(crop_center_square(image), resize)
      # Reverse the channel axis (OpenCV decodes to BGR, so this yields RGB).
      collected.append(image[:, :, [2, 1, 0]])
      if len(collected) == max_frames:
        break
      ok, image = capture.read()
  finally:
    # Always free the decoder handle, even if processing a frame raised.
    capture.release()
  return np.array(collected)

In [None]:
# Map each string label in the `tag` column to an integer class index.
# The vocabulary is the set of unique tags found in the training frame, and
# `num_oov_indices=0` means no extra index is reserved for unknown labels.
label_processor = keras.layers.StringLookup(
  num_oov_indices=0, vocabulary=np.unique(train_df['tag'])
)
print(label_processor.get_vocabulary())

## Build the CNN feature extractor

In [None]:
def build_feature_extractor():
  """Build the CNN that converts a single frame into a feature vector.

  The backbone is InceptionV3 loaded with ImageNet weights, without its
  classification head and with global average pooling on top. It is wrapped
  in a model that first applies InceptionV3's own `preprocess_input`, so
  callers pass frames exactly as `load_video` produces them.

  Returns:
    A `keras.Model` named 'feature_extractor' taking inputs of shape
    (IMG_SIZE, IMG_SIZE, 3).
  """
  backbone = keras.applications.InceptionV3(
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    include_top=False,
    pooling='avg',
    weights='imagenet',
  )

  frame_input = keras.Input((IMG_SIZE, IMG_SIZE, 3))
  # Pixel scaling expected by the backbone is baked into the model graph.
  scaled = keras.applications.inception_v3.preprocess_input(frame_input)

  return keras.Model(frame_input, backbone(scaled), name='feature_extractor')

In [None]:
# Build the extractor once; prepare_all_videos and prepare_single_video read
# this module-level name when they call `feature_extractor.predict`.
feature_extractor = build_feature_extractor()

## Prepare all videos

In [None]:
def prepare_all_videos(df, root_dir):
  """Extract padded per-frame CNN features, masks and labels for every video.

  Args:
    df: DataFrame with a `video_name` column (file name relative to
      `root_dir`) and a `tag` column (string class label).
    root_dir: directory that contains the video files.

  Returns:
    `((frame_features, frame_masks), labels)` where
    - frame_features is float32 of shape (len(df), MAX_SEQ_LENGTH, NUM_FEATURES),
      zero-filled beyond each video's usable length;
    - frame_masks is bool of shape (len(df), MAX_SEQ_LENGTH), True for slots
      holding a real frame and False for padding;
    - labels is the output of `label_processor` on the tags, shape (len(df), 1).
  """
  num_samples = len(df)
  video_paths = df['video_name'].values.tolist()
  labels = label_processor(df['tag'].values[..., None]).numpy()

  frame_masks = np.zeros((num_samples, MAX_SEQ_LENGTH), dtype='bool')
  frame_features = np.zeros(
      (num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype='float32'
  )

  for idx, path in enumerate(tqdm(video_paths)):
    # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding
    # there instead of reading the whole file.
    frames = load_video(os.path.join(root_dir, path), max_frames=MAX_SEQ_LENGTH)
    length = min(MAX_SEQ_LENGTH, frames.shape[0])
    if length == 0:
      # Nothing could be decoded: leave this row all-zero and fully masked.
      continue

    # One batched forward pass per video instead of one predict call per
    # frame; verbose=0 keeps Keras progress output from interleaving with tqdm.
    frame_features[idx, :length] = feature_extractor.predict(
        frames[:length], verbose=0
    )
    frame_masks[idx, :length] = True # True = real frame, False = padding

  return (frame_features, frame_masks), labels

In [None]:
# train_data is the (frame_features, frame_masks) pair for the videos under 'train'.
train_data, train_labels = prepare_all_videos(train_df, 'train')

In [None]:
# test_data is the (frame_features, frame_masks) pair for the videos under 'test'.
test_data, test_labels = prepare_all_videos(test_df, 'test')

## Build the sequence model

In [None]:
def get_sequence_model():
  """Create and compile the GRU classifier that consumes per-frame features.

  The model has two inputs, the feature tensor of shape
  (MAX_SEQ_LENGTH, NUM_FEATURES) and a boolean mask of shape (MAX_SEQ_LENGTH,),
  and ends in a softmax with one unit per entry of the `label_processor`
  vocabulary.

  Returns:
    A `keras.Model` compiled with sparse categorical cross-entropy, the Adam
    optimizer and an accuracy metric.
  """
  num_classes = len(label_processor.get_vocabulary())

  features_in = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
  mask_in = keras.Input((MAX_SEQ_LENGTH, ), dtype='bool')

  # The mask is handed to the first GRU so padded frame slots can be skipped;
  # see the GRU API reference for the `mask` call argument:
  # https://keras.io/api/layers/recurrent_layers/gru/
  hidden = keras.layers.GRU(16, return_sequences=True)(features_in, mask=mask_in)
  hidden = keras.layers.GRU(8)(hidden)
  hidden = keras.layers.Dropout(0.4)(hidden)
  hidden = keras.layers.Dense(8, activation='relu')(hidden)
  class_probs = keras.layers.Dense(num_classes, activation='softmax')(hidden)

  model = keras.Model([features_in, mask_in], class_probs)
  model.compile(
      optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy']
  )
  return model

In [None]:
# Instantiate the compiled sequence classifier used by the training, evaluation
# and inference cells below.
seq_model = get_sequence_model()

In [None]:
# Print the layer-by-layer architecture and parameter counts.
seq_model.summary()

## Training

In [None]:
def run_training(seq_model):
  """Fit `seq_model` on the prepared training data and checkpoint the best weights.

  Reads the module-level `train_data`, `train_labels` and `EPOCHS`. 30% of the
  training samples are held out for validation via `validation_split`. A
  `ModelCheckpoint` callback with `save_best_only=True` writes weights only to
  './video_classifier/'.

  Args:
    seq_model: compiled Keras model accepting `[frame_features, frame_masks]`.

  Returns:
    The `History` object returned by `fit`, so callers can inspect or plot
    the per-epoch loss and metric values.
  """
  filepath = './video_classifier/'
  checkpoint = keras.callbacks.ModelCheckpoint(
      filepath, save_weights_only=True, save_best_only=True, verbose=1
  )

  history = seq_model.fit(
      [train_data[0], train_data[1]],
      train_labels,
      validation_split=0.3,
      epochs=EPOCHS,
      callbacks=[checkpoint]
  )
  # Hand the training curves back instead of silently dropping them.
  return history

In [None]:
# Train the model; the checkpoint callback writes weights to './video_classifier/'.
run_training(seq_model)

In [None]:
# Restore the weights written by the checkpoint callback (same path as in
# run_training, which saves with save_best_only=True) before evaluating.
seq_model.load_weights('./video_classifier/')

In [None]:
# evaluate returns the loss followed by the compiled metrics; only the accuracy is kept.
_, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)

In [None]:
# Report the accuracy as a percentage rounded to two decimals.
print(f'Test accuracy: {round(accuracy * 100, 2)}%')

## Inference

In [None]:
def prepare_single_video(frames):
  """Turn one video's frames into padded feature/mask inputs for the sequence model.

  Args:
    frames: numpy array of frames as returned by `load_video`; the first axis
      indexes frames.

  Returns:
    `(frame_features, frame_mask)` where frame_features is float32 of shape
    (1, MAX_SEQ_LENGTH, NUM_FEATURES) and frame_mask is bool of shape
    (1, MAX_SEQ_LENGTH). Only the first MAX_SEQ_LENGTH frames are used; unused
    slots stay zero in the features and False in the mask.
  """
  frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype='bool')
  frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype='float32')

  length = min(frames.shape[0], MAX_SEQ_LENGTH)
  if length > 0:
    # One batched forward pass over the usable frames instead of one predict
    # call per frame.
    frame_features[0, :length] = feature_extractor.predict(
        frames[:length], verbose=0
    )
    frame_mask[0, :length] = True

  return frame_features, frame_mask

In [None]:
def sequence_prediction(path, seq_model, root_dir='test'):
  """Classify one video and print every class probability, highest first.

  Args:
    path: video file name relative to `root_dir`.
    seq_model: trained model accepting `[frame_features, frame_mask]`.
    root_dir: directory containing the video. Defaults to 'test', the
      location this function previously had hard-coded.

  Returns:
    The frames loaded from the video (all of them, not just the ones fed to
    the model), so the caller can visualize the clip.
  """
  class_vocab = label_processor.get_vocabulary()
  frames = load_video(os.path.join(root_dir, path))
  frame_features, frame_mask = prepare_single_video(frames)
  # predict returns one row per input sample; there is exactly one here.
  probs = seq_model.predict([frame_features, frame_mask])[0]

  # argsort is ascending, so reverse it to list the most likely class first.
  for i in np.argsort(probs)[::-1]:
    print(f'  {class_vocab[i]}: {probs[i] * 100:5.2f}%')
  return frames

In [None]:
# Visualization helper, adapted from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
  """Write `images` to 'animation.gif' and return an embeddable view of it.

  Args:
    images: array of frames; it is cast to uint8 before being saved.

  Returns:
    Whatever `embed.embed_file` returns for the written GIF, for display in
    the notebook.
  """
  gif_path = 'animation.gif'
  imageio.mimsave(gif_path, images.astype(np.uint8), fps=10)
  return embed.embed_file(gif_path)

In [None]:
# Pick one test video at random. No random seed is set in the visible cells, so
# a different video may be chosen on each run.
test_video = np.random.choice(test_df['video_name'].values.tolist())
print(f'Test video path: {test_video}')
# Print the ranked class probabilities, then show the clip as an animated GIF.
test_frames = sequence_prediction(test_video, seq_model)
to_gif(test_frames)