# Action Recognition with an Inflated 3D CNN


This notebook is adapted from the [TensorFlow Hub action recognition tutorial](https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub). I chose to follow this tutorial because TensorFlow Hub offers a wide range of pre-trained models that are ready to use and can be customised and fine-tuned.

The original tutorial used the I3D model trained on Kinetics-400. I switched to the I3D model trained on Kinetics-600, as I expected that more labels would give more confident scores (closer to 100%) for the predicted labels.

I also wanted to use the UCF101 dataset because it is more established and has more papers written about it.

This exercise is run in Colab. Since no training is required, a GPU is not necessary.

## Setup

In [None]:
!pip install -q imageio
!pip install -q opencv-python
!pip install -q git+https://github.com/tensorflow/docs

In [None]:
#@title Import the necessary modules
# TensorFlow and TF-Hub modules.
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow_docs.vis import embed

logging.set_verbosity(logging.ERROR)

# Some modules to help with reading the UCF101 dataset.
import random
import re
import os
import tempfile
import ssl
import cv2
import numpy as np

# Some modules to display an animation using imageio.
import imageio
from IPython import display

from urllib import request  # requires python3

In [None]:
#@title Helper functions for the UCF101 dataset

# Utilities to fetch videos from UCF101 dataset
# Base URL used both to read the directory index and to download single videos.
UCF_ROOT = "https://www.crcv.ucf.edu/THUMOS14/UCF101/UCF101/"
# Cache of video file names; filled on the first call to list_ucf_videos().
_VIDEO_LIST = None
# Temporary directory that holds downloaded videos. A fresh directory is
# created every time this cell is executed.
_CACHE_DIR = tempfile.mkdtemp()
# As of July 2020, crcv.ucf.edu doesn't use a certificate accepted by the
# default Colab environment anymore.
# NOTE(review): this context disables TLS certificate verification and is
# created through a private `ssl` function; only use it for this public
# dataset host.
unverified_context = ssl._create_unverified_context()

def list_ucf_videos():
  """Lists videos available in UCF101 dataset.

  The directory index at UCF_ROOT is downloaded on the first call. The
  de-duplicated, sorted file names found in it are stored in the module-level
  _VIDEO_LIST and reused by later calls.

  Returns:
    A new list of video file names matching "v_*.avi". Mutating the returned
    list does not affect the cached one.
  """
  global _VIDEO_LIST
  if not _VIDEO_LIST:
    # Use the response as a context manager so the connection is closed.
    with request.urlopen(UCF_ROOT, context=unverified_context) as response:
      index = response.read().decode("utf-8")
    # Raw string: "\w" and "\." are invalid escapes in a plain string literal.
    videos = re.findall(r"(v_[\w_]+\.avi)", index)
    _VIDEO_LIST = sorted(set(videos))
  return list(_VIDEO_LIST)

def fetch_ucf_video(video):
  """Fetches a video and caches it in the local filesystem.

  Args:
    video: File name of the video, relative to UCF_ROOT.

  Returns:
    Path of the local copy inside _CACHE_DIR. The download is skipped when a
    file already exists at that path.
  """
  cache_path = os.path.join(_CACHE_DIR, video)
  if not os.path.exists(cache_path):
    urlpath = request.urljoin(UCF_ROOT, video)
    print("Fetching %s => %s" % (urlpath, cache_path))
    # Context managers guarantee the connection and the file are both closed.
    with request.urlopen(urlpath, context=unverified_context) as response:
      data = response.read()
    with open(cache_path, "wb") as cache_file:
      cache_file.write(data)
  return cache_path

# Utilities to open video files using CV2
def crop_center_square(frame):
  """Returns the centered square crop of `frame`.

  The square's side equals the smaller of the first two dimensions of
  `frame`; any trailing dimensions are kept unchanged.
  """
  height, width = frame.shape[:2]
  side = min(height, width)
  top = height // 2 - side // 2
  left = width // 2 - side // 2
  return frame[top:top + side, left:left + side]

def load_video(path, max_frames=0, resize=(224, 224)):
  """Decodes a video file into an array of preprocessed frames.

  Each frame is center-cropped to a square, resized to `resize`, and has its
  channel order reversed (OpenCV decodes frames as BGR).

  Args:
    path: Path of the video file to open with OpenCV.
    max_frames: Stop after this many frames; the default 0 never matches the
      frame count, so the whole video is read.
    resize: Target size passed to cv2.resize.

  Returns:
    The stacked frames divided by 255.0.
  """
  capture = cv2.VideoCapture(path)
  collected = []
  try:
    ok, raw = capture.read()
    while ok:
      square = crop_center_square(raw)
      scaled = cv2.resize(square, resize)
      collected.append(scaled[:, :, [2, 1, 0]])

      if len(collected) == max_frames:
        break
      ok, raw = capture.read()
  finally:
    # Always free the decoder, even if preprocessing raised.
    capture.release()
  return np.array(collected) / 255.0

def to_gif(images):
  """Writes the frames to ./animation.gif and returns an embedded view of it.

  Args:
    images: Array of frames; values are multiplied by 255 and clipped to
      [0, 255] before being converted to uint8.

  Returns:
    The result of embed.embed_file for the written GIF.
  """
  converted_images = np.clip(images * 255, 0, 255).astype(np.uint8)
  try:
    imageio.mimsave('./animation.gif', converted_images, fps=25)
  except TypeError:
    # Newer imageio releases reject `fps` for GIFs and expect the per-frame
    # duration in milliseconds instead: 1000 / 25 fps = 40 ms.
    imageio.mimsave('./animation.gif', converted_images, duration=40)
  return embed.embed_file('./animation.gif')

# Get the Kinetics-600 labels

In [None]:
# Download the Kinetics-600 action labels from GitHub; every line of the
# file becomes one entry of `labels`.
KINETICS_URL = "https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt"
with request.urlopen(KINETICS_URL) as response:
  labels = [raw_line.decode("utf-8").strip() for raw_line in response]
print("Found %d labels." % len(labels))

# Using the UCF101 dataset

In [None]:
# Fetch the names of all videos in the dataset.
ucf_videos = list_ucf_videos()

# Group the video names by action category, keeping first-seen order.
categories = {}
for video in ucf_videos:
  # Drop the leading "v_" and the 12-character tail (e.g. "_g01_c01.avi").
  categories.setdefault(video[2:-12], []).append(video)
print("Found %d videos in %d categories." % (len(ucf_videos), len(categories)))

# One line per category: name, video count and the first two file names.
for category, sequences in categories.items():
  print("%-20s %4d videos (%s, ...)" % (category, len(sequences), ", ".join(sequences[:2])))


In [None]:
# Bar chart of the number of videos per action category.
import matplotlib.pyplot as plt

categories_distribution = {key: len(value) for key, value in categories.items()}

# A wide figure with vertical tick labels keeps the category names readable;
# with the default size and horizontal labels they overlap.
fig, ax = plt.subplots(figsize=(24, 6))
positions = list(range(len(categories_distribution)))
ax.bar(positions, list(categories_distribution.values()), align='center')
ax.set_xticks(positions)
ax.set_xticklabels(list(categories_distribution.keys()), rotation=90)
ax.set_xlabel("Action category")
ax.set_ylabel("Number of videos")
ax.set_title("UCF101 videos per category")
plt.show()

In [None]:
# Get a sample volleyball-spiking video and decode all of its frames.
video_path = fetch_ucf_video("v_VolleyballSpiking_g01_c01.avi")
sample_video = load_video(video_path)


In [None]:
# Show the shape of the decoded clip; the first axis is the frame count.
sample_video.shape

In [None]:
# Load the I3D Kinetics-600 module from TF Hub and keep its 'default' signature.
i3d = hub.load("https://tfhub.dev/deepmind/i3d-kinetics-600/1").signatures['default']

# for future training if necessary
# Wrap the signature in a Keras model. The declared input is a clip with any
# number of 224x224, 3-channel frames.
# NOTE(review): signature callables commonly return a dict keyed by output
# name; confirm this layer yields a tensor, since predict() indexes it with [0].
model = tf.keras.Sequential([
    hub.KerasLayer(i3d, input_shape=(None,224,224,3)),
])
model.summary()

In [None]:
def predict(sample_video):
  """Prints the five highest-scoring entries of `labels` for a clip.

  Args:
    sample_video: Array of frames for a single clip, without a batch axis.
  """
  # The model works on batches, so wrap the single clip in a batch of one.
  batched_clip = tf.constant(sample_video, dtype=tf.float32)[tf.newaxis, ...]

  scores = model(batched_clip)[0]
  probabilities = tf.nn.softmax(scores)

  # Class indices ordered from most to least probable.
  ranked = np.argsort(probabilities)[::-1]
  print("Top 5 actions:")
  for idx in ranked[:5]:
    print(f"  {labels[idx]:22}: {probabilities[idx] * 100:5.2f}%")

In [None]:
# Print the top-5 predicted labels for the sample clip.
predict(sample_video)

## Use this to test sample videos

In [None]:
def download_random_video():
    """Returns the file name of a randomly chosen video.

    A category is drawn uniformly from `categories`, then one of its videos is
    drawn uniformly. Despite the name, nothing is downloaded here.
    """
    action = random.choice(list(categories.keys()))
    candidates = categories[action]
    return candidates[random.randrange(len(candidates))]

In [None]:
# Pick a random video file name from `categories` (nothing is fetched yet).
random_video = download_random_video()
print(random_video)

In [None]:
video_path = fetch_ucf_video(random_video)
# Stop decoding after 50 frames instead of decoding the whole file and
# slicing afterwards; the resulting array is the same.
sample_video = load_video(video_path, max_frames=50)
sample_video.shape

In [None]:
# Write the clip to ./animation.gif and embed it in the cell output.
to_gif(sample_video)

In [None]:
# Print the top-5 predicted labels for the randomly chosen clip.
predict(sample_video)

## Tried enhancements

The idea is to cut the 600 Kinetics labels down to the 101 UCF101 classes. To do this, I added a dense layer with one output per UCF101 class on top of the I3D model and trained it on a small random sample of UCF101 videos.

In [None]:
def download_random_video_with_label():
    """Returns a randomly chosen video file name and its class index.

    The class index is the position of the video's category in the key order
    of `categories`. Despite the name, nothing is downloaded here.
    """
    actions = list(categories.keys())
    action = random.choice(actions)
    candidates = categories[action]
    chosen = candidates[random.randrange(len(candidates))]
    return chosen, actions.index(action)

In [None]:
# Stack a 101-unit dense layer (no activation, so it outputs raw scores) on
# top of the wrapped I3D signature.
# NOTE(review): confirm the KerasLayer yields a tensor rather than a dict
# keyed by output name, otherwise Dense cannot consume it.
new_model = tf.keras.Sequential([
    hub.KerasLayer(i3d, input_shape=(None,224,224,3)),
    tf.keras.layers.Dense(101),
])

new_model.summary()

In [None]:
new_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    # The final Dense layer of new_model has no activation, so the model
    # outputs logits; the loss must apply the softmax itself.
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# Keras callbacks intended for new_model.fit(callbacks=...).
callbacks = [
    tf.keras.callbacks.TensorBoard(),
]

In [None]:
NUM_EPOCHS = 20
NUM_VIDEOS = 100
NUM_FRAMES = 48    # frames kept per training clip
NUM_CLASSES = 101  # must match the width of new_model's final Dense layer

# Class index -> category name, in the same order used to build the labels.
class_names = list(categories.keys())

# float32 halves the memory of the default float64 buffer (about 2.9 GB
# instead of 5.8 GB for 100 clips of 48 frames).
batch_videos = np.zeros((NUM_VIDEOS, NUM_FRAMES, 224, 224, 3), dtype=np.float32)
batch_labels = np.zeros((NUM_VIDEOS, NUM_CLASSES), dtype=np.float32)
for j in range(NUM_VIDEOS):
  video, index = download_random_video_with_label()

  print(video, index, class_names[index])

  video_path = fetch_ucf_video(video)
  # Decode only the frames that are kept instead of the whole video.
  train_video = load_video(video_path, max_frames=NUM_FRAMES)
  num_loaded = len(train_video)
  if num_loaded:
    # Clips shorter than NUM_FRAMES keep zero padding at the end instead of
    # failing on a shape mismatch.
    batch_videos[j, :num_loaded] = train_video

  # One-hot encode the class index.
  batch_labels[j, index] = 1

new_model.fit(
    x=batch_videos,
    y=batch_labels,
    verbose=2,
    epochs=NUM_EPOCHS,
    batch_size=5,
    callbacks=callbacks,
)

In [None]:
test_video = download_random_video()
video_path = fetch_ucf_video(test_video)
# Stop decoding after 50 frames instead of decoding the whole file and
# slicing afterwards; the resulting array is the same.
sample_video = load_video(video_path, max_frames=50)
sample_video.shape

In [None]:
def new_model_predict(sample_video):
  """Prints the five highest-scoring UCF101 categories from new_model.

  Args:
    sample_video: Array of frames for a single clip, without a batch axis.
  """
  # Add a batch axis to the sample video.
  new_model_input = tf.constant(sample_video, dtype=tf.float32)[tf.newaxis,...]

  logits = new_model(new_model_input)[0]
  probabilities = tf.nn.softmax(logits)

  # Class index -> category name; built once rather than on every iteration.
  actions = list(categories.keys())

  print("Top 5 actions:")
  for i in np.argsort(probabilities)[::-1][:5]:
    action = actions[i]
    print(f"  {action:22}: {probabilities[i] * 100:5.2f}%")

In [None]:
# Print the top-5 UCF101 categories predicted by new_model for the clip.
new_model_predict(sample_video)

In [None]:
# Write the test clip to ./animation.gif and embed it in the cell output.
to_gif(sample_video)