<h1>Demonstration of the POC for searching content inside video</h1>

Prerequisites


1.  Create a folder named "input_videos" and place your video files in it (MP4 format only)
2.  Upload the action_model.tflite file
3.  Upload the object_model.tflite file
4.  Run the steps below one by one, in order

Update : V8 (19th Oct, 2023)




In [1]:
# @title 1. Install dependencies
!command -v ffmpeg >/dev/null || (apt update && apt install -y ffmpeg)
!pip install -q mediapy

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.6 MB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.6 MB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m1.0/1.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m11.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# @title 2. Import dependencies
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import mediapy as media
import numpy as np
import PIL
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tqdm
import absl.logging
import random
import re
import tempfile
import ssl
import cv2
import imageio
from IPython import display
from urllib import request

# Only let ERROR-level messages through from the TensorFlow and absl loggers.
tf.get_logger().setLevel('ERROR')
absl.logging.set_verbosity(absl.logging.ERROR)

# Default font size for matplotlib figures.
mpl.rcParams['font.size'] = 10

In [59]:
# @title 3. Define constants (if required)
# Shared settings
input_videos_path = "input_videos"    # folder holding the source .mp4 files
output_videos_path = "output_videos"  # folder receiving the converted .avi files
video_fps = 5                         # frame rate requested from ffmpeg's fps filter
seconds_per_segments = 5              # length of one analysed segment, in seconds
threshold = 50                        # minimum confidence, in percent, for a result to be reported
frames_per_video = video_fps * seconds_per_segments  # number of frames in one full segment

# Frame size used when loading videos for action detection
action_frame_width, action_frame_height = 224, 224

# Frame size used when loading videos for object detection
object_frame_width, object_frame_height = 320, 320

# Object labels, looked up by the class id returned by the object model.
# "---" entries are placeholders (presumably ids with no label assigned - confirm
# against the model's label map). The trailing comment gives the ids of each row.
class_names = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",  # 0-9
    "fire hydrant", "---", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep",  # 10-19
    "cow", "elephant", "bear", "zebra", "giraffe", "---", "backpack", "umbrella", "---", "---",  # 20-29
    "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",  # 30-39
    "skateboard", "surfboard", "tennis racket", "bottle", "---", "wine glass", "cup", "fork", "knife", "spoon",  # 40-49
    "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",  # 50-59
    "cake", "chair", "couch", "potted plant", "bed", "---", "dining table", "---", "---", "toilet",  # 60-69
    "---", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster",  # 70-79
    "sink", "refrigerator", "---", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",  # 80-89
    "---",  # 90
]

In [12]:
# @title 4. Helper functions

# Download Kinetics 600 label map
!wget https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt -O labels.txt -q

with tf.io.gfile.GFile('labels.txt') as f:
  lines = f.readlines()
  KINETICS_600_LABELS_LIST = [line.strip() for line in lines]
  KINETICS_600_LABELS = tf.constant(KINETICS_600_LABELS_LIST)

def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):
  """Return the k highest-scoring labels together with their scores.

  Args:
    probs: Scores to rank. The [:k] slice is applied to the first axis of the
      sorted indices, so this is presumably a 1-D tensor - TODO confirm.
    k: Number of entries to keep.
    label_map: String tensor of labels, gathered with the same indices as probs.

  Returns:
    A tuple of (label, score) pairs ordered from highest to lowest score;
    labels are decoded from UTF-8 bytes.
  """
  ranked_indices = tf.argsort(probs, axis=-1, direction='DESCENDING')
  best_indices = ranked_indices[:k]

  raw_labels = tf.gather(label_map, best_indices, axis=-1).numpy()
  labels = [raw.decode('utf8') for raw in raw_labels]
  scores = tf.gather(probs, best_indices, axis=-1).numpy()
  return tuple(zip(labels, scores))

# Utilities to open video files using CV2
def crop_center_square(frame):
  """Cut the largest centered square out of a frame.

  Args:
    frame: Array whose first two axes are rows and columns.

  Returns:
    A slice of `frame` whose first two axes both have length
    min(rows, columns); any further axes are left untouched.
  """
  rows, cols = frame.shape[:2]
  side = min(rows, cols)
  # Offsets come from the two halves floor-divided separately.
  top = rows // 2 - side // 2
  left = cols // 2 - side // 2
  return frame[top:top + side, left:left + side]

def load_video(path, max_frames=0, resize=(object_frame_width, object_frame_height)):
  """Read a video with OpenCV and return its frames as one array.

  Every frame is center-cropped to a square, resized to `resize`, and has
  its three channels reordered with index [2, 1, 0] (presumably OpenCV's BGR
  order turned into RGB).

  Args:
    path: Video file to open.
    max_frames: Stop once this many frames were collected. The check runs
      after a frame is appended, so the default 0 never matches and the
      whole video is read.
    resize: Target size handed to cv2.resize.

  Returns:
    np.array built from the list of processed frames (empty when no frame
    could be read).
  """
  capture = cv2.VideoCapture(path)
  collected = []
  try:
    while True:
      grabbed, raw_frame = capture.read()
      if not grabbed:
        break
      squared = crop_center_square(raw_frame)
      resized = cv2.resize(squared, resize)
      collected.append(resized[:, :, [2, 1, 0]])

      if max_frames == len(collected):
        break
  finally:
    # Always release the capture handle, even if processing a frame fails.
    capture.release()
  return np.array(collected)

In [6]:
# @title 5. Load pre-trained model
# Create the interpreter and signature runner
# Action model: loaded from action_model.tflite in the working directory and
# called later through its signature runner with named inputs.
action_interpreter = tf.lite.Interpreter(model_path="action_model.tflite")
action_runner = action_interpreter.get_signature_runner()

# Object model: loaded from object_model.tflite in the working directory and
# driven later through set_tensor / invoke / get_tensor, so its tensors are
# allocated here.
object_interpreter = tf.lite.Interpreter(model_path="object_model.tflite")
object_interpreter.allocate_tensors()

In [7]:
# @title 6. Make necessary folders
# Make sure the input and output folders exist. exist_ok=True turns the call
# into a no-op for a folder that is already there, so no separate existence
# check is needed.
for folder in (input_videos_path, output_videos_path):
  os.makedirs(folder, exist_ok=True)

In [8]:
# @title 7. Collect input videos (Mp4 only)
# Collect the names of the .mp4 files in the input folder. The folder is taken
# from input_videos_path so this cell stays in step with the constants cell,
# and the names are sorted so the processing order is the same on every run.
input_videos = sorted(
    name for name in os.listdir(input_videos_path) if name.endswith(".mp4")
)

if len(input_videos) == 0:
  print("There is no input videos")
else:
  print(input_videos)

['video1.mp4']


In [None]:
# @title 8. Convert the input mp4 video into AVI format
output_videos = []
for input_video in input_videos:
  input_path = input_videos_path + "/" + input_video
  path, filename_ext = os.path.split(input_path)
  filename = os.path.splitext(filename_ext)
  output_path = output_videos_path + "/" + filename[0] + ".avi"
  if os.path.exists(output_path):
    os.remove(output_path)
  print("Input : ", input_path)
  print("Output : ", output_path)
  !ffmpeg -i $input_path -filter:v fps=$video_fps $output_path
  output_videos.append(output_path)

In [28]:
# @title 9. Load the videos and show
# Load every converted video twice: once at load_video's default size with the
# pixel values left as read (for object detection), and once at the action
# frame size with the values divided by 255 (for action classification).
action_videos = []
object_videos = []
for video_path in output_videos:
  frames_for_objects = load_video(video_path)
  frames_for_actions = load_video(video_path, resize=(action_frame_width, action_frame_height)) / 255

  print("path : ", video_path)
  print("action video frames count, width, height, color : ", frames_for_actions.shape)
  print("object video frames count, width, height, color : ", frames_for_objects.shape)

  # Keep the path next to the frames so later cells can report it.
  action_videos.append((video_path, frames_for_actions))
  object_videos.append((video_path, frames_for_objects))

path :  output_videos/video1.avi
action video frames count, width, height, color :  (77, 224, 224, 3)
object video frames count, width, height, color :  (77, 320, 320, 3)


In [29]:
# @title 10. Separate the videos in small segments
def separateVideoInSegments(url, video_frames):
  """Split the frames of a video into consecutive fixed-length segments.

  A segment holds frames_per_video frames; the last one is shorter when the
  frame count is not a multiple of frames_per_video. The segment sizes are
  printed.

  Args:
    url: Path of the video; printed and returned unchanged.
    video_frames: Iterable of frames.

  Returns:
    A tuple (url, video_segments, video_segments_playable). Both lists contain
    one list per segment: video_segments holds the frames cast to float32
    tensors, video_segments_playable holds the frames as they were passed in.
  """
  print("separateVideoInSegments() : ", url)
  video_segments = []
  video_segments_playable = []

  for idx, frame in enumerate(video_frames):
    # Open a new segment explicitly at every multiple of frames_per_video,
    # instead of waiting for an IndexError from a missing segment.
    if idx % frames_per_video == 0:
      video_segments.append([])
      video_segments_playable.append([])
    video_segments[-1].append(tf.cast(frame, tf.float32))
    video_segments_playable[-1].append(frame)

  for idx, segment in enumerate(video_segments):
    print("Segment ", idx+1, ", total frames : ", len(segment))

  return (url, video_segments, video_segments_playable)

In [30]:
# Split every loaded video into segments: first the action-sized videos,
# then the object-sized ones.
action_videos_segments = [
    separateVideoInSegments(path, frames) for path, frames in action_videos
]
object_videos_segments = [
    separateVideoInSegments(path, frames) for path, frames in object_videos
]

separateVideoInSegments() :  output_videos/video1.avi
Segment  1 , total frames :  25
Segment  2 , total frames :  25
Segment  3 , total frames :  25
Segment  4 , total frames :  2
separateVideoInSegments() :  output_videos/video1.avi
Segment  1 , total frames :  25
Segment  2 , total frames :  25
Segment  3 , total frames :  25
Segment  4 , total frames :  2


In [76]:
# @title 11. Define the detect function
def detectFirstPrint(video):
  """Classify the action in one video segment and print the outcome.

  The frames are passed to the action model one at a time; every output of a
  call except 'logits' is fed back as state into the next call. The logits of
  the last frame are turned into probabilities, and the first of the top
  labels whose probability (in percent) reaches `threshold` is printed. If no
  label qualifies, or the segment has no frames, "nothing detected" is
  printed.

  Args:
    video: Iterable of frames belonging to one segment.
  """
  # Zero-initialise every model input except the frame itself, which is
  # supplied on each call.
  states = {
    name: tf.zeros(x['shape'], dtype=x['dtype'])
    for name, x in action_runner.get_input_details().items()
    if name != 'image'
  }

  logits = None
  for clip in video:
    # NOTE(review): the original comment here gave the input shape as
    # [1, 1, 224, 224, 3]; confirm it matches the frames that are passed in.
    outputs = action_runner(**states, image=clip)
    logits = outputs.pop('logits')[0]
    states = outputs

  # An empty segment never assigns logits; report it instead of failing on
  # an undefined variable.
  if logits is None:
    print("     Action : nothing detected")
    return

  probs = tf.nn.softmax(logits)
  top_k = get_top_k(probs)

  for label, prob in top_k:
    if prob * 100 >= threshold:
      print("     Action : " + label + ", with probability : " + str(prob * 100) + "%")
      return

  print("     Action : nothing detected")

In [77]:
# @title 12. Run inference

# Video action classification
# Each entry is (url, segments, playable segments); the playable copy is only
# needed to preview a segment, e.g. media.show_video(playable[index], fps=video_fps).
for video_url, segments, _playable in action_videos_segments:
  print("Url : ", video_url)
  print(".............................")

  for index, segment in enumerate(segments):
    # Time window covered by the segment, in seconds.
    start_seconds = index * seconds_per_segments
    end_seconds = start_seconds + seconds_per_segments
    print("Segment : ", index+1, " , start : ", start_seconds, " , end : ", end_seconds)
    detectFirstPrint(segment)

Url :  output_videos/video1.avi
.............................
Segment :  1  , start :  0  , end :  5
     Action : salsa dancing, with probability : 97.83188700675964%
Segment :  2  , start :  5  , end :  10
     Action : salsa dancing, with probability : 88.37751150131226%
Segment :  3  , start :  10  , end :  15
     Action : nothing detected
Segment :  4  , start :  15  , end :  20
     Action : nothing detected


In [78]:
# Look up the object model's input/output tensor metadata once; detectObjects
# reads the tensor indices from these lists.
object_input_details = object_interpreter.get_input_details()
object_output_details = object_interpreter.get_output_details()
# Dimensions 1 and 2 of the first input's shape - presumably (height, width); TODO confirm.
object_input_shape = object_input_details[0]['shape'][1:3]

In [79]:
def detectObjects(video):
  """Run the object detector on sampled frames of a segment and print the labels.

  Only frames whose index is a multiple of video_fps are analysed. For each
  of them the set of class names whose score exceeds `threshold` (a
  percentage) is printed.

  Args:
    video: Iterable of frames belonging to one segment; each analysed frame
      is converted to uint8 and given a leading batch axis before inference.
  """
  min_score = threshold / float(100)  # threshold is expressed in percent
  input_index = object_input_details[0]['index']

  for idx, frame in enumerate(video):
    if idx % video_fps != 0:
      continue

    int_frame = np.expand_dims(np.array(frame, dtype=np.uint8), axis=0)
    object_interpreter.set_tensor(input_index, int_frame)
    object_interpreter.invoke()

    # NOTE(review): assumes outputs 1, 2 and 3 are classes, scores and the
    # detection count - confirm against the model's output order.
    classes = object_interpreter.get_tensor(object_output_details[1]['index'])
    scores = object_interpreter.get_tensor(object_output_details[2]['index'])
    raw_count = object_interpreter.get_tensor(object_output_details[3]['index'])
    # Calling int() on an array with one or more dimensions is deprecated in
    # recent NumPy releases, so take the first element explicitly.
    num_detections = int(np.asarray(raw_count).reshape(-1)[0])

    detections = set()
    for k in range(num_detections):
      if scores[0, k] > min_score:
        class_id = int(classes[0, k])
        detections.add(class_names[class_id])
    print("     Frame : ", idx, ", Detections : ", detections)


In [80]:
# Object detection
# Each entry is (url, segments, playable segments); the playable copy is only
# needed to preview a segment, e.g. media.show_video(playable[index], fps=video_fps).
for video_url, segments, _playable in object_videos_segments:
  print("Url : ", video_url)
  print(".............................")

  for index, segment in enumerate(segments):
    # Time window covered by the segment, in seconds.
    start_seconds = index * seconds_per_segments
    end_seconds = start_seconds + seconds_per_segments
    print("Segment : ", index+1, " , start : ", start_seconds, " , end : ", end_seconds)
    detectObjects(segment)

Url :  output_videos/video1.avi
.............................
Segment :  1  , start :  0  , end :  5
     Frame :  0 , Detections :  {'person'}
     Frame :  5 , Detections :  {'person'}
     Frame :  10 , Detections :  {'person', 'car'}
     Frame :  15 , Detections :  {'person'}
     Frame :  20 , Detections :  {'person'}
Segment :  2  , start :  5  , end :  10
     Frame :  0 , Detections :  {'person'}
     Frame :  5 , Detections :  {'person'}
     Frame :  10 , Detections :  {'person'}
     Frame :  15 , Detections :  {'person'}
     Frame :  20 , Detections :  {'person'}
Segment :  3  , start :  10  , end :  15
     Frame :  0 , Detections :  {'person'}
     Frame :  5 , Detections :  {'person', 'car'}
     Frame :  10 , Detections :  {'person'}
     Frame :  15 , Detections :  {'person'}
     Frame :  20 , Detections :  {'person'}
Segment :  4  , start :  15  , end :  20
     Frame :  0 , Detections :  {'person'}
