<a href="https://colab.research.google.com/github/rifathaqueamit/poc_development/blob/develop/poc_v_11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Demonstration of the POC for searching content inside video</h1>

Prerequisites

Go through steps one by one

Update : V11 (24th Oct, 2023)




In [1]:
# @title 1. Install dependencies
!command -v ffmpeg >/dev/null || (apt update && apt install -y ffmpeg)
!pip install -q mediapy

In [2]:
# @title 2. Import dependencies
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import mediapy as media
import numpy as np
import PIL
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import tqdm
import absl.logging
import random
import re
import tempfile
import ssl
import cv2
import imageio
import requests
import json
import glob

from IPython import display
from urllib import request

tf.get_logger().setLevel('ERROR')
absl.logging.set_verbosity(absl.logging.ERROR)
mpl.rcParams.update({
    'font.size': 10,
})

In [3]:
# @title 3. Define constants (if required)
#Common constants
input_videos_path = "input_videos"
output_videos_path = "output_videos"
video_fps = 5
seconds_per_segments = 15
threshold = 50
frames_per_video = seconds_per_segments * video_fps

#Action detection
action_frame_width = 224
action_frame_height = 224

#Object detection
object_frame_width = 320
object_frame_height = 320
class_names = [
      "person", "bicycle", "car", "motorcycle", "airplane",
      "bus", "train", "truck", "boat", "traffic light",
      "fire hydrant", "---", "stop sign", "parking meter", "bench",
      "bird", "cat", "dog", "horse", "sheep", "cow",
      "elephant", "bear", "zebra", "giraffe", "---", "backpack",
      "umbrella", "---", "---", "handbag", "tie", "suitcase", "frisbee",
      "skis", "snowboard", "sports ball", "kite", "baseball bat",
      "baseball glove", "skateboard", "surfboard", "tennis racket",
      "bottle", "---", "wine glass", "cup", "fork", "knife", "spoon",
      "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
      "carrot", "hot dog", "pizza", "donut", "cake", "chair",
      "couch", "potted plant", "bed", "---",  "dining table", "---", "---",
      "toilet", "---", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
      "microwave", "oven", "toaster", "sink", "refrigerator", "---", "book",
      "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush", "---"
  ]

In [4]:
# @title 4. Get necessary files

#Create folders
if os.path.exists(input_videos_path) == False:
  os.mkdir(input_videos_path)

if os.path.exists(output_videos_path) == False:
  os.mkdir(output_videos_path)

# Clean folders
files = glob.glob('/content/' + input_videos_path + '/*')
for f in files:
  os.remove(f)

files = glob.glob('/content/' + output_videos_path + '/*')
for f in files:
  os.remove(f)

# Load models
if os.path.exists("/content/action_model.tflite") == False:
  !wget https://raw.githubusercontent.com/rifathaqueamit/poc_development/develop/action_model.tflite

if os.path.exists("/content/object_model.tflite") == False:
  !wget https://raw.githubusercontent.com/rifathaqueamit/poc_development/develop/object_model.tflite

# Load input videos
res = requests.get('https://github.com/rifathaqueamit/poc_development/tree/develop/input_videos')
data = json.loads(res.text)
files = data['payload']['tree']['items']

for file_item in files:
  name = file_item['name']
  path = "https://raw.githubusercontent.com/rifathaqueamit/poc_development/develop/input_videos/" + name
  path = path.replace(" ", "%20")
  !wget $path
  os.rename("/content/" + name, '/content/' + input_videos_path + '/' + name)
  print("Loaded : ", name)


--2023-10-24 09:48:06--  https://raw.githubusercontent.com/rifathaqueamit/poc_development/develop/input_videos/video11.mp4
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4403240 (4.2M) [application/octet-stream]
Saving to: ‘video11.mp4’


2023-10-24 09:48:06 (46.4 MB/s) - ‘video11.mp4’ saved [4403240/4403240]

Loaded :  video11.mp4
--2023-10-24 09:48:06--  https://raw.githubusercontent.com/rifathaqueamit/poc_development/develop/input_videos/video12.mp4
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17911680 (17M) [application/oc

In [5]:
# @title 5. Helper functions

# Download Kinetics 600 label map
!wget https://raw.githubusercontent.com/tensorflow/models/f8af2291cced43fc9f1d9b41ddbf772ae7b0d7d2/official/projects/movinet/files/kinetics_600_labels.txt -O labels.txt -q

with tf.io.gfile.GFile('labels.txt') as f:
  lines = f.readlines()
  KINETICS_600_LABELS_LIST = [line.strip() for line in lines]
  KINETICS_600_LABELS = tf.constant(KINETICS_600_LABELS_LIST)

def get_top_k(probs, k=5, label_map=KINETICS_600_LABELS):
  """Outputs the top k model labels and probabilities on the given video."""
  top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
  top_labels = tf.gather(label_map, top_predictions, axis=-1)
  top_labels = [label.decode('utf8') for label in top_labels.numpy()]
  top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
  return tuple(zip(top_labels, top_probs))

# Utilities to open video files using CV2
def crop_center_square(frame):
  y, x = frame.shape[0:2]
  min_dim = min(y, x)
  start_x = (x // 2) - (min_dim // 2)
  start_y = (y // 2) - (min_dim // 2)
  return frame[start_y:start_y+min_dim,start_x:start_x+min_dim]

def load_video(path, max_frames=0, resize=(object_frame_width, object_frame_height)):
  cap = cv2.VideoCapture(path)
  frames = []
  try:
    while True:
      ret, frame = cap.read()
      if not ret:
        break
      frame = crop_center_square(frame)
      frame = cv2.resize(frame, resize)
      frame = frame[:, :, [2, 1, 0]]
      frames.append(frame)

      if len(frames) == max_frames:
        break
  finally:
    cap.release()
  return np.array(frames)

In [6]:
# @title 6. Load pre-trained model
# Create the interpreter and signature runner
action_interpreter = tf.lite.Interpreter(model_path="action_model.tflite")
action_runner = action_interpreter.get_signature_runner()

object_interpreter = tf.lite.Interpreter(model_path="object_model.tflite")
object_interpreter.allocate_tensors()

In [7]:
# @title 7. Collect input videos (Mp4 only)
input_videos = []

for file in os.listdir("input_videos"):
    if file.endswith(".mp4"):
        input_videos.append(file)

if len(input_videos) == 0:
  print("There is no input videos")
else:
  print(input_videos)

['video9.mp4', 'video2.mp4', 'video12.mp4', 'video8.mp4', 'video14.mp4', 'video17.mp4', 'video16.mp4', 'video6.mp4', 'video4.mp4', 'video15.mp4', 'video3.mp4', 'video11.mp4']


In [8]:
# @title 8. Convert the input mp4 video into AVI format
output_videos = []
for input_video in input_videos:
  input_path = input_videos_path + "/" + input_video
  path, filename_ext = os.path.split(input_path)
  filename = os.path.splitext(filename_ext)
  output_path = output_videos_path + "/" + filename[0] + ".avi"
  if os.path.exists(output_path):
    os.remove(output_path)
  print("Input : ", input_path)
  print("Output : ", output_path)
  !ffmpeg -i $input_path -filter:v fps=$video_fps $output_path
  output_videos.append(output_path)

Input :  input_videos/video9.mp4
Output :  output_videos/video9.avi
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libt

In [9]:
# @title 9. Load the videos
action_videos = []
object_videos = []
for url in output_videos:
  object_video = load_video(url)
  action_video = load_video(url, resize=(action_frame_width, action_frame_height)) / 255
  # media.show_video(loaded_video, fps=video_fps)

  print("path : ", url)
  print("action video frames count, width, height, color : ", action_video.shape)
  print("object video frames count, width, height, color : ", object_video.shape)

  action_videos.append((url, action_video))
  object_videos.append((url, object_video))

path :  output_videos/video9.avi
action video frames count, width, height, color :  (108, 224, 224, 3)
object video frames count, width, height, color :  (108, 320, 320, 3)
path :  output_videos/video2.avi
action video frames count, width, height, color :  (295, 224, 224, 3)
object video frames count, width, height, color :  (295, 320, 320, 3)
path :  output_videos/video12.avi
action video frames count, width, height, color :  (295, 224, 224, 3)
object video frames count, width, height, color :  (295, 320, 320, 3)
path :  output_videos/video8.avi
action video frames count, width, height, color :  (251, 224, 224, 3)
object video frames count, width, height, color :  (251, 320, 320, 3)
path :  output_videos/video14.avi
action video frames count, width, height, color :  (77, 224, 224, 3)
object video frames count, width, height, color :  (77, 320, 320, 3)
path :  output_videos/video17.avi
action video frames count, width, height, color :  (236, 224, 224, 3)
object video frames count, widt

In [10]:
# @title 10. Separate the videos in small segments
def separateVideoInSegments(url, video_frames):
  print("separateVideoInSegments() : ", url)
  video_segments = []
  video_segments_playable = []

  for idx, frame in enumerate(video_frames):
    video_id = int(idx / frames_per_video)
    try:
      f32_frame = tf.cast(frame, tf.float32)
      video_segments[video_id].append(f32_frame)
      video_segments_playable[video_id].append(frame)
    except IndexError:
      video_segments.append([f32_frame])
      video_segments_playable.append([frame])

  for idx, segment in enumerate(video_segments):
    print("Segment ", idx+1, ", total frames : ", len(segment))

  return (url, video_segments, video_segments_playable)

In [11]:
action_videos_segments = []
object_videos_segments = []
video_segments_detection = {}
object_segments_detection = {}

# Action videos
for url, video in action_videos:
  action_videos_segments.append(separateVideoInSegments(url, video))

for url, video in object_videos:
  object_videos_segments.append(separateVideoInSegments(url, video))

for url, video in action_videos:
  video_segments_detection[url] = []
  object_segments_detection[url] = []

separateVideoInSegments() :  output_videos/video9.avi
Segment  1 , total frames :  75
Segment  2 , total frames :  33
separateVideoInSegments() :  output_videos/video2.avi
Segment  1 , total frames :  75
Segment  2 , total frames :  75
Segment  3 , total frames :  75
Segment  4 , total frames :  70
separateVideoInSegments() :  output_videos/video12.avi
Segment  1 , total frames :  75
Segment  2 , total frames :  75
Segment  3 , total frames :  75
Segment  4 , total frames :  70
separateVideoInSegments() :  output_videos/video8.avi
Segment  1 , total frames :  75
Segment  2 , total frames :  75
Segment  3 , total frames :  75
Segment  4 , total frames :  26
separateVideoInSegments() :  output_videos/video14.avi
Segment  1 , total frames :  75
Segment  2 , total frames :  2
separateVideoInSegments() :  output_videos/video17.avi
Segment  1 , total frames :  75
Segment  2 , total frames :  75
Segment  3 , total frames :  75
Segment  4 , total frames :  11
separateVideoInSegments() :  outpu

In [12]:
# @title 11. Define the detect function for video classification
def detectFirstPrint(video):
  init_states = {
    name: tf.zeros(x['shape'], dtype=x['dtype'])
    for name, x in action_runner.get_input_details().items()
  }
  del init_states['image']

  clips = video
  states = init_states
  for clip in clips:
    # Input shape: [1, 1, 224, 224, 3]
    outputs = action_runner(**states, image=clip)
    logits = outputs.pop('logits')[0]
    states = outputs

  probs = tf.nn.softmax(logits)
  top_k = get_top_k(probs)

  for label, prob in top_k:
    if prob * 100 >= threshold:
      print("     Action : " + label + ", with probability : " + str(prob * 100) + "%")
      video_segments_detection[url].append(label)
      return

  print("     Action : nothing detected")
  video_segments_detection[url].append("nothing")

In [13]:
# @title 12. Run inference for video classification

# Video action classification
for video in action_videos_segments:
  url = video[0]
  video_segments = video[1]
  video_segments_playable = video[2]

  video_segments_detection[url].clear()

  print("Url : ", url)
  print(".............................")

  segment_id = 0
  for segment in video_segments:
    segment_start_time = segment_id * seconds_per_segments
    segment_end_time = (segment_id + 1) * seconds_per_segments
    print("Segment : ", segment_id+1, " , start : ", segment_start_time, " , end : ", segment_end_time)
    detectFirstPrint(video_segments[segment_id])
    # media.show_video(video_segments_playable[segment_id], fps=video_fps)
    segment_id = segment_id + 1

Url :  output_videos/video9.avi
.............................
Segment :  1  , start :  0  , end :  15
     Action : playing tennis, with probability : 56.42029047012329%
Segment :  2  , start :  15  , end :  30
     Action : nothing detected
Url :  output_videos/video2.avi
.............................
Segment :  1  , start :  0  , end :  15
     Action : shooting goal (soccer), with probability : 65.00195860862732%
Segment :  2  , start :  15  , end :  30
     Action : shooting goal (soccer), with probability : 91.71982407569885%
Segment :  3  , start :  30  , end :  45
     Action : shooting goal (soccer), with probability : 77.54311561584473%
Segment :  4  , start :  45  , end :  60
     Action : shooting goal (soccer), with probability : 64.62652087211609%
Url :  output_videos/video12.avi
.............................
Segment :  1  , start :  0  , end :  15
     Action : punching person (boxing), with probability : 92.71236062049866%
Segment :  2  , start :  15  , end :  30
     Ac

In [14]:
# @title 13. Prepare details for object interpreter
object_input_details = object_interpreter.get_input_details()
object_output_details = object_interpreter.get_output_details()
object_input_shape = object_input_details[0]['shape'][1:3]

In [15]:
# @title 14. Define object detection function
def detectObjects(segmentId, video):
  all_detections = set()
  for idx, frame in enumerate(video):
    if idx % video_fps == 0:
      # detected_objects_in_frame = []
      #plt.imshow(frame / 255)
      #plt.axis('off')
      #plt.show()
      int_frame = np.array(frame, dtype=np.uint8)
      int_frame = np.expand_dims(int_frame, axis=0)
      object_interpreter.set_tensor(object_input_details[0]['index'], int_frame)
      object_interpreter.invoke()
      classes = object_interpreter.get_tensor(object_output_details[1]['index'])
      scores = object_interpreter.get_tensor(object_output_details[2]['index'])
      num_detections = int(object_interpreter.get_tensor(object_output_details[3]['index']))

      detections = set()

      for k in range(num_detections):
        if scores[0, k] > (threshold / float(100)):
            class_id = int(classes[0, k])
            class_name = class_names[class_id]
            # confidence = scores[0, k]
            # label = f"class name = {class_name}, class id = {class_id + 1}, confidence = {confidence:.2f}"
            # print(label)
            # detected_objects_in_frame.append(label)
            detections.add(class_name)
      # print("     Frame : ", idx, ", Detections : ", detections)
      for item in detections:
        all_detections.add(item)
  print("Segment detections : ", all_detections)
  return all_detections

In [16]:
# @title 15. Run inference for object detection
for video in object_videos_segments:
  url = video[0]
  video_segments = video[1]
  video_segments_playable = video[2]

  object_segments_detection[url].clear()

  print("Url : ", url)
  print(".............................")

  segment_id = 0
  for segment in video_segments:
    segment_start_time = segment_id * seconds_per_segments
    segment_end_time = (segment_id + 1) * seconds_per_segments
    print("Segment : ", segment_id+1, " , start : ", segment_start_time, " , end : ", segment_end_time)
    detections = detectObjects(segment_id, video_segments[segment_id])
    object_segments_detection[url].append(detections)
    # media.show_video(video_segments_playable[segment_id], fps=video_fps)
    segment_id = segment_id + 1

Url :  output_videos/video9.avi
.............................
Segment :  1  , start :  0  , end :  15
Segment detections :  {'person', 'tennis racket'}
Segment :  2  , start :  15  , end :  30
Segment detections :  {'person', 'cat'}
Url :  output_videos/video2.avi
.............................
Segment :  1  , start :  0  , end :  15
Segment detections :  {'person', 'fire hydrant', 'bird', 'bottle'}
Segment :  2  , start :  15  , end :  30
Segment detections :  {'person', 'dog'}
Segment :  3  , start :  30  , end :  45
Segment detections :  {'person'}
Segment :  4  , start :  45  , end :  60
Segment detections :  {'person', 'bottle'}
Url :  output_videos/video12.avi
.............................
Segment :  1  , start :  0  , end :  15
Segment detections :  {'person', 'teddy bear'}
Segment :  2  , start :  15  , end :  30
Segment detections :  {'person', 'tv'}
Segment :  3  , start :  30  , end :  45
Segment detections :  {'person'}
Segment :  4  , start :  45  , end :  60
Segment detect

In [44]:
# @title 16. Collect everything together
individual_segment_wise_detections = []
video_segment_detections = {}

for url, video in action_videos:
  video_segment_detections[url] = []

for url, detections in video_segments_detection.items():
  print(detections)
  individual_segment_wise_detections.clear()
  segment_id = 0
  for dect in detections:
    if dect != "nothing":
      individual_segment_wise_detections.append({dect})
    else:
      individual_segment_wise_detections.append(set())
    for setItem in object_segments_detection[url][segment_id]:
      individual_segment_wise_detections[len(individual_segment_wise_detections) - 1].add(setItem)
    segment_id = segment_id + 1
  video_segment_detections[url] = individual_segment_wise_detections.copy()




print("Videos detections : ")
print(video_segment_detections)

['playing tennis', 'nothing']
['shooting goal (soccer)', 'shooting goal (soccer)', 'shooting goal (soccer)', 'shooting goal (soccer)']
['punching person (boxing)', 'punching person (boxing)', 'punching person (boxing)', 'punching person (boxing)']
['nothing', 'riding or walking with horse', 'riding or walking with horse', 'riding or walking with horse']
['salsa dancing', 'nothing']
['scrambling eggs', 'nothing', 'making a cake', 'making a cake']
['nothing', 'nothing', 'nothing']
['nothing', 'nothing', 'motorcycling', 'nothing', 'nothing', 'nothing', 'nothing', 'nothing', 'nothing']
['driving car', 'driving car', 'driving car', 'driving car', 'driving car', 'driving car', 'driving car', 'nothing']
['nothing', 'nothing']
['chopping vegetables']
['nothing', 'nothing']
Videos detections : 
{'output_videos/video9.avi': [{'person', 'tennis racket', 'playing tennis'}, {'person', 'cat'}], 'output_videos/video2.avi': [{'bird', 'bottle', 'fire hydrant', 'shooting goal (soccer)', 'person'}, {'dog

In [60]:
# @title 17. Some more utilities
def searchAtleastOneTerm(terms, text):
  for term in terms:
    if term in text:
      return True
  return False

def searchAllTerms(terms, text):
  found_terms = 0
  for term in terms:
    if term in text:
      found_terms = found_terms + 1
  if found_terms == len(terms):
    print(found_terms)
    return True
  return False

In [65]:
# @title 18. Demonstration by video
search = "soccer playing" # @param {type:"string"}
search_terms = search.lower().split()

if len(search_terms) > 0:
  found_videos = []
  for url, detections in video_segment_detections.items():
    for segment in detections:
      for item in segment:
        if searchAtleastOneTerm(search_terms, item):
          found_videos.append(url)
      else:
        continue
      break
    else:
      continue
    break
  for url, video in object_videos:
    for item in found_videos:
      if url == item:
        media.show_video(video, fps=video_fps)
        break

0
This browser does not support the video tag.


0
This browser does not support the video tag.
