# SSD Video Inference and Summarization

In [34]:
from io import StringIO
from math import ceil
import os
from statistics import mean
from timeit import default_timer as timer

from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TerminateOnNaN, CSVLogger
from keras import backend as K
from keras.models import load_model
import numpy as np
from matplotlib import pyplot as plt
from PIL import Image
import cv2

from models.keras_ssd7 import build_model
from keras_loss_function.keras_ssd_loss import SSDLoss
from keras_layers.keras_layer_AnchorBoxes import AnchorBoxes
from keras_layers.keras_layer_DecodeDetections import DecodeDetections
from keras_layers.keras_layer_DecodeDetectionsFast import DecodeDetectionsFast

from ssd_encoder_decoder.ssd_input_encoder import SSDInputEncoder
from ssd_encoder_decoder.ssd_output_decoder import decode_detections, decode_detections_fast

from data_generator.object_detection_2d_data_generator import DataGenerator
from data_generator.object_detection_2d_misc_utils import apply_inverse_transforms
from data_generator.data_augmentation_chain_variable_input_size import DataAugmentationVariableInputSize
from data_generator.data_augmentation_chain_constant_input_size import DataAugmentationConstantInputSize
from data_generator.data_augmentation_chain_original_ssd import SSDDataAugmentation

%matplotlib inline

## Set the model configuration parameters

In [63]:
# Network input size. Every video frame is resized to (img_width, img_height)
# before prediction, and the triple is handed to build_model as `image_size`.
img_height, img_width, img_channels = 300, 480, 3

# Forwarded to build_model as `subtract_mean` / `divide_by_stddev`.
intensity_mean = intensity_range = None

# Number of classes handed to build_model.
# NOTE(review): presumably the positive classes only (no background) — confirm.
n_classes = 5

# Anchor box configuration, forwarded unchanged to build_model.
scales = [0.08, 0.16, 0.32, 0.64, 0.96]
aspect_ratios = [0.5, 1.0, 2.0]
two_boxes_for_ar1 = True
steps = offsets = None
clip_boxes = False
variances = [1.0] * 4
normalize_coords = True

## Build or load the model

We load the trained weights into a model built with the same parameters that were used for training.

In [32]:
# Reset the Keras backend session before (re)building the model.
K.clear_session()

# Architecture settings: these come from the configuration cell and, per the
# note above, must match the ones the weights were trained with.
model_kwargs = dict(
    image_size=(img_height, img_width, img_channels),
    n_classes=n_classes,
    mode='inference',
    l2_regularization=0.0007,
    scales=scales,
    aspect_ratios_global=aspect_ratios,
    aspect_ratios_per_layer=None,
    two_boxes_for_ar1=two_boxes_for_ar1,
    steps=steps,
    offsets=offsets,
    clip_boxes=clip_boxes,
    variances=variances,
    normalize_coords=normalize_coords,
    subtract_mean=intensity_mean,
    divide_by_stddev=intensity_range,
)
model = build_model(**model_kwargs)

# Trained weights are matched to layers by name.
weights_path = 'C:/Users/Ignatios/Documents/ssd_keras/model_weights/custom/ssd7_epoch-19_loss-1.4077_val_loss-1.8736.h5'
model.load_weights(weights_path, by_name=True)

# Compile with the optimizer / loss pair below.
adam = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

Define helper classes for storing and analyzing the per-frame detection details produced by the video inference

In [35]:
class ObjectDetails:
    """A single detected object: its class id and bounding box corners."""

    def __init__(self, class_obj, xmin, ymin, xmax, ymax):
        # Class identifier of the detection
        self.class_obj = class_obj
        # Bounding box: top-left (xmin, ymin) and bottom-right (xmax, ymax)
        self.xmin, self.ymin = xmin, ymin
        self.xmax, self.ymax = xmax, ymax

class AverageObjectDetails(ObjectDetails):
    """Per-window summary of one class.

    Extends ObjectDetails (class id + bounding box) with the frames in which
    the class was seen, the fraction of the window that represents, and the
    first / last frame number of the window.
    """

    def __init__(self, class_obj, visible_frames, visible_percentage, start_frame, end_frame, xmin, ymin, xmax, ymax):
        super().__init__(class_obj, xmin, ymin, xmax, ymax)
        self.visible_frames, self.visible_percentage = visible_frames, visible_percentage
        self.start_frame, self.end_frame = start_frame, end_frame

    def fill_frames(self):
        """Mark the class as visible in every frame of the window (both ends inclusive)."""
        first, last = self.start_frame, self.end_frame
        self.visible_frames = [*range(first, last + 1)]
        self.visible_percentage = 1.0

class ObjectSummary:
    """Accumulates, for one class, the frame numbers in which it is visible."""

    def __init__(self, class_num, visible_frames):
        self.class_num, self.visible_frames = class_num, visible_frames

    def update_frames(self, frames):
        """Append `frames` to the stored list (in place)."""
        self.visible_frames += frames

class FrameDetails:
    """Pairs a frame number with the objects detected in that frame."""

    def __init__(self, frame, frame_objects):
        self.frame, self.frame_objects = frame, frame_objects

Define helper functions for pre-processing the video inference and applying slight enhancements on some frame details

In [41]:
def break_in_half_second_windows(detections_in_seconds, out_frames):
    """Split the per-frame detections into consecutive half-second windows.

    Args:
        detections_in_seconds: sequence with one entry per video frame.
        out_frames: frames per second of the video (e.g. 30 gives windows of
            15 frames).

    Returns:
        A list of slices of `detections_in_seconds`; the last one may be
        shorter than the others. Empty input yields an empty list.
    """
    # A frame rate below 2 would truncate to a zero-length window, which makes
    # range() raise ValueError, so always keep at least one frame per window.
    frame_window = max(1, int(out_frames / 2))

    return [
        detections_in_seconds[start:start + frame_window]
        for start in range(0, len(detections_in_seconds), frame_window)
    ]

def intersection_over_union(boxA, boxB):
    """Return the Intersection over Union (IoU) ratio of two bounding boxes.

    Both arguments must expose `xmin`, `ymin`, `xmax` and `ymax`. Widths and
    heights are computed with a `+ 1`, i.e. both corner coordinates are
    counted as part of the box.
    """
    # Extent of the overlap rectangle; non-positive when the boxes are disjoint
    overlap_width = min(boxA.xmax, boxB.xmax) - max(boxA.xmin, boxB.xmin) + 1
    overlap_height = min(boxA.ymax, boxB.ymax) - max(boxA.ymin, boxB.ymin) + 1
    overlap_area = max(0, overlap_width) * max(0, overlap_height)

    area_a = (boxA.xmax - boxA.xmin + 1) * (boxA.ymax - boxA.ymin + 1)
    area_b = (boxB.xmax - boxB.xmin + 1) * (boxB.ymax - boxB.ymin + 1)

    # Union = both areas minus the part that was counted twice
    union_area = float(area_a + area_b - overlap_area)
    return overlap_area / union_area

def analyze_window(window):
    """Summarise every class seen inside one window of frames.

    For each class the bounding box corners are averaged over all of its
    detections in the window, the frame numbers where it was detected are
    collected, and the share of window frames it appears in is computed
    (rounded to 3 decimals).

    Args:
        window: non-empty sequence of objects exposing `frame` and
            `frame_objects`.

    Returns:
        A list with one AverageObjectDetails per class, in order of first
        appearance.
    """
    first_frame, last_frame = window[0].frame, window[-1].frame

    # class id -> (xmins, ymins, xmaxs, ymaxs, frame numbers)
    per_class = {}
    for details in window:
        for detected in details.frame_objects:
            xmins, ymins, xmaxs, ymaxs, seen_at = per_class.setdefault(
                detected.class_obj, ([], [], [], [], []))
            xmins.append(detected.xmin)
            ymins.append(detected.ymin)
            xmaxs.append(detected.xmax)
            ymaxs.append(detected.ymax)
            seen_at.append(details.frame)

    # Collapse the collected coordinates into one average box per class
    window_size = len(window)
    summaries = []
    for class_num, (xmins, ymins, xmaxs, ymaxs, seen_at) in per_class.items():
        share = round(len(seen_at) / window_size, 3)
        summaries.append(AverageObjectDetails(
            class_num, seen_at, share, first_frame, last_frame,
            mean(xmins), mean(ymins), mean(xmaxs), mean(ymaxs)))

    return summaries

def optimize_window(analyzed_classes):
    """Filter and enhance the per-class summaries of one window.

    Steps:
      * drop classes visible in less than 20% of the window;
      * walk the remaining classes in order and drop a class when another
        class still in the running overlaps it with IoU > 0.5 and is visible
        in more frames (treated as a likely misclassification);
      * for every class visited that is visible in more than 70% but less
        than 100% of the window, fill in the missing frames.

    Returns:
        The list of summaries that were kept.
    """
    survivors = [ac for ac in analyzed_classes if ac.visible_percentage >= 0.2]

    # Iterate over a snapshot: `survivors` gets rebound while we walk it
    for current in list(survivors):
        rivals = [c for c in survivors if c.class_obj != current.class_obj]
        if len(survivors) > 1:
            beaten = any(
                intersection_over_union(current, rival) > 0.5
                and rival.visible_percentage > current.visible_percentage
                for rival in rivals
            )
            if beaten:
                survivors = rivals
        if 0.7 < current.visible_percentage < 1.0:
            current.fill_frames()

    return survivors

def windowing_frame_fixing(detections_in_seconds, out_frames, classes):
    """Turn raw per-frame detections into per-class lists of visible frames.

    The detections are cut into half-second windows; each window is analysed
    and optimised (gap filling / removal of likely misclassifications) and the
    frames that survive are appended to the summary of their class.

    Args:
        detections_in_seconds: per-frame detection details for the whole video.
        out_frames: frames per second of the video.
        classes: class ids to create summaries for.

    Returns:
        dict mapping class id -> ObjectSummary.
    """
    summaries = {class_num: ObjectSummary(class_num, []) for class_num in classes}

    for window in break_in_half_second_windows(detections_in_seconds, out_frames):
        for kept in optimize_window(analyze_window(window)):
            summaries[kept.class_obj].update_frames(kept.visible_frames)

    return summaries


Define helper functions for extracting summary details for each class of interest, as well as co-appearance statistics

In [61]:
def co_appearance(lst1, lst2):
    """Return the items of `lst1` (order and duplicates kept) that also occur in `lst2`."""
    shared = frozenset(lst2)
    return list(filter(shared.__contains__, lst1))

def frames_to_distinct_scenes(class_frames, out_frames):
    """Group frame numbers into scenes and express them in seconds.

    Consecutive entries of `class_frames` belong to the same scene while they
    are at most one frame apart; a larger gap starts a new scene. Entries are
    compared in the order given (presumably ascending — verify against caller).

    Args:
        class_frames: frame numbers in which an object appears.
        out_frames: frames per second of the video.

    Returns:
        A list of [start, end] pairs in seconds, rounded to 2 decimals;
        empty when the object was never detected.
    """
    if not class_frames:
        return []

    # Grow the current scene until a gap of more than one frame shows up
    scenes = [[class_frames[0]]]
    for frame in class_frames[1:]:
        current_scene = scenes[-1]
        if frame - current_scene[-1] > 1:
            scenes.append([frame])
        else:
            current_scene.append(frame)

    # Only the first and last frame of every scene matter, converted to seconds
    return [
        [round(scene[0] / out_frames, 2), round(scene[-1] / out_frames, 2)]
        for scene in scenes
    ]

def extract_summary_for_class(class_num, optimized_frames, out_frames):
    """Collect screen-time details for one class.

    Args:
        class_num: id of the class of interest.
        optimized_frames: dict mapping class id -> object exposing
            `visible_frames`.
        out_frames: frames per second of the video.

    Returns:
        A tuple `(frame_count, scenes, shared_frames)`: the number of frames
        the class is visible in, its scenes as [start, end] pairs in seconds,
        and a dict mapping every other class id to the frames both classes
        are visible in.
    """
    target_frames = optimized_frames[class_num].visible_frames
    scenes = frames_to_distinct_scenes(target_frames, out_frames)

    shared_frames = {
        other_class: co_appearance(target_frames, summary.visible_frames)
        for other_class, summary in optimized_frames.items()
        if other_class != class_num
    }

    return len(target_frames), scenes, shared_frames

def extract_co_appearance_stats(co_appearance_frames, out_frames):
    """Convert co-appearance frame lists into scenes expressed in seconds.

    Args:
        co_appearance_frames: dict mapping class id -> frames shared with the
            class of interest.
        out_frames: frames per second of the video.

    Returns:
        dict mapping the same class ids -> list of [start, end] pairs in seconds.
    """
    return {
        class_key: frames_to_distinct_scenes(shared_frames, out_frames)
        for class_key, shared_frames in co_appearance_frames.items()
    }

def format_time(seconds):
    """Format a time in seconds as ``M:SS.cc`` (minutes, seconds, hundredths).

    The value is first rounded to whole hundredths of a second and then split
    with integer arithmetic. This avoids the float truncation of the previous
    implementation (0.29 s printed as ``.28``) and carries correctly into the
    seconds / minutes when the fraction rounds up to a full second.

    Args:
        seconds: non-negative time in seconds.

    Returns:
        The formatted string, e.g. ``"1:15.50"`` for 75.5.
    """
    total_hundredths = int(round(seconds * 100))
    whole_seconds, hundredths = divmod(total_hundredths, 100)
    minutes, secs = divmod(whole_seconds, 60)
    return "%01d:%02d.%02d" % (minutes, secs, hundredths)

def print_time_frame(distinct_scene):
    """Print the start and duration of a [start, end] scene given in seconds.

    Nothing is printed when the duration, rounded to 2 decimals, is not positive.
    """
    start, end = distinct_scene[0], distinct_scene[1]
    duration = round(end - start, 2)
    if duration > 0.0:
        print('At {} for {} seconds'.format(format_time(start), duration))

Define helper functions for processing the video, making predictions using the model and drawing bounding boxes

In [38]:
# FourCC codec code per output file extension (keys carry no leading dot).
# Both supported extensions are written with the XVID codec.
VIDEO_TYPE = {
    extension: cv2.VideoWriter_fourcc(*'XVID')
    for extension in ('avi', 'mp4')
}

def clean_multiple_confidence_boxes(positive_classes, y_pred_thresh):
    """Keep only the most confident box of every class.

    Args:
        positive_classes: class ids to look for.
        y_pred_thresh: boxes whose element 0 is the class id and element 1
            the confidence.

    Returns:
        At most one box per entry of `positive_classes`, in that order; classes
        without any box are skipped. On equal confidence the first box wins.
    """
    best_boxes = []
    for wanted_class in positive_classes:
        matches = [box for box in y_pred_thresh if box[0] == wanted_class]
        if not matches:
            continue
        best_boxes.append(max(matches, key=lambda box: box[1]))
    return best_boxes

def get_video_type(filename):
    """Return the FourCC codec code to use for writing `filename`.

    The extension is looked up in VIDEO_TYPE case-insensitively; unknown or
    missing extensions fall back to the 'avi' entry.
    """
    # os.path.splitext keeps the leading dot ('.avi'), while the VIDEO_TYPE
    # keys have none, so strip it before the lookup.
    _, ext = os.path.splitext(filename)
    return VIDEO_TYPE.get(ext.lstrip('.').lower(), VIDEO_TYPE['avi'])

def video_inference():
    """Run the model on every frame of the input video and record the detections.

    Each frame is converted to RGB, resized to the model input size and passed
    to `model.predict`. Predictions above the confidence threshold are reduced
    to one box per class, scaled back to the original frame size and drawn on
    the frame together with the processing FPS. The annotated frame is shown
    in a preview window (press 'q' to stop early) and written to the output
    video.

    Reads these notebook globals: `video_path`, `save_path`, `out_frames`,
    `img_width`, `img_height`, `model`, `confidence_threshold`,
    `numerical_positive_classes`, `colors` and `classes`.

    Returns:
        A list with one FrameDetails per processed frame (frame numbers start
        at 0), each holding the ObjectDetails drawn on that frame.
    """
    cap = cv2.VideoCapture(video_path)
    out = None
    all_frames_objects = []

    # try/finally: the capture, the writer and the preview window must be
    # released even when prediction fails or the cell is interrupted.
    try:
        # Get original video width and height
        orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Create a VideoWriter object to save the output video
        out = cv2.VideoWriter(save_path, get_video_type(save_path), out_frames, (orig_width, orig_height))

        accum_time = 0
        curr_fps = 0
        curr_frame = 0
        fps = "FPS: ??"
        prev_time = timer()

        # Open the video and read frame by frame
        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                break

            # Convert the frame to RGB format
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Resize the frame to model input acceptable dimensions (batch of one)
            img = cv2.resize(rgb_frame, (img_width, img_height))
            input_images = np.array([img])

            y_pred = model.predict(input_images)

            # Get aspect ratio between original and model input dimensions
            width_aspect = frame.shape[1] / img_width
            height_aspect = frame.shape[0] / img_height

            # Filter out the predictions that are below the defined confidence threshold
            y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]

            # Filter out multiple bounding boxes with lower confidence for the same class
            y_pred_thresh = clean_multiple_confidence_boxes(numerical_positive_classes, y_pred_thresh[0])

            frame_objects = []

            # Calculate and draw the bounding boxes for each predicted class.
            # Box layout as used here: [class id, confidence, xmin, ymin, xmax, ymax]
            for box in y_pred_thresh:
                class_id = int(box[0])
                xmin = int(round(box[2] * width_aspect))
                ymin = int(round(box[3] * height_aspect))
                xmax = int(round(box[4] * width_aspect))
                ymax = int(round(box[5] * height_aspect))
                color = colors[class_id]
                label = '{}: {:.2f}'.format(classes[class_id], box[1])

                frame_objects.append(ObjectDetails(class_id, xmin, ymin, xmax, ymax))

                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color, 2)

                # Filled label background at the top-left corner of the box
                text_top = (xmin, ymin-10)
                text_bot = (xmin + 80, ymin + 5)
                text_pos = (xmin + 5, ymin)
                cv2.rectangle(frame, text_top, text_bot, color, -1)
                cv2.putText(frame, label, text_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255,255,255), 1)

            # Add the prediction details for summarization purposes
            all_frames_objects.append(FrameDetails(curr_frame, frame_objects))
            curr_frame = curr_frame + 1

            # Calculate FPS: count processed frames and refresh the label
            # each time more than one second has accumulated
            curr_time = timer()
            exec_time = curr_time - prev_time
            prev_time = curr_time
            accum_time = accum_time + exec_time
            curr_fps = curr_fps + 1

            if accum_time > 1:
                accum_time = accum_time - 1
                fps = "FPS: " + str(curr_fps)
                curr_fps = 0

            # Draw FPS on the top left corner
            cv2.rectangle(frame, (0,0), (50, 17), (255,255,255), -1)
            cv2.putText(frame, fps, (3,10), cv2.FONT_HERSHEY_SIMPLEX, 0.35, (0,0,0), 1)

            cv2.imshow('frame', frame)
            out.write(frame)

            # Access frames on max speed and wait for closing signal
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        if out is not None:
            out.release()
        cap.release()
        cv2.destroyAllWindows()

    return all_frames_objects

## Video Inference

- Select the input video and output video save location / framerate
- Set a confidence threshold for bounding box inference
- Set the desired bounding box colour to be drawn for each class
- Set the classes to be predicted from the video

In [39]:
# Input clip, annotated output clip and the frame rate the output is written with
video_path = 'C:/Users/Ignatios/Downloads/final_video.mp4'
save_path = 'C:/Users/Ignatios/Downloads/videoplayback_out.avi'
out_frames = 30

# Predictions with a confidence at or below this value are discarded
confidence_threshold = 0.5

# Class names indexed by class id; id 0 is the background, the rest are the
# positive classes looked for in every frame
classes = ['background', 'BB8', 'R2D2', 'Yoda', 'Chewbacca', 'Stormtrooper']
numerical_positive_classes = list(range(1, len(classes)))

# Drawing colour per class id (same indexing as `classes`); ids 2-5 share one colour
colors = [(0,0,0), (32,178,170)] + [(34,139,34)] * 4

all_frames_detections = video_inference()

## Video Summary

- Pre-process the collected frames by applying enhancements and filtering possible misclassifications
- Iterate over all classes of interest
- Extract video summary for every class
- Extract co-appearance summary for every class
- Print summary details in a formatted way

In [62]:
# Names of the positive classes in class-id order (class id = index + 1).
# Kept under its own name: `classes` (with 'background' at index 0) is indexed
# by class id inside video_inference and must not be overwritten here, otherwise
# re-running the inference cell mislabels every box and fails on class id 5.
summary_class_names = ['BB8', 'R2D2', 'Yoda', 'Chewbacca', 'Stormtrooper']

optimized_frames = windowing_frame_fixing(all_frames_detections, out_frames, numerical_positive_classes)

for class_id, class_name in enumerate(summary_class_names, start=1):
    total_frames, scenes_in_seconds, co_appearance_frames = extract_summary_for_class(class_id, optimized_frames, out_frames)

    print('Total screen time for {} is {} seconds'.format(class_name, round(total_frames/out_frames, 2)))
    print('\nSpecifically, {} appeared:'.format(class_name))
    for distinct_scene in scenes_in_seconds:
        print_time_frame(distinct_scene)

    # Only list the classes that actually shared at least one scene
    stats_per_class = extract_co_appearance_stats(co_appearance_frames, out_frames)
    for co_appeared_object, shared_scenes in stats_per_class.items():
        if shared_scenes:
            print('\n{} co-appeared with {}'.format(class_name, summary_class_names[co_appeared_object - 1]))
            for distinct_scene in shared_scenes:
                print_time_frame(distinct_scene)
    print('\n\n')


Total screen time for BB8 is 39.7 seconds

Specifically, BB8 appeared:
At 0:11.40 for 1.73 seconds
At 0:14.50 for 0.47 seconds
At 0:17.77 for 1.4 seconds
At 0:22.50 for 0.8 seconds
At 0:24.87 for 8.43 seconds
At 0:39.70 for 1.77 seconds
At 0:41.77 for 0.06 seconds
At 0:42.00 for 1.77 seconds
At 0:44.33 for 1.64 seconds
At 0:46.67 for 5.36 seconds
At 0:52.37 for 0.16 seconds
At 0:52.73 for 1.74 seconds
At 0:54.73 for 0.9 seconds
At 0:56.40 for 0.57 seconds
At 0:57.20 for 0.13 seconds
At 0:57.50 for 0.57 seconds
At 0:58.33 for 3.84 seconds
At 1:03.90 for 1.07 seconds
At 1:05.50 for 2.47 seconds
At 1:09.20 for 1.87 seconds
At 1:12.00 for 1.07 seconds
At 1:14.87 for 1.1 seconds

BB8 co-appeared with R2D2
At 0:24.87 for 3.86 seconds

BB8 co-appeared with Yoda
At 1:00.37 for 0.1 seconds

BB8 co-appeared with Chewbacca
At 0:42.50 for 0.13 seconds



Total screen time for R2D2 is 13.07 seconds

Specifically, R2D2 appeared:
At 0:03.00 for 2.7 seconds
At 0:16.17 for 1.56 seconds
At 0:23.33 for 5