In [1]:
import urllib3, requests, json, os, pickle
import numpy as np

# HACK: olimar.stanford.edu is fetched with verify=False further down, which makes urllib3
# emit a certificate warning per request. Silence only that warning category instead of
# every urllib3 warning, so unrelated problems still show up.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
# Base URL of the video collection on olimar; the videos and the bounding-box
# files used in this notebook are addressed relative to it.
VIDEO_COLLECTION_BASEURL = 'http://olimar.stanford.edu/hdd/intel_self_driving/'
# Name of the JSON metadata file stored under the base URL.
VIDEO_METADATA_FILENAME = 'intel_metadata.json'

In [3]:
# Grab the metadata (width, height, number of frames, FPS) of my video collection from olimar.
# URLs always use '/', so the address is built explicitly; os.path.join would use the
# platform's path separator (a backslash on Windows).
metadata_url = VIDEO_COLLECTION_BASEURL.rstrip("/") + "/" + VIDEO_METADATA_FILENAME
# verify=False: certificate verification is deliberately skipped for olimar.
req = requests.get(metadata_url, verify=False)
# Fail here on a 4xx/5xx response instead of with an opaque JSON decoding error below.
req.raise_for_status()
# Sorted by filename so the order of the videos is deterministic.
video_collection = sorted(req.json(), key=lambda vm: vm['filename'])
print("The video collection has %d videos." % len(video_collection))

The video collection has 5 videos.


In [4]:
# Names of the maskrcnn files: one pickle per video, numbered 0001 through 0005.
maskrcnn_bbox_files = ['maskrcnn_bboxes_%04d.pkl' % video_num for video_num in range(1, 6)]

# Names of the cyclist files, numbered the same way.
cyclist_bbox_files = ['cyclist_labels_%04d.pkl' % video_num for video_num in range(1, 6)]

In [5]:
# Now, load the bounding boxes from olimar.
def _fetch_bbox_lists(bbox_files):
    """Download and unpickle every file in `bbox_files` from the video collection.

    Args:
        bbox_files: file names relative to VIDEO_COLLECTION_BASEURL.

    Returns:
        A list with one unpickled object per file, in the same order as `bbox_files`.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    loaded = []
    for bbox_file in bbox_files:
        # URLs always use '/'; os.path.join would use the platform's path separator.
        url = VIDEO_COLLECTION_BASEURL.rstrip("/") + "/" + bbox_file
        # verify=False: certificate verification is deliberately skipped for olimar.
        response = requests.get(url, verify=False)
        # Never hand an HTML error page to pickle.
        response.raise_for_status()
        # SECURITY: pickle.loads can execute arbitrary code from its input, and this
        # payload is fetched with verification disabled. Only keep this while the
        # server and the network path are trusted.
        loaded.append(pickle.loads(response.content))
    return loaded


# One entry per bbox file, kept in the order of the file-name lists.
maskrcnn_bboxes = _fetch_bbox_lists(maskrcnn_bbox_files)
cyclist_bboxes = _fetch_bbox_lists(cyclist_bbox_files)


In [6]:
from rekall import Interval, IntervalSet, IntervalSetMapping, Bounds3D
from vgrid import VGridSpec, VideoMetadata, VideoBlockFormat
from vgrid_jupyter import VGridWidget

# Load the video metadata into VideoMetadata objects, using filename for the id
video_metadata = [
    VideoMetadata(v["filename"], v["filename"], v["fps"], v["num_frames"], v["width"], v["height"])
    for v in video_collection
]


def _bboxes_to_ism(videos, per_video_frame_lists):
    """Convert per-video, per-frame bounding boxes into an IntervalSetMapping keyed by video id.

    Args:
        videos: VideoMetadata objects; `id`, `fps`, `width` and `height` are read.
        per_video_frame_lists: one entry per video, aligned with `videos`. Each entry is
            a sequence indexed by frame number, and each frame holds bboxes read as
            (x1, y1, x2, y2, class, score).
            NOTE(review): coordinates are presumably pixels, since they are divided by
            the video width/height -- confirm against the producer of the pickles.

    Returns:
        An IntervalSetMapping with one Interval per bbox. Units of Bounds are seconds
        for time, relative units for X and Y; the payload carries 'class' and 'score'.

    Raises:
        ValueError: if the two inputs differ in length. zip() would otherwise silently
            drop the surplus videos or bbox lists.
    """
    if len(videos) != len(per_video_frame_lists):
        raise ValueError(
            "Got %d videos but %d bbox lists; they must line up one-to-one."
            % (len(videos), len(per_video_frame_lists))
        )
    return IntervalSetMapping({
        vm.id: IntervalSet([
            Interval(
                Bounds3D(
                    # A bbox covers exactly the duration of its frame.
                    t1 = frame_num / vm.fps,
                    t2 = (frame_num + 1) / vm.fps,
                    x1 = bbox[0] / vm.width,
                    x2 = bbox[2] / vm.width,
                    y1 = bbox[1] / vm.height,
                    y2 = bbox[3] / vm.height
                ),
                payload = {
                    'class': bbox[4],
                    'score': bbox[5]
                }
            )
            for frame_num, bboxes_in_frame in enumerate(frame_list)
            for bbox in bboxes_in_frame
        ])
        for vm, frame_list in zip(videos, per_video_frame_lists)
    })


# Load the maskrcnn bboxes into Rekall, using video id as key
maskrcnn_bboxes_ism = _bboxes_to_ism(video_metadata, maskrcnn_bboxes)

# Load the cyclist bboxes into Rekall, using video id as key
cyclist_bboxes_ism = _bboxes_to_ism(video_metadata, cyclist_bboxes)

In [7]:
# Create a cyclist label by merging person and bicycle detections

In [8]:
# Split the Mask R-CNN detections into one IntervalSetMapping per class of interest,
# in the same order as object_names.
object_names = ["person", "bicycle"]
object_isms = [
    maskrcnn_bboxes_ism.filter(
        # Bind the class name as a default argument: a plain closure over the loop
        # variable would see its final value if the predicate were ever called
        # after the comprehension has finished.
        lambda interval, wanted_class=object_name: interval['payload']['class'] == wanted_class
    )
    for object_name in object_names
]

In [9]:
# Create a cyclist label: pair every detected person with a bicycle detected at the
# same time whose box overlaps the person's box, and keep the box spanning both.
# The predicates are imported by name rather than with `import *`, so it is clear
# where they come from and the notebook namespace is not flooded.
from rekall.predicates import and_pred, equal, overlaps

# object_isms holds the "person" detections first and the "bicycle" detections second.
person_ism = object_isms[0]
bicycle_ism = object_isms[1]

constructed_cyclist_bboxes = person_ism.join(
    bicycle_ism,
    predicate = and_pred(
        Bounds3D.T(equal()), # equal along the time dimension
        Bounds3D.X(overlaps()), # boxes overlap in the X dimension
        Bounds3D.Y(overlaps()) # boxes overlap in the Y dimension
    ),
    merge_op = lambda person, bicycle: Interval(
        person['bounds'].span(bicycle['bounds']), # We use the "span" method of Bounds3D to get a spanning bound
        payload = {
            'class': 'bike_person_cyclist',
        }
    ),
    window = 0.5 # choose only pairs that differ by less than half a second from each other
)



In [10]:
# Visualize the labelled cyclists, the Mask-RCNN bicycle/person detections, and the
# cyclist boxes constructed from them.
labelled_cyclists = cyclist_bboxes_ism.filter(
    lambda box: box['payload']['class'] == 'Cyclist')
detected_bicycles = maskrcnn_bboxes_ism.filter(
    lambda box: box['payload']['class'] == 'bicycle')
detected_people = maskrcnn_bboxes_ism.filter(
    lambda box: box['payload']['class'] == 'person')

# NOTE(review): the bicycle and person tracks share the label
# 'mask_rcnn_bounding_boxes' -- confirm the widget keeps them apart.
overview_format = VideoBlockFormat(imaps = [
    ('cyclist_bounding_boxes', labelled_cyclists),
    ('mask_rcnn_bounding_boxes', detected_bicycles),
    ('mask_rcnn_bounding_boxes', detected_people),
    ('constructed_cyclist_boxes', constructed_cyclist_bboxes),
])

vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = overview_format,
    video_endpoint = VIDEO_COLLECTION_BASEURL
)
# Last expression of the cell, so the widget is rendered as the cell output.
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

VGridWidget(vgrid_spec={'compressed': True, 'data': b'x\x9c\xcc\xbd\xd9\x8e,Iv$\xf8+D=\x13\x84\xee\xcb<\xce/\x…

In [99]:
# Calculate precision and recall

In [11]:
# TODO: validate true positives by IoU; for now a match only needs equal time bounds.
def tp(gt, test):
    """Return the true positives of `test` with respect to `gt`, grouped by time.

    Every ground-truth interval is joined with the test intervals that have equal
    time bounds, keeping the test interval of each pair. The matches are then
    grouped along the ('t1', 't2') axis.

    Args:
        gt: ground-truth intervals; must provide `join`.
        test: detected intervals to evaluate.

    Returns:
        The result of `group_by_axis` on the matched test intervals.
    """
    def keep_detection(gt_interval, test_interval):
        # Only the detection side of a matched pair is kept.
        return test_interval

    same_time_matches = gt.join(
        test,
        predicate = Bounds3D.T(equal()),
        merge_op = keep_detection,
        window = 0.0,
    )
    # NOTE(review): presumably the whole frame in relative coordinates -- confirm
    # against Bounds3D's positional argument order.
    grouped_bounds = Bounds3D(0, 1, 0, 1, 0, 1)
    return same_time_matches.group_by_axis(('t1', 't2'), grouped_bounds)

In [14]:
def print_precision_recall_f1(gt, test):
    """Print precision, recall and F1 of `test` against `gt`, summed over all videos.

    True positives come from `tp(gt, test)`. Detections are counted from `test` as
    passed in, and ground truths from `gt` grouped along ('t1', 't2').
    NOTE(review): the call site in this notebook passes `test` already grouped along
    ('t1', 't2'); the counts are only comparable if callers keep doing that.

    Args:
        gt: ground-truth intervals, keyed by video.
        test: detected intervals, keyed by video.

    Returns:
        A tuple (false_positives, false_negatives): `test` minus the true positives
        and `gt` minus the true positives.
    """
    tp_result = tp(gt, test)
    false_positives = test.minus(tp_result, window=1.0)
    false_negatives = gt.minus(tp_result, window=1.0)
    ground_truth = gt.group_by_axis(('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1))

    # Sum each per-video size mapping once. Counting over every key (not only the
    # videos present in `test`) keeps ground truths of videos without any detection
    # in the recall denominator, and avoids a KeyError for videos that have
    # detections but no true positive.
    true_positives = sum(tp_result.size().values())
    total_detections_amount = sum(test.size().values())
    number_ground_truths = sum(ground_truth.size().values())

    # Guard the degenerate cases (no detections, no ground truth, no matches)
    # instead of raising ZeroDivisionError.
    precision = true_positives / total_detections_amount if total_detections_amount else 0.0
    recall = true_positives / number_ground_truths if number_ground_truths else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1: {}'.format(f1))

    return (false_positives, false_negatives)

In [15]:
# Ground truth: the labelled boxes whose class is 'Cyclist'.
cyclist_ground_truth = cyclist_bboxes_ism.filter(
    lambda box: box['payload']['class'] == 'Cyclist')
# Detections under test: the constructed cyclist boxes, grouped along ('t1', 't2').
constructed_cyclists_by_time = constructed_cyclist_bboxes.group_by_axis(
    ('t1', 't2'), Bounds3D(0, 1, 0, 1, 0, 1))

false_positives, false_negatives = print_precision_recall_f1(
    cyclist_ground_truth, constructed_cyclists_by_time)



Precision: 0.9753401360544217
Recall: 0.5969294821753838
F1: 0.740597255851493


In [16]:
# Visualize the false negatives next to the Mask-RCNN bicycle/person detections and
# the constructed cyclist boxes.
fn_bicycle_boxes = maskrcnn_bboxes_ism.filter(
    lambda box: box['payload']['class'] == 'bicycle')
fn_person_boxes = maskrcnn_bboxes_ism.filter(
    lambda box: box['payload']['class'] == 'person')

# NOTE(review): the bicycle and person tracks share the label
# 'mask_rcnn_bounding_boxes' -- confirm the widget keeps them apart.
false_negative_format = VideoBlockFormat(imaps = [
    ('false_negatives_gt', false_negatives),
    ('mask_rcnn_bounding_boxes', fn_bicycle_boxes),
    ('mask_rcnn_bounding_boxes', fn_person_boxes),
    ('constructed_cyclist_boxes', constructed_cyclist_bboxes),
])

vgrid_spec = VGridSpec(
    video_meta = video_metadata,
    vis_format = false_negative_format,
    video_endpoint = VIDEO_COLLECTION_BASEURL
)
# Last expression of the cell, so the widget is rendered as the cell output.
VGridWidget(vgrid_spec = vgrid_spec.to_json_compressed())

VGridWidget(vgrid_spec={'compressed': True, 'data': b'x\x9c\xcc\xbd\xdb\xb2e\xc9q]\xf9+2<\xcbhq\xbf\xf4c\xffB\…

In [76]:
#TODO: calculate mAP
