<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Introduction-Detector" data-toc-modified-id="Introduction-Detector-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Introduction Detector</a></span></li></ul></div>

In [None]:
# Imports. Run this first!

from query.models import LabeledInterview, Video, FaceIdentity
from esper.rekall import *
from rekall.temporal_predicates import *
from rekall.spatial_predicates import *
from rekall.interval_list import IntervalList, Interval
from esper.prelude import esper_widget
from esper.captions import scan_for_ngrams_in_parallel

# Video IDs that the queries in this notebook are restricted to; the list is
# passed below as `video_id__in=` / `video_ids=` filters.
sandbox_videos = [529, 763, 2648, 3459, 3730, 3769, 3952, 4143, 4611, 5281, 6185, 7262, 8220,
    8697, 8859, 9215, 9480, 9499, 9901, 10323, 10335, 11003, 11555, 11579, 11792,
    12837, 13058, 13141, 13247, 13556, 13827, 13927, 13993, 14482, 15916, 16215,
    16542, 16693, 16879, 17458, 17983, 19882, 19959, 20380, 20450, 23181, 23184,
    24193, 24847, 24992, 25463, 26386, 27188, 27410, 29001, 31378, 32472, 32996,
    33004, 33387, 33541, 33800, 34359, 34642, 36755, 37107, 37113, 37170, 38275,
    38420, 40203, 40856, 41480, 41725, 42756, 45472, 45645, 45655, 45698, 48140,
    49225, 49931, 50164, 50561, 51175, 52075, 52749, 52945, 53355, 53684, 54377,
    55711, 57384, 57592, 57708, 57804, 57990, 59122, 59398, 60186]

# Introduction Detector

We're going to write a query that looks for introductions of guests and compare its results to the labeled interviews from our sandbox. We'll look for caption phrases such as "JOINING US" or "JOINING ME", and join those caption intervals with one of three visual patterns: a host on screen alone followed by a non-host on screen alone, a non-host alone followed by a host alone, or a host on screen together with a non-host.

In [None]:
# Labeled interviews used as ground truth

interviews = (
    LabeledInterview.objects
    .annotate(fps=F('video__fps'))
    # Scale the labeled start/end by the video's fps to get frame bounds
    # (start/end are presumably in seconds -- confirm against the model).
    .annotate(min_frame=F('fps') * F('start'))
    .annotate(max_frame=F('fps') * F('end'))
    .filter(original=True)
)
interviews_intrvllists = qs_to_intrvllists(interviews)

In [None]:
def compute_statistics(query_intrvllists, ground_truth_intrvllists):
    """Compute per-item precision and recall of a query against ground truth.

    Args:
        query_intrvllists: mapping from video key to the query's interval
            list for that video.
        ground_truth_intrvllists: mapping from video key to the labeled
            interval list for that video.

    Both interval lists must provide ``size()`` and
    ``filter_against(other, predicate=...)``.

    Returns:
        A ``(precision_per_item, recall_per_item)`` tuple.

        * ``precision_per_item`` is the number of query segments kept by
          ``filter_against(ground_truth, predicate=overlaps())`` divided by
          the total number of query segments.
        * ``recall_per_item`` is the number of ground-truth segments kept by
          ``filter_against(query, predicate=overlaps())`` divided by the
          total number of ground-truth segments.

        Either value is 1.0 when its denominator is zero (nothing to get
        wrong / nothing to find).
    """
    total_query_segments = 0
    for video in query_intrvllists:
        total_query_segments += query_intrvllists[video].size()

    total_ground_truth_segments = 0
    for video in ground_truth_intrvllists:
        total_ground_truth_segments += ground_truth_intrvllists[video].size()

    overlapping_query_segments = 0
    overlapping_ground_truth_segments = 0
    for video in query_intrvllists:
        # Videos without any ground truth contribute no matches.
        if video not in ground_truth_intrvllists:
            continue
        query_list = query_intrvllists[video]
        gt_list = ground_truth_intrvllists[video]

        overlapping_query_segments += query_list.filter_against(
            gt_list, predicate=overlaps()).size()
        overlapping_ground_truth_segments += gt_list.filter_against(
            query_list, predicate=overlaps()).size()

    # Guard each ratio on its own denominator (the segment count). Guarding on
    # total duration instead would report a perfect score for a set of
    # zero-length segments even when none of them matched.
    if total_query_segments == 0:
        precision_per_item = 1.0
    else:
        precision_per_item = overlapping_query_segments / total_query_segments

    if total_ground_truth_segments == 0:
        recall_per_item = 1.0
    else:
        recall_per_item = (
            overlapping_ground_truth_segments / total_ground_truth_segments)

    return precision_per_item, recall_per_item

def print_statistics(query_intrvllists, ground_truth_intrvllists):
    """Print the per-item precision and recall of a query against ground truth.

    The numbers come from ``compute_statistics`` called with the same two
    arguments.
    """
    per_item_stats = compute_statistics(query_intrvllists, ground_truth_intrvllists)
    item_precision, item_recall = per_item_stats

    print("Precision Per Item: ", item_precision)
    print("Recall Per Item: ", item_recall)

In [None]:
# Host face bounding boxes, grouped by shot

identities = FaceIdentity.objects.filter(face__shot__video_id__in=sandbox_videos)
# Flatten the shot bounds and bbox corners of every host face onto the row.
hosts = (
    identities.filter(face__is_host=True)
    .annotate(video_id=F("face__shot__video_id"))
    .annotate(shot_id=F("face__shot_id"))
    .annotate(min_frame=F("face__shot__min_frame"))
    .annotate(max_frame=F("face__shot__max_frame"))
    .annotate(bbox_x1=F("face__bbox_x1"))
    .annotate(bbox_y1=F("face__bbox_y1"))
    .annotate(bbox_x2=F("face__bbox_x2"))
    .annotate(bbox_y2=F("face__bbox_y2"))
)

# video id -> shot id -> {'min_frame', 'max_frame', 'objects': [bbox, ...]}
vids = {}
for face in hosts:
    shots_in_video = vids.setdefault(face.video_id, {})
    shot_entry = shots_in_video.setdefault(
        face.shot_id,
        {'min_frame': face.min_frame, 'max_frame': face.max_frame, 'objects': []})
    bbox = {'x1': face.bbox_x1, 'y1': face.bbox_y1, 'x2': face.bbox_x2, 'y2': face.bbox_y2}
    # The same bbox can come back on several rows; keep one copy per shot.
    if bbox not in shot_entry['objects']:
        shot_entry['objects'].append(bbox)

# One interval per shot, carrying that shot's host bboxes as its payload.
host_intrvllists = {}
for video, shots_in_video in vids.items():
    host_intrvllists[video] = IntervalList([
        (
            shot['min_frame'],
            shot['max_frame'],
            {'type': 'bbox_list', 'objects': shot['objects']},
        )
        for shot in shots_in_video.values()
    ])

In [None]:
# Every detected face in the sandbox videos, as per-video IntervalLists

from query.models import Face
# Annotate each face with its video and the frame bounds of its shot.
faces = (
    Face.objects.annotate(video_id=F('shot__video_id'))
    .filter(video_id__in=sandbox_videos)
    .annotate(min_frame=F('shot__min_frame'))
    .annotate(max_frame=F('shot__max_frame'))
)

# video id -> shot id -> {'min_frame', 'max_frame', 'objects': [bbox, ...]}
vids = {}
for face in faces:
    shots_in_video = vids.setdefault(face.video_id, {})
    shot_entry = shots_in_video.setdefault(
        face.shot_id,
        {'min_frame': face.min_frame, 'max_frame': face.max_frame, 'objects': []})
    bbox = {'x1': face.bbox_x1, 'y1': face.bbox_y1, 'x2': face.bbox_x2, 'y2': face.bbox_y2}
    # Keep each distinct bbox once per shot.
    if bbox not in shot_entry['objects']:
        shot_entry['objects'].append(bbox)

# One interval per shot, carrying all of that shot's face bboxes as its payload.
faces_intrvllists = {}
for video, shots_in_video in vids.items():
    faces_intrvllists[video] = IntervalList([
        (
            shot['min_frame'],
            shot['max_frame'],
            {'type': 'bbox_list', 'objects': shot['objects']},
        )
        for shot in shots_in_video.values()
    ])

In [None]:
# Caption intervals that contain any of the introduction phrases below
terms = [
    'JOINING US', 
    'JOINING ME', 
    'HERE WITH ME', 
    'JOINS US',
    'JOINS ME',
    'FOR BEING HERE',
    'SITTING TWO FEET AWAY',
    'BRING IN',
    'JOINED NOW'
]
captions = {}
ngram_hits = scan_for_ngrams_in_parallel(terms, video_ids=sandbox_videos)
scan_results = caption_scan_to_intrvllists(
    ngram_hits, terms, video_ids=sandbox_videos, dilation=10)

# Each scan result is indexed by video; fold them all into a single coalesced
# interval list per video.
for result in scan_results:
    for video in result:
        matches = result[video]
        if video not in captions:
            captions[video] = matches
            continue
        captions[video] = captions[video].set_union(matches).coalesce()

In [None]:
# Baseline: per-item precision/recall of the caption matches alone, measured
# against the labeled interviews.
print_statistics(captions, interviews_intrvllists)

In [None]:
# Caption intervals containing phrases that are later used to discard matches
anti_terms = [
    'PANEL',
    'CORRESPONDENT',
    'ANALYST'
]
anti_captions = {}
anti_ngram_hits = scan_for_ngrams_in_parallel(anti_terms, video_ids=sandbox_videos)
anti_scan_results = caption_scan_to_intrvllists(
    anti_ngram_hits, anti_terms, video_ids=sandbox_videos, dilation=30)

# Fold every scan result into a single coalesced interval list per video.
for result in anti_scan_results:
    for video in result:
        matches = result[video]
        if video not in anti_captions:
            anti_captions[video] = matches
            continue
        anti_captions[video] = anti_captions[video].set_union(matches).coalesce()

In [None]:
# Statistics for the caption matches after dropping every match that overlaps
# an anti-caption in the same video
captions_without_anti_captions = {}
for video in captions:
    kept = captions[video]
    if video in anti_captions:
        # Matches that overlap an anti-caption are subtracted from the list.
        tainted = kept.filter_against(anti_captions[video], predicate=overlaps())
        kept = kept.minus(tainted)
    captions_without_anti_captions[video] = kept
print_statistics(captions_without_anti_captions, interviews_intrvllists)

In [None]:
# Intermediate per-video results, kept so they can be inspected or plotted later
one_host_lists = {}
non_host_alone_lists = {}
host_with_guest_lists = {}
host_alone_lists = {}

# Introductions: caption matches that coincide with a host/guest visual pattern
introductions = {}
for video in captions:
    # Both host detections and face detections are required for the visual part.
    if video not in host_intrvllists or video not in faces_intrvllists:
        continue
    caption = captions[video]
    hosts = host_intrvllists[video]
    faces = faces_intrvllists[video]

    # Shots with exactly one host bbox.
    one_host = hosts.filter(exactly(1)).coalesce()
    # Single-face shots minus the one-host shots; dilating by 30, coalescing
    # and dilating by -30 joins pieces that lie close together.
    non_host_alone = faces.filter(exactly(1)).minus(one_host).dilate(30).coalesce().dilate(-30)
    # Two-face shots that overlap a one-host shot.
    host_with_guest = faces.filter(exactly(2)).overlaps(one_host)
    # One-host shots that overlap a single-face shot.
    host_alone = one_host.overlaps(faces.filter(exactly(1)))

    one_host_lists[video] = one_host
    non_host_alone_lists[video] = non_host_alone
    host_with_guest_lists[video] = host_with_guest
    host_alone_lists[video] = host_alone

    # A host alone directly before or after a non-host alone (max_dist=30).
    host_guest_handoff = host_alone.merge(
        non_host_alone,
        predicate=or_pred(before(max_dist=30), after(max_dist=30)))
    visual_candidates = host_with_guest.set_union(host_guest_handoff).dilate(30)

    # Keep caption matches that overlap a visual candidate, then push each
    # interval's end out by 6000 (presumably frames, so the interval also
    # covers the interview that follows -- confirm the unit).
    intros = caption.merge(visual_candidates, predicate=overlaps()).coalesce().map(
        lambda intrvl: Interval(intrvl.start, intrvl.end + 6000, intrvl.payload))

    # Discard introductions that overlap an anti-caption.
    if video in anti_captions:
        rejected = intros.filter_against(anti_captions[video], predicate=overlaps())
        intros = intros.minus(rejected)

    if intros.size() > 0:
        introductions[video] = intros

In [None]:
# Statistics for the full query: caption matches combined with the face-based
# (visual) filters, with anti-caption overlaps removed.
print_statistics(introductions, interviews_intrvllists)

Let's look at some of these results in depth. You can comment or uncomment lines in the following code to show or hide the intermediate results used throughout our query. By default, our query results are shown in black, ground truth in red, and the caption results in orange.

In [None]:
result = intrvllists_to_result(introductions, color='black')
add_intrvllists_to_result(result, interviews_intrvllists, color='red')
# add_intrvllists_to_result(result, host_intrvllists, color='blue')
# add_intrvllists_to_result(result, one_host_lists, color='purple')
# add_intrvllists_to_result(result, non_host_alone_lists, color='green')
add_intrvllists_to_result(result, captions, color='orange')
# add_intrvllists_to_result(result, anti_captions, color='yellow')
esper_widget(result)

Our visual filtering removes some cases with multiple interviewers, which lowers our recall. But it also removes some other false positives, particularly of pa