In [None]:
import math
import pickle
import sys
from collections import Counter, defaultdict, namedtuple
from datetime import datetime

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, clear_output
from pytz import timezone

print('Initializing notebook. Please wait...', file=sys.stderr)

import esper.captions as captions
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS
from esper.widget import *
from esper.rekall import *
from rekall.interval_list import IntervalList

# Shared `style` argument passed to the ipywidgets controls built in this notebook.
WIDGET_STYLE_ARGS = {'description_width': 'initial'}

# `window_size` handed to captions.topic_search() when locating anchor phrases.
ANCHOR_WORD_WINDOW_SIZE = 15
# Largest gap between two postings that still lets them be merged into one
# segment. Presumably in seconds, like posting start/end -- TODO confirm.
CONTEXT_WORD_EXTEND_THRESH = 120


def merge_postings(p1, p2):
    """Return a copy of `p1` widened so that it also covers `p2`.

    Both arguments are namedtuple-like postings exposing `start`, `end`,
    `idx` and `len`. The result spans from the earlier `start` to the later
    `end`, and from the lower `idx` to the higher `idx + len`. Any other
    fields are carried over from `p1` unchanged.
    """
    lowest_idx = min(p1.idx, p2.idx)
    highest_idx = max(p1.idx + p1.len, p2.idx + p2.len)
    covering_span = {
        'start': min(p1.start, p2.start),
        'end': max(p1.end, p2.end),
        'idx': lowest_idx,
        'len': highest_idx - lowest_idx,
    }
    return p1._replace(**covering_span)

def extend_postings(postings, threshold):
    """Coalesce neighbouring postings into longer ones.

    Walks `postings` in the order given. A posting is merged into the current
    run (via merge_postings) when it starts no earlier than the run starts and
    no more than `threshold` after the run ends; otherwise the run is emitted
    and a new one begins.

    Args:
        postings: iterable of postings with `start`/`end` attributes.
        threshold: largest allowed gap between a run's end and the next
            posting's start.

    Returns:
        List of merged postings. Empty input yields an empty list (the
        previous for/else form returned `[None]` in that case).
    """
    merged = []
    curr_p = None
    for p in postings:
        if curr_p is None:
            curr_p = p
        elif p.start >= curr_p.start and p.start - curr_p.end <= threshold:
            curr_p = merge_postings(curr_p, p)
        else:
            merged.append(curr_p)
            curr_p = p
    # Flush the last open run, but only if there was any input at all.
    if curr_p is not None:
        merged.append(curr_p)
    return merged

def extend_postings_with_context(anchors, contexts, threshold):
    """Grow each anchor posting over nearby context postings, then coalesce.

    For every anchor, a forward pass over `contexts` absorbs context postings
    that start at or after the (growing) segment's start and within
    `threshold` of its end. A backward pass then absorbs those that start at
    or before the segment's start and end within `threshold` of it. The grown
    segments are finally passed through extend_postings() with the same
    threshold.
    """
    grown = []
    for segment in anchors:
        # Forward pass: stretch the end of the segment.
        for ctx in contexts:
            begins_inside_or_after = ctx.start >= segment.start
            if begins_inside_or_after and ctx.start - segment.end <= threshold:
                segment = merge_postings(segment, ctx)
        # Backward pass: stretch the start of the segment.
        for ctx in reversed(contexts):
            begins_inside_or_before = ctx.start <= segment.start
            if begins_inside_or_before and segment.start - ctx.end <= threshold:
                segment = merge_postings(segment, ctx)
        grown.append(segment)
    return extend_postings(grown, threshold)


# Result bundle returned by find_segments(). Each field is a dict keyed by
# video id: the merged topic segments, the raw anchor postings, and the raw
# context postings found in that video.
TopicSegments = namedtuple('TopicSegments', [
    'video_to_segments', 'video_to_anchor_words', 'video_to_context_words'
])


def find_segments(anchor_words, context_words):
    """Search the captions for topic segments described by a lexicon.

    Anchor phrases are located with captions.topic_search(); context phrases
    are then searched only within the videos that had anchor hits, and each
    anchor posting is extended over nearby context postings (see
    extend_postings_with_context and CONTEXT_WORD_EXTEND_THRESH).

    Args:
        anchor_words: collection of anchor phrases (required).
        context_words: collection of context phrases (may be empty).

    Returns:
        TopicSegments with per-video segments, anchor postings and context
        postings. If no anchor phrases are given, an empty TopicSegments is
        returned without querying the caption index.
    """
    # Anchors are mandatory; don't send an empty query to the search backend.
    if not anchor_words:
        print('No anchor phrases given; nothing to search for.', file=sys.stderr)
        return TopicSegments({}, {}, {})

    # The original wrapped this literal in a .format() call with no
    # placeholders, so the arguments were silently discarded.
    print('Searching for segments...', file=sys.stderr)

    # Find the anchor locations
    video_anchor_locations = {}
    for d in captions.topic_search(list(anchor_words), window_size=ANCHOR_WORD_WINDOW_SIZE):
        video_anchor_locations[d.id] = list(d.postings)

    # Search for context locations, restricted to videos with anchor hits
    video_context_locations = {}
    for d in captions.topic_search(list(context_words), window_size=0,
                                   video_ids=video_anchor_locations.keys()):
        video_context_locations[d.id] = list(d.postings)

    # Extend the anchor locations
    video_topic_segments = {}
    for video_id, anchor_postings in video_anchor_locations.items():
        video_topic_segments[video_id] = extend_postings_with_context(
            anchor_postings, video_context_locations.get(video_id, []),
            CONTEXT_WORD_EXTEND_THRESH)

    coverage_seconds = sum(sum(p.end - p.start for p in l) for l in video_topic_segments.values())
    print('Found {} segments in {} videos covering {:0.2f} minutes.'.format(
        sum(len(l) for l in video_topic_segments.values()),
        len(video_topic_segments),
        coverage_seconds / 60
    ), file=sys.stderr)
    return TopicSegments(video_topic_segments, video_anchor_locations, video_context_locations)

# A lexicon word is only proposed by propose_context_words() when its lexicon
# `count` exceeds this value.
MIN_TOKEN_COUNT = 10000

def propose_context_words(topic_result, k=192, ncols=8, default_threshold=3.):
    """Display toggle buttons proposing new context words for the lexicon.

    Tokens inside the current topic segments are counted and scored by
        log(count in segments / total in segments)
          - log(lexicon count / lexicon total),
    i.e. how over-represented each word is within the segments. Words with a
    lexicon count of at most MIN_TOKEN_COUNT, and words already in
    CONTEXT_WORDS, are skipped. Submitting adds the toggled words to
    CONTEXT_WORDS and refreshes the context text area.

    Args:
        topic_result: TopicSegments from find_segments(), or None if no
            search has been run yet.
        k: maximum number of candidate words shown.
        ncols: number of toggle buttons per row.
        default_threshold: candidates scoring at least this are pre-selected.

    Returns:
        None. Widgets are displayed as a side effect.
    """
    if topic_result is None:
        print('No segments available; run a search first.', file=sys.stderr)
        return

    topic_word_counts = Counter()
    for video_id, segments in topic_result.video_to_segments.items():
        d = captions.get_document(video_id)
        for p in segments:
            topic_word_counts.update(captions.INDEX.tokens(d, p.idx, p.len))

    topic_words_total = sum(topic_word_counts.values())
    if topic_words_total == 0:
        # math.log(0) below would raise ValueError; there is nothing to rank.
        print('No tokens found in the topic segments; nothing to propose.',
              file=sys.stderr)
        return
    all_words_total = sum(w.count for w in captions.LEXICON)

    def filter_cond(t):
        if t not in captions.LEXICON:
            return False
        w = captions.LEXICON[t]
        return w.count > MIN_TOKEN_COUNT and w.token not in CONTEXT_WORDS

    # Term shared by every score: log(all_words_total / topic_words_total).
    const_expr = math.log(all_words_total) - math.log(topic_words_total)
    log_pmis = [
        (t, math.log(topic_word_counts[t]) - math.log(captions.LEXICON[t].count) + const_expr)
        for t in topic_word_counts.keys() if filter_cond(t)
    ]
    log_pmis.sort(key=lambda x: -x[1])
    log_pmis = log_pmis[:k]

    selections = []
    for t, score in log_pmis:
        token = captions.LEXICON[t].token
        w = widgets.ToggleButton(
            value=score >= default_threshold,
            description=token,
            disabled=False,
            button_style='',
            icon=''
        )
        selections.append((t, w))

    submit_button = widgets.Button(
        description='Submit',
        disabled=False,
        button_style='danger'
    )

    def on_submit(b):
        global CONTEXT_WORDS
        selected_words = [
            captions.LEXICON[t].token for t, w in selections if w.value
        ]
        clear_output()
        print('Added {} words to the context.'.format(len(selected_words)))

        CONTEXT_WORDS.update(selected_words)
        sync_context_widget()

    submit_button.on_click(on_submit)

    hboxes = []
    for i in range(0, len(selections), ncols):
        hboxes.append(widgets.HBox([w for _, w in selections[i:i + ncols]]))
    vbox = widgets.VBox(hboxes)
    display(widgets.HBox([
        widgets.Label(
            'Instructions: Select new context words and hit submit. '
            '(Likely words may already be highlighted.) '),
        submit_button
    ]))
    display(vbox)
    
def display_segments(topic_results, filters=None, limit=1000, results_per_page=50):
    """Plot segment time per video and show the top videos in the Esper widget.

    Non-duplicate videos with topic segments are optionally filtered, ranked
    by total segment time (descending), and the top `limit` are loaded into a
    timeline widget together with anchor, context and commercial intervals.

    Args:
        topic_results: TopicSegments from find_segments(), or None if no
            search has been run yet.
        filters: optional dict with any of the keys 'show', 'channel',
            'start' and 'end'. None (the default) means no filtering.
        limit: maximum number of videos to load.
        results_per_page: forwarded to esper_widget().

    Returns:
        None. Output is printed and displayed as a side effect.
    """
    if topic_results is None:
        print('No segments to display; run a search first.', file=sys.stderr)
        return
    # Avoid a shared mutable default argument.
    if filters is None:
        filters = {}

    video_to_topic_time = {
        video_id : sum(p.end - p.start for p in postings)
        for video_id, postings in topic_results.video_to_segments.items()
    }
    video_qs = Video.objects.filter(id__in=list(video_to_topic_time.keys()), duplicate=False)
    if 'show' in filters:
        video_qs = video_qs.filter(show__canonical_show__name=filters['show'])
    if 'channel' in filters:
        video_qs = video_qs.filter(channel__name=filters['channel'])
    # NOTE(review): the notebook passes DatePicker values for 'start'/'end';
    # whether the end day itself is included depends on how the ORM compares
    # them with `time` -- confirm.
    if 'start' in filters:
        video_qs = video_qs.filter(time__gte=filters['start'])
    if 'end' in filters:
        video_qs = video_qs.filter(time__lte=filters['end'])
    video_to_fps = {
        v['id']: v['fps'] for v in video_qs.values('id', 'fps', 'channel__name')
    }
    if len(video_to_fps) == 0:
        print('No videos to display', file=sys.stderr)
        return
    # Keep only videos that survived the filters, then pick the top `limit`.
    video_to_topic_time = {k: v for k, v in video_to_topic_time.items() if k in video_to_fps}
    limit_video_ids = set(sorted(video_to_fps, key=lambda x: -video_to_topic_time[x])[:limit])

    def convert_time(v, t):
        # Posting time -> frame number, using the video's fps.
        return int(t * video_to_fps[v])

    def to_intervallist(video_to_postings):
        return {
            video_id : IntervalList([
                (convert_time(video_id, p.start), convert_time(video_id, p.end), None)
                for p in postings
            ])
            for video_id, postings in video_to_postings.items()
            if video_id in limit_video_ids
        }

    # Plot distribution of topic times in videos
    def plot_dist_of_videos():
        plt.figure(figsize=(7,2))
        x = np.arange(len(video_to_topic_time))
        y = np.array(sorted(video_to_topic_time.values(), key=lambda x: -x)) / 60
        plt.plot(x, y, color='red')
        # Shade the range of videos that will actually be loaded.
        plt.fill_betweenx([0, np.max(y)], len(limit_video_ids), alpha=0.2, color='gray')
        plt.ylabel('Minutes')
        plt.ylim(0, np.max(y))
        plt.xlabel('Video Number')
        plt.xlim(0, len(video_to_topic_time))
        plt.show()

    print('Videos (ordered by descending segment time)')
    plot_dist_of_videos()
    print('Loading {} of {} videos (shaded region)... Please wait.'.format(
        len(limit_video_ids), len(video_to_topic_time)))

    # Convert to intervallists
    video_to_topic_intervals = to_intervallist(topic_results.video_to_segments)
    video_to_anchor_intervals = to_intervallist(topic_results.video_to_anchor_words)
    video_to_context_intervals = to_intervallist({
        k: extend_postings(v, CONTEXT_WORD_EXTEND_THRESH)
        # Coalesce context words to reduce memory usage
        for k, v in topic_results.video_to_context_words.items()
    })
    video_to_commerical_intervals = qs_to_intrvllists(
        Commercial.objects.filter(labeler__name='haotian-commercials',
                                  video__id__in=limit_video_ids))

    # Display results
    result = intrvllists_to_result(
        video_to_topic_intervals, color='red',
        video_order=list(sorted(
            video_to_topic_intervals,
            key=lambda x: -video_to_topic_time[x])))
    add_intrvllists_to_result(result, video_to_anchor_intervals, color='blue')
    add_intrvllists_to_result(result, video_to_context_intervals, color='orange')
    add_intrvllists_to_result(result, video_to_commerical_intervals, color='black')

    display(esper_widget(result, jupyter_keybindings=True, results_per_page=results_per_page))
    
def is_uniformly_sampled(is_3y, fps, frame_number):
    """Tell whether `frame_number` is a multiple of the sampling stride.

    The stride is `fps * 3` frames, rounded down when `is_3y` is truthy and
    rounded up otherwise.
    """
    to_stride = math.floor if is_3y else math.ceil
    return frame_number % to_stride(fps * 3) == 0
    
def analysis(topic_results, n=10000):
    """Print and plot summary statistics over the topic segments.

    Reports total topic time per channel, per hour of day, per weekday and
    per calendar day (video times are treated as UTC and converted to
    US/Eastern), plus the shows with the most coverage. Videos flagged as
    duplicates are ignored.

    Args:
        topic_results: TopicSegments from find_segments(), or None if no
            search has been run yet.
        n: sample size referenced only by the commented-out face queries
            below; currently unused.

    Returns:
        None. Output is printed and plotted as a side effect.
    """
    if topic_results is None:
        print('No segments to analyze; run a search first.', file=sys.stderr)
        return

    video_to_meta = {
        v['id']: {
            'channel': v['channel__name'],
            'show': v['show__canonical_show__name'],
            'time': v['time'],
            'fps': v['fps'],
            'is_3y': v['threeyears_dataset'],
            'path': v['path']
        } for v in Video.objects.filter(
            id__in=list(topic_results.video_to_segments.keys()),
            duplicate=False
        ).values(
            'id', 'channel__name', 'show__canonical_show__name', 'time', 'fps',
            'threeyears_dataset', 'path'
        )
    }
    channels = [c.name for c in Channel.objects.all()]
    utc = timezone('UTC')
    eastern = timezone('US/Eastern')

    channel_to_time = {c: 0. for c in channels}
    channel_to_daypart_to_time = {c: np.zeros(24) for c in channels}
    channel_to_weekday_to_time = {c: np.zeros(7) for c in channels}
    channel_to_time_to_time = {c: defaultdict(float) for c in channels}
    show_to_time = Counter()
    for video_id, postings in topic_results.video_to_segments.items():
        if video_id not in video_to_meta:
            continue

        video_topic_len = sum(p.end - p.start for p in postings)
        channel = video_to_meta[video_id]['channel']
        channel_to_time[channel] += video_topic_len

        video_dt = utc.localize(video_to_meta[video_id]['time']).astimezone(eastern)
        # Offset of the video start within its hour. Without it, postings in
        # videos that do not start on the hour are credited to the wrong hour.
        seconds_past_hour = video_dt.minute * 60 + video_dt.second
        for p in postings:
            posting_len = p.end - p.start
            hour = (video_dt.hour + int((seconds_past_hour + p.start) / 3600)) % 24
            channel_to_daypart_to_time[channel][hour] += posting_len

        channel_to_weekday_to_time[channel][video_dt.weekday()] += video_topic_len
        channel_to_time_to_time[channel][video_dt.date()] += video_topic_len

        show = video_to_meta[video_id]['show']
        show_to_time[(channel, show)] += video_topic_len

    print('Topic time by channel:')
    for c in channel_to_time:
        print('  {}: {:0.3f} hours'.format(c, channel_to_time[c] / 3600))

    print('\nTopic time by daypart:')
    def plot_daypart():
        plt.figure(figsize=(11,3))
        bar_width = 1 / (len(channels) + 1)
        for i, c in enumerate(channels):
            plt.bar(np.arange(24) + (i - 1) * bar_width,
                    channel_to_daypart_to_time[c] / 60,
                    width=bar_width, alpha=0.5, label=c)
        plt.xticks(np.arange(24))
        plt.legend()
        plt.ylabel('Minutes')
        plt.xlabel('Hour of Day')
        plt.show()
    plot_daypart()

    print('\nTopic time by weekday:')
    def plot_weekday():
        plt.figure(figsize=(11,3))
        bar_width = 1 / (len(channels) + 1)
        for i, c in enumerate(channels):
            plt.bar(np.arange(7) + (i - 1) * bar_width,
                    channel_to_weekday_to_time[c] / 60,
                    width=bar_width, alpha=0.5, label=c)
        plt.xticks(np.arange(7), ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
        plt.legend()
        plt.ylabel('Minutes')
        plt.xlabel('Weekday')
        plt.show()
    plot_weekday()

    print('\nTopic time by day:')
    def plot_timeline():
        plt.figure(figsize=(11,3))
        for c in channels:
            data = [x for x in sorted(channel_to_time_to_time[c].items())]
            plt.scatter(
                [x for x, _ in data], [y / 60 for _, y in data],
                alpha=0.5, s=2, label=c)
        plt.legend()
        plt.ylabel('Minutes')
        plt.xlabel('Day')
        plt.show()
    plot_timeline()

    top_n = 10
    print('\nShows with most coverage (top-{}):'.format(top_n))
    for (channel, show), seconds in show_to_time.most_common(top_n):
        print('  {} ({}): {:0.1f} minutes'.format(show, channel, seconds / 60))

    # Time by Gender
    print('Loading face genders...', file=sys.stderr)
    print('Not implemented...', file=sys.stderr)
#     video_to_face_genders = defaultdict(list)
#     face_gender_qs = FaceGender.objects.filter(
#         labeler=Labeler.objects.get(name='knn-gender'), 
#         face__frame__video__id__in=list(video_to_meta.keys()),
#     )
#     for f in face_gender_qs.values(
#         'gender__name', 'face__frame__video__id', 'face__frame__number', 'probability'
#     ).order_by('?')[:n]:
#         video_id = f['face__frame__video__id']
#         fps = video_to_meta[video_id]['fps']
#         is_3y = video_to_meta[video_id]['is_3y']
#         frame_number = f['face__frame__number']
#         if not is_uniformly_sampled(is_3y, fps, frame_number):
#             continue
#         start = frame_number / fps
#         video_to_face_genders[video_id].append({
#             'label': f['gender__name'],
#             'probability': f['probability'],
#             'start': start,
#             'end': start + 3
#         })

    # Time by Identity
    print('Loading face identities...', file=sys.stderr)
    print('Not implemented...', file=sys.stderr)
    # NOTE(review): the disabled code below reads f['frame__number'], but the
    # values() call selects 'face__frame__number' -- fix before re-enabling.
#     ident_labelers = [
#         l['id'] for l in Labeler.objects.filter(
#             name__contains='face-identity'
#         ).exclude(name__contains='face-identity-old:').values('id')
#     ]
#     video_to_face_idents = defaultdict(list)
#     face_ident_qs = FaceIdentity.objects.filter(
#         labeler__id__in=ident_labelers, 
#         face__frame__video__id__in=list(video_to_meta.keys()),
#         probability__gt=0.5
#     )
#     for f in face_ident_qs.values(
#         'identity__name', 'face__frame__video__id', 'face__frame__number', 'probability'
#     ).order_by('?')[:n]:
#         video_id = f['face__frame__video__id']
#         fps = video_to_meta[video_id]['fps']
#         is_3y = video_to_meta[video_id]['is_3y']
#         frame_number = f['frame__number']
#         if not is_uniformly_sampled(is_3y, fps, frame_number):
#             continue
#         start = frame_number / fps
#         video_to_face_idents[video_id].append({
#             'label': f['identity__name'],
#             'probability': f['probability'],
#             'start': start,
#             'end': start + 3
#         })

# Counterpart of the 'Initializing notebook' message printed at the top of this cell.
print('Done initializing notebook.', file=sys.stderr)

# Stories from a Lexicon

Stories are retrieved via lexicons of words. Story lexicons have two components, <b>anchor phrases</b> and <b>context phrases</b>.

<b>Anchor phrases</b> are phrases that must appear for a segment to be considered a part of a story and should be unique to the story. For instance, a segment about 'Hurricane Irma' must mention either 'Irma' or 'Hurricane Irma'.

<b>Context phrases</b> are phrases that are used to describe the story and are used to determine the time extent of a topic segment. For instance, words such as 'devastation' and 'storm' around an anchor such as 'Hurricane Irma' indicate that the discussion is part of the story. These phrases need not be unique to the story.

<b>Instructions:</b>
- Enter anchor phrases to start (required)
- Enter a few context phrases (optional, but recommended)
- Hit 'search for segments'

In [None]:
# Create the notebook-wide lexicon state on first run only. If all three names
# already exist (the cell is being re-run), the current values are kept;
# if any one is missing, all three are reset.
if not {'ANCHOR_WORDS', 'CONTEXT_WORDS', 'TOPIC_SEGMENTS'} <= globals().keys():
    ANCHOR_WORDS = set()
    CONTEXT_WORDS = set()
    TOPIC_SEGMENTS = None

# Output area that receives messages from the widget callbacks in this cell.
status_output = widgets.Output()

# Free-text editor for the anchor lexicon, one phrase per line.
anchor_widget = widgets.Textarea(
    description='Anchor phrases:',
    placeholder='Phrases (one per line)',
    value='',
    disabled=False,
    style=WIDGET_STYLE_ARGS,
    layout=widgets.Layout(width='100%'),
)


def sync_anchor_widget():
    """Write ANCHOR_WORDS (sorted) into the text area and resize it to fit."""
    anchor_widget.value = '\n'.join(sorted(ANCHOR_WORDS))
    # Read the size only after the value is set: the observer below may have
    # rebuilt ANCHOR_WORDS from the new text.
    row_count = len(ANCHOR_WORDS) + 2
    anchor_widget.layout = widgets.Layout(
        width='100%', height='{}px'.format(20 * row_count))


def on_anchor_changed(b):
    """Rebuild ANCHOR_WORDS from the non-blank, stripped lines of the text area."""
    global ANCHOR_WORDS
    with status_output:
        clear_output()
        try:
            stripped_lines = (line.strip() for line in anchor_widget.value.split('\n'))
            ANCHOR_WORDS = {phrase for phrase in stripped_lines if phrase}
        except Exception as e:
            print(e)


anchor_widget.observe(on_anchor_changed, names='value')

# Free-text editor for the context lexicon, one phrase per line.
context_widget = widgets.Textarea(
    description='Context phrases:',
    placeholder='Phrases (one per line)',
    value='',
    disabled=False,
    style=WIDGET_STYLE_ARGS,
    layout=widgets.Layout(width='100%'),
)


def sync_context_widget():
    """Write CONTEXT_WORDS (sorted) into the text area and resize it to fit."""
    context_widget.value = '\n'.join(sorted(CONTEXT_WORDS))
    # Read the size only after the value is set: the observer below may have
    # rebuilt CONTEXT_WORDS from the new text.
    row_count = len(CONTEXT_WORDS) + 2
    context_widget.layout = widgets.Layout(
        width='100%', height='{}px'.format(20 * row_count))


def on_context_changed(b):
    """Rebuild CONTEXT_WORDS from the non-blank, stripped lines of the text area."""
    global CONTEXT_WORDS
    with status_output:
        clear_output()
        try:
            stripped_lines = (line.strip() for line in context_widget.value.split('\n'))
            CONTEXT_WORDS = {phrase for phrase in stripped_lines if phrase}
        except Exception as e:
            print(e)


context_widget.observe(on_context_changed, names='value')

# Button that rewrites both text areas from the current (sorted) lexicon sets.
sort_button = widgets.Button(description='Sort phrases', button_style='', disabled=False)


def on_sort(b):
    """Refresh the anchor and context text areas, in that order."""
    for refresh in (sync_anchor_widget, sync_context_widget):
        refresh()


sort_button.on_click(on_sort)

# Button that runs the lexicon search.
search_button = widgets.Button(
    description='Search for segments', button_style='danger', disabled=False)


def on_search(b):
    """Run find_segments() on the current lexicon and store it in TOPIC_SEGMENTS.

    Anything printed during the search is routed to `status_output`, which is
    cleared first.
    """
    global TOPIC_SEGMENTS
    with status_output:
        clear_output()
        TOPIC_SEGMENTS = find_segments(ANCHOR_WORDS, CONTEXT_WORDS)


search_button.on_click(on_search)
        
# Lay out the lexicon editor: both text areas, the button row, then the status area.
display(
    anchor_widget,
    context_widget,
    widgets.HBox([sort_button, search_button]),
    status_output,
)
# Populate the text areas from any lexicon already held in the kernel.
sync_anchor_widget()
sync_context_widget()

Running `display_segments()` shows the retrieved topic segments with a timeline. You must have hit 'search for segments' prior to running this.

Timeline colors:
- Red = topic segment
- Blue = anchor phrases
- Orange = context phrases
- Grey = commercials

Videos will be ordered by descending amount of time identified as the story.

In [None]:
def show_filter_widgets():
    """Display the channel/show/date filter controls in one row.

    The created widgets are stored in the global FILTER_WIDGETS dict under the
    keys 'show', 'channel', 'start_date' and 'end_date' so that get_filters()
    can read them later.
    """
    global FILTER_WIDGETS

    channel_dd = widgets.Dropdown(
        description='Channel:',
        options=['All', 'CNN', 'FOXNEWS', 'MSNBC'],
        value='All',
        disabled=False,
        style=WIDGET_STYLE_ARGS,
    )
    show_dd = widgets.Dropdown(
        description='Show:',
        options=['All'] + list(sorted(MAJOR_CANONICAL_SHOWS)),
        value='All',
        disabled=False,
        style=WIDGET_STYLE_ARGS,
    )
    start_picker = widgets.DatePicker(
        description='Start date:', disabled=False, style=WIDGET_STYLE_ARGS)
    end_picker = widgets.DatePicker(
        description='End date:', disabled=False, style=WIDGET_STYLE_ARGS)

    FILTER_WIDGETS = {
        'show': show_dd,
        'channel': channel_dd,
        'start_date': start_picker,
        'end_date': end_picker
    }
    controls_row = widgets.HBox([channel_dd, show_dd, start_picker, end_picker])
    display(controls_row)
    
def get_filters():
    """Collect the active filter settings from FILTER_WIDGETS.

    Returns a dict containing 'show' and/or 'channel' when the matching
    dropdown is not set to 'All', and 'start' and/or 'end' when the matching
    date picker holds a truthy value. Unset filters are omitted.
    """
    def current(widget_key):
        return FILTER_WIDGETS[widget_key].value

    filters = {}
    # Dropdowns: 'All' means "no filter".
    for key in ('show', 'channel'):
        choice = current(key)
        if choice != 'All':
            filters[key] = choice
    # Date pickers: an empty picker means "no bound".
    for key, widget_key in (('start', 'start_date'), ('end', 'end_date')):
        picked = current(widget_key)
        if picked:
            filters[key] = picked
    return filters

def show_video_controls():
    """Display the 'Show videos' / 'Dismiss videos' controls and their output area.

    'Show videos' calls display_segments() on the global TOPIC_SEGMENTS with
    the filters from get_filters() and the limit / page size entered in the
    two numeric boxes. 'Dismiss videos' clears the output area.
    """
    videos_out = widgets.Output()

    limit_box = widgets.BoundedIntText(
        description='Video limit:',
        value=1000,
        min=1,
        max=10000,
        disabled=False,
        style=WIDGET_STYLE_ARGS,
    )
    per_page_box = widgets.BoundedIntText(
        description='Results per page:',
        value=25,
        min=1,
        max=100,
        disabled=False,
        style=WIDGET_STYLE_ARGS,
    )
    show_btn = widgets.Button(
        description='Show videos',
        button_style='danger',
        disabled=False,
        style=WIDGET_STYLE_ARGS,
    )
    dismiss_btn = widgets.Button(
        description='Dismiss videos',
        button_style='',
        disabled=False,
        style=WIDGET_STYLE_ARGS,
    )

    def on_show_videos(b):
        # Render into the dedicated output area, replacing any previous result.
        with videos_out:
            clear_output()
            display_segments(
                TOPIC_SEGMENTS, get_filters(),
                limit=limit_box.value,
                results_per_page=per_page_box.value)

    def on_clear_videos(b):
        with videos_out:
            clear_output()

    show_btn.on_click(on_show_videos)
    dismiss_btn.on_click(on_clear_videos)

    display(widgets.HBox([show_btn, dismiss_btn, limit_box, per_page_box]))
    display(videos_out)

# Render the filter row first: it populates FILTER_WIDGETS, which the
# 'Show videos' callback reads through get_filters().
show_filter_widgets()
show_video_controls()

Once we have some segments corresponding to the lexicon, we can use NLP to propose new context words to improve story coverage. `propose_context_words()` will use statistics to suggest new lexicon words.

In [None]:
# TOPIC_SEGMENTS is None until a search has been run (it is set by on_search()).
propose_context_words(TOPIC_SEGMENTS)

# Compute Statistics

`analysis()` will compute statistics over the story segments retrieved.

In [None]:
# TOPIC_SEGMENTS is None until a search has been run (it is set by on_search()).
analysis(TOPIC_SEGMENTS)

Load a debugging lexicon...

In [None]:
# Debugging lexicon: overwrites the current anchor/context sets with a
# hard-coded 'Hurricane Irma' example.
ANCHOR_WORDS = {
    'HURRICANE IRMA', 'IRMA'
}
CONTEXT_WORDS = { 
    'ADVISORY', 'ATLANTIC', 'BANDS', 'BEACH', 'BOATS', 'BRACING', 'BRIDGES',
    'CARIBBEAN', 'CATASTROPHIC', 'CATEGORY', 'CLEANUP', 'COAST', 'COASTAL',
    'CUBA', 'DAMAGE', 'DEBRIS', 'DESTRUCTION', 'DESTRUCTIVE', 'DEVASTATED',
    'DEVASTATING', 'DEVASTATION', 'DISASTERS', 'DOWNTOWN', 'ELECTRICITY',
    'EVACUATE', 'EVACUATED', 'EVACUATION', 'EVACUATIONS', 'FEMA', 'FLOOD',
    'FLOODED', 'FLOODING', 'FLORIDA', 'FORECAST', 'GUSTS', 'HARVEY', 'HURRICANE',
    'HURRICANES', 'IMPACTED', 'IMPACTS', 'INTENSITY', 'IRMA', 'ISLAND', 'ISLANDS',
    'JOSE', 'KEYS', 'LANDFALL', 'MANDATORY', 'METEOROLOGIST', 'MIAMI', 'MONSTER',
    'MYERS', 'NURSING', 'ORLANDO', 'OUTAGES', 'OUTER', 'PALM', 'POWER',
    'PREPARATION', 'PUERTO', 'RAIN', 'RAINFALL', 'RAINS', 'REBUILD',
    'RESPONDERS', 'RESTORED', 'RICO', 'SHELTER', 'SHELTERS', 'STORM',
    'STORMS', 'STRONGEST', 'SUPPLIES', 'SURGE', 'SUSTAINED', 'TAMPA',
    'TIDE', 'TREES', 'TROPICAL', 'WARNINGS', 'WATER', 'WAVES', 'WIND', 'WINDS'
}
# Push the new sets into the text areas, then run the search as if the
# 'Search for segments' button had been clicked.
sync_context_widget()
sync_anchor_widget()
on_search(None)