<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Choose-a-Topic" data-toc-modified-id="Choose-a-Topic-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Choose a Topic</a></span></li><li><span><a href="#Analysis" data-toc-modified-id="Analysis-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Analysis</a></span><ul class="toc-item"><li><span><a href="#Compare-screen-time-across-the-entire-dataset" data-toc-modified-id="Compare-screen-time-across-the-entire-dataset-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Compare screen time across the entire dataset</a></span></li><li><span><a href="#Compare-screen-time-by-show" data-toc-modified-id="Compare-screen-time-by-show-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Compare screen time by show</a></span><ul class="toc-item"><li><span><a href="#Including-hosts" data-toc-modified-id="Including-hosts-2.2.1"><span class="toc-item-num">2.2.1&nbsp;&nbsp;</span>Including hosts</a></span></li><li><span><a href="#Excluding-hosts" data-toc-modified-id="Excluding-hosts-2.2.2"><span class="toc-item-num">2.2.2&nbsp;&nbsp;</span>Excluding hosts</a></span></li></ul></li></ul></li></ul></div>

In [None]:
from esper.prelude import *
from esper.stdlib import *
from esper.topics import *
from esper.spark_util import *
from esper.plot_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

from datetime import timedelta
from collections import defaultdict
import _pickle as pickle

# Choose a Topic

In [None]:
# Topic under study: seeds the lexicon lookup below and is interpolated into
# the plot titles at the end of the notebook.
topic = "terrorism"

In [None]:
# Build the candidate lexicon for the chosen topic. Later cells iterate it as
# pairs whose first element is the word/phrase.
# NOTE(review): the second element is presumably a mutual-information score --
# confirm against esper.topics.
lexicon = mutual_info(topic)

In [None]:
# List every candidate phrase (one per line) so the relevant ones can be
# hand-picked into `selected_words` below; the paired value is not shown.
for phrase, _ in lexicon:
    print(phrase)

In [None]:
# Hand-curated subset of the lexicon, one phrase per line. The parsing cell
# below strips each line and skips blank lines and lines starting with '#',
# so prefixing a phrase with '#' disables it without deleting it.
selected_words = """
be terrorism
domestic terrorism
terrorism be
terrorism
fight terrorism
islamic terrorism
radical islamic terrorism
counter terrorism
state sponsor
radical islamic
sponsor
homegrown
islamic
task force
sponsors
radical
counter
extremism
# nance
domestic
violent extremism
eradicate
# obsolete
islamist
# mudd
# mechanical
nato be
sponsored
# qatar
terrorize
hate crime
# task
egyptair
# be rule
radicalization
keep america safe
radical islam
jihadist
malcolm
lone wolf
watch list
# refuses
special agent
# workplace
security analyst
extremist
destroy isis
farook
syed
terrorist organization
bernardino
san bernardino
# inspired
islam be
keep america
# designate
a terrorist
islam
hamas
are investigate
links
egyptian
iran be
# defeat
muslim country
do appear
terrorist
terrorist groups
# nypd
hezbollah
lone
terrorist attacks
terrorists
motive
muslim community
counterterrorism
radicalized
terrorist attack
# former cia
fight against
jihadi
# france be
# definition
jihad
america safe
extremists
egypt
grown
export
nato
war against
combat
munich
threat
# fbi say
# related
# soil
chattanooga
# act
# analyst
"""

In [None]:
# Turn the hand-edited text block into a set of active phrases: whitespace is
# trimmed, and blank lines and '#'-prefixed (disabled) lines are dropped.
stripped_lines = (raw_line.strip() for raw_line in selected_words.split('\n'))
selected_words_set = {
    phrase
    for phrase in stripped_lines
    if phrase != '' and phrase[0] != '#'
}

# Keep only the lexicon entries whose phrase was selected, in lexicon order.
filtered_lexicon = [entry for entry in lexicon if entry[0] in selected_words_set]

In [None]:
# Locate the segments that match the filtered lexicon.
# NOTE(review): the units/meaning of window_size and threshold are defined by
# find_segments in esper.topics -- confirm there before tuning these values.
segments = find_segments(
    filtered_lexicon,
    window_size=500,
    threshold=100,
    merge_overlaps=True,
)

In [None]:
# Sanity check: display at most the first 100 segments rather than all of them.
show_segments(segments[:100])

# Analysis

In [None]:
# Baseline population of faces: rows flagged as in-commercial are removed, and
# only rows with size_percentile >= 25 are kept.
# NOTE(review): presumably this drops the smallest quarter of faces -- confirm
# how size_percentile is defined in esper.spark_util.
all_faces = get_face_genders()
non_commercial_faces = all_faces.where(all_faces.in_commercial == False)
face_genders = non_commercial_faces.where(non_commercial_faces.size_percentile >= 25)

# Group the topic intervals by video; each segment is a 5-tuple whose first
# field is the video id and third field is the interval.
intervals_by_video = defaultdict(list)
for video_id, _, interval, _, _ in segments:
    intervals_by_video[video_id].append(interval)

# Annotate each face with its overlap against the topic intervals, then keep
# only the faces that overlap the topic at all.
topic_overlap_annotated = annotate_interval_overlap(face_genders, intervals_by_video)
face_genders_with_topic_overlap = topic_overlap_annotated.where(
    topic_overlap_annotated.overlap_seconds > 0
)

## Compare screen time across the entire dataset

In [None]:
# Lookup from gender name (the cells below use 'F' and 'M') to its database id.
gender_map = {g.name : g.id for g in Gender.objects.all()}
distinct_columns = ['face_id']
overlap_field = 'overlap_seconds'
probability_field = 'probability'
# 1.96 is the two-sided 95% quantile of the standard normal distribution.
z_score = 1.96
# Faces with host_probability at or below this value are counted as non-hosts.
# NOTE(review): the per-show "Excluding hosts" cell uses 0.25 for the same
# purpose; confirm which threshold is intended before comparing the numbers.
HOST_PROBABILITY_CUTOFF = 0.5


def _topic_screentime(gender_name, exclude_hosts=False):
    """Sum topic-overlap seconds over distinct faces of one gender.

    Args:
        gender_name: key into ``gender_map`` (e.g. 'F' or 'M').
        exclude_hosts: when True, keep only faces whose host_probability is
            <= HOST_PROBABILITY_CUTOFF.

    Returns:
        Whatever ``sum_distinct_over_column`` returns; this cell reads index 0
        as total seconds and index 1 as the value whose square root scales the
        error bar (presumably a variance -- confirm in esper.spark_util).
    """
    faces = face_genders_with_topic_overlap
    condition = faces.gender_id == gender_map[gender_name]
    if exclude_hosts:
        condition = condition & (faces.host_probability <= HOST_PROBABILITY_CUTOFF)
    return sum_distinct_over_column(
        faces.where(condition),
        overlap_field,
        distinct_columns,
        probability_column=probability_field
    )


def _print_screentime_hours(template, screentime):
    """Print total hours and the z-scaled error term (also in hours)."""
    hours = screentime[0] / 3600
    margin_hours = z_score * math.sqrt(screentime[1]) / 3600
    print(template.format(hours, margin_hours))


topic_screentime_with_woman = _topic_screentime('F')
_print_screentime_hours(
    'Woman on screen: {:0.2f}h +/- {:0.02f}', topic_screentime_with_woman)

topic_screentime_with_man = _topic_screentime('M')
_print_screentime_hours(
    'Man on screen: {:0.2f}h +/- {:0.02f}', topic_screentime_with_man)

topic_screentime_with_nh_woman = _topic_screentime('F', exclude_hosts=True)
_print_screentime_hours(
    'Woman (non-host) on screen: {:0.2f}h +/- {:0.02f}', topic_screentime_with_nh_woman)

topic_screentime_with_nh_man = _topic_screentime('M', exclude_hosts=True)
_print_screentime_hours(
    'Man (non-host) on screen: {:0.2f}h  +/- {:0.02f}', topic_screentime_with_nh_man)

## Compare screen time by show

In [None]:
# Lookup from canonical show id to show name. The variable name is misspelled
# ("canoncal") but is kept as-is because the per-show cells below reference it.
canoncal_show_map = {show.id: show.name for show in CanonicalShow.objects.all()}

# Shared arguments for the per-show sum_distinct_over_column calls below.
distinct_columns = ['face_id']             # count each face once
group_by_columns = ['canonical_show_id']   # one total per canonical show
overlap_field = 'overlap_seconds'          # column summed for topic screen time
probability_field = 'probability'          # passed as probability_column

### Including hosts

In [None]:
# NOTE(review): this cache lives at a fixed path under /tmp and is read with
# pickle. Unpickling a file another user could have written can execute
# arbitrary code, and the cache is not invalidated when the face_genders
# filters change -- delete the file to force a recompute.
CACHE_BASELINE_INCL_HOST_FILE = '/tmp/base_screentime_gender_incl_host_by_show.pkl'


def _screentime_by_show(faces, gender_name, seconds_column):
    """Per-show screen time for one gender, restricted to the major shows.

    Args:
        faces: face/gender DataFrame to aggregate.
        gender_name: key into ``gender_map`` (e.g. 'F' or 'M').
        seconds_column: name of the column to sum ('duration' for the
            baseline, ``overlap_field`` for the topic).

    Returns:
        dict mapping show name -> (timedelta of summed seconds, second element
        of the aggregate unchanged), only for shows in MAJOR_CANONICAL_SHOWS.
    """
    sums_by_show = sum_distinct_over_column(
        faces.where(faces.gender_id == gender_map[gender_name]),
        seconds_column, distinct_columns, group_by_columns,
        probability_column=probability_field
    )
    return {
        canoncal_show_map[key[0]]: (timedelta(seconds=value[0]), value[1])
        for key, value in sums_by_show.items()
        if canoncal_show_map[key[0]] in MAJOR_CANONICAL_SHOWS
    }


try:
    with open(CACHE_BASELINE_INCL_HOST_FILE, 'rb') as f:
        base_screentime_with_man_by_show, base_screentime_with_woman_by_show = pickle.load(f)
    print('[Base] loaded from cache')
except Exception as cache_error:
    # Any ordinary failure (missing, truncated or incompatible cache) falls
    # back to recomputing. Unlike a bare `except:`, this lets
    # KeyboardInterrupt and SystemExit propagate instead of silently starting
    # the expensive recompute.
    print('[Base] cache unavailable ({!r}); recomputing'.format(cache_error))
    base_screentime_with_woman_by_show = _screentime_by_show(face_genders, 'F', 'duration')
    print('[Base] Woman on screen: done')
    base_screentime_with_man_by_show = _screentime_by_show(face_genders, 'M', 'duration')
    print('[Base] Man on screen: done')

    # Stored as [man, woman] to match the unpacking order in the load above.
    with open(CACHE_BASELINE_INCL_HOST_FILE, 'wb') as f:
        pickle.dump([base_screentime_with_man_by_show, base_screentime_with_woman_by_show], f)

topic_screentime_with_woman_by_show = _screentime_by_show(
    face_genders_with_topic_overlap, 'F', overlap_field)
print('[Topic] Woman on screen: done')
topic_screentime_with_man_by_show = _screentime_by_show(
    face_genders_with_topic_overlap, 'M', overlap_field)
print('[Topic] Man on screen: done')

In [None]:
# Per-show male/female split for the topic (hosts included), plotted with the
# all-content baseline as the secondary series.
incl_host_series_names = ['Male (incl-host)', 'Female (incl-host)']
incl_host_topic_data = [
    topic_screentime_with_man_by_show,
    topic_screentime_with_woman_by_show,
]
incl_host_baseline_names = ['Baseline Male (incl-host)', 'Baseline Female (incl-host)']
incl_host_baseline_data = [
    base_screentime_with_man_by_show,
    base_screentime_with_woman_by_show,
]
incl_host_plot_title = 'Proportion of gendered screen time by show for topic "{}"'.format(topic)

plot_binary_screentime_proportion_comparison(
    incl_host_series_names,
    incl_host_topic_data,
    incl_host_plot_title,
    'Show name',
    'Proportion of screen time',
    secondary_series_names=incl_host_baseline_names,
    secondary_data=incl_host_baseline_data
)

### Excluding hosts

In [None]:
# NOTE(review): this cache lives at a fixed path under /tmp and is read with
# pickle. Unpickling a file another user could have written can execute
# arbitrary code, and the cache is not invalidated when the face_genders
# filters or the host threshold change -- delete the file to force a recompute.
CACHE_BASELINE_NO_HOST_FILE = '/tmp/base_screentime_gender_no_host_by_show.pkl'

# Faces with host_probability at or below this value are counted as non-hosts.
# NOTE(review): the dataset-wide comparison cell uses 0.5 for the same purpose;
# confirm which threshold is intended before comparing the numbers.
NON_HOST_MAX_PROBABILITY = 0.25


def _non_host_screentime_by_show(faces, gender_name, seconds_column):
    """Per-show non-host screen time for one gender, major shows only.

    Args:
        faces: face/gender DataFrame to aggregate.
        gender_name: key into ``gender_map`` (e.g. 'F' or 'M').
        seconds_column: name of the column to sum ('duration' for the
            baseline, ``overlap_field`` for the topic).

    Returns:
        dict mapping show name -> (timedelta of summed seconds, second element
        of the aggregate unchanged), only for shows in MAJOR_CANONICAL_SHOWS.
    """
    sums_by_show = sum_distinct_over_column(
        faces.where(
            (faces.gender_id == gender_map[gender_name]) &
            (faces.host_probability <= NON_HOST_MAX_PROBABILITY)
        ),
        seconds_column, distinct_columns, group_by_columns,
        probability_column=probability_field
    )
    return {
        canoncal_show_map[key[0]]: (timedelta(seconds=value[0]), value[1])
        for key, value in sums_by_show.items()
        if canoncal_show_map[key[0]] in MAJOR_CANONICAL_SHOWS
    }


try:
    with open(CACHE_BASELINE_NO_HOST_FILE, 'rb') as f:
        base_screentime_with_nh_man_by_show, base_screentime_with_nh_woman_by_show = pickle.load(f)
    print('[Base] loaded from cache')
except Exception as cache_error:
    # Any ordinary failure (missing, truncated or incompatible cache) falls
    # back to recomputing. Unlike a bare `except:`, this lets
    # KeyboardInterrupt and SystemExit propagate instead of silently starting
    # the expensive recompute.
    print('[Base] cache unavailable ({!r}); recomputing'.format(cache_error))
    base_screentime_with_nh_woman_by_show = _non_host_screentime_by_show(
        face_genders, 'F', 'duration')
    print('[Base] Woman (non-host) on screen: done')
    base_screentime_with_nh_man_by_show = _non_host_screentime_by_show(
        face_genders, 'M', 'duration')
    print('[Base] Man (non-host) on screen: done')

    # Stored as [man, woman] to match the unpacking order in the load above.
    with open(CACHE_BASELINE_NO_HOST_FILE, 'wb') as f:
        pickle.dump([base_screentime_with_nh_man_by_show, base_screentime_with_nh_woman_by_show], f)

topic_screentime_with_nh_woman_by_show = _non_host_screentime_by_show(
    face_genders_with_topic_overlap, 'F', overlap_field)
print('[Topic] Woman (non-host) on screen: done')
topic_screentime_with_nh_man_by_show = _non_host_screentime_by_show(
    face_genders_with_topic_overlap, 'M', overlap_field)
print('[Topic] Man (non-host) on screen: done')

In [None]:
# Per-show male/female split for the topic with hosts excluded. The non-host
# baseline is the secondary series; the host-inclusive topic numbers from the
# previous section are overlaid as the tertiary series for comparison.
non_host_series_names = ['Male (non-host)', 'Female (non-host)']
non_host_topic_data = [
    topic_screentime_with_nh_man_by_show,
    topic_screentime_with_nh_woman_by_show,
]
non_host_baseline_names = ['Baseline Male (non-host)', 'Baseline Female (non-host)']
non_host_baseline_data = [
    base_screentime_with_nh_man_by_show,
    base_screentime_with_nh_woman_by_show,
]
with_host_series_names = ['Male (incl-host)', 'Female (incl-host)']
with_host_topic_data = [
    topic_screentime_with_man_by_show,
    topic_screentime_with_woman_by_show,
]
non_host_plot_title = 'Proportion of gendered screen time by show for topic "{}"'.format(topic)

plot_binary_screentime_proportion_comparison(
    non_host_series_names,
    non_host_topic_data,
    non_host_plot_title,
    'Show name',
    'Proportion of screen time',
    secondary_series_names=non_host_baseline_names,
    secondary_data=non_host_baseline_data,
    tertiary_series_names=with_host_series_names,
    tertiary_data=with_host_topic_data
)