<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Instructions" data-toc-modified-id="Instructions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Instructions</a></span></li></ul></div>

# Instructions

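Run all of the cells. You will be prompted to input a topic; the notebook then plots, for each major show, the proportion of male and female screen time during segments about that topic, alongside the non-host baseline for the show.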

In [None]:
from esper.prelude import *
from esper.widget import *
from esper.topics import *
from esper.spark_util import *
from esper.plot_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

from datetime import timedelta
from collections import defaultdict
import _pickle as pickle

# Face/gender rows shared by every computation below. Rows are kept only when
# they are outside commercials, at or above the 25th size percentile, and not
# labelled with the gender named 'U'.
# NOTE(review): `== False` is kept on purpose -- this looks like a dataframe
# column expression rather than a Python bool, so `not ...` would not be
# equivalent. Confirm against get_face_genders().
FACE_GENDERS = get_face_genders()
FACE_GENDERS = (
    FACE_GENDERS
    .where(FACE_GENDERS.in_commercial == False)
    .where(FACE_GENDERS.size_percentile >= 25)
    .where(FACE_GENDERS.gender_id != Gender.objects.get(name='U').id)
)

# Pickle cache for the baseline (all-topic) non-host screen time per show.
CACHE_BASELINE_NO_HOST_FILE = '/tmp/base_screentime_gender_no_host_by_show.pkl'
def get_base_screentime_by_show():
    """Return baseline non-host screen time by show as ``(nh_man, nh_woman)``.

    The result is read from ``CACHE_BASELINE_NO_HOST_FILE`` when that file can
    be loaded; otherwise it is recomputed from ``FACE_GENDERS`` (rows with
    ``host_probability <= 0.25``) and written back to the cache.

    Returns:
        A 2-tuple ``(nh_man, nh_woman)``. When recomputed, each element is a
        dict mapping canonical show name to ``(timedelta, v1)``, where the
        timedelta is built from the first value returned by
        ``sum_distinct_over_column`` and ``v1`` is its second value passed
        through unchanged (presumably an uncertainty term -- TODO confirm).
        Only shows in ``MAJOR_CANONICAL_SHOWS`` are included.
    """
    # NOTE(security): unpickling executes arbitrary code and /tmp is usually
    # world-writable; only use this cache on a trusted, single-user machine.
    try:
        with open(CACHE_BASELINE_NO_HOST_FILE, 'rb') as f:
            # Unpacking validates the cache shape; a malformed cache raises
            # and falls through to recomputation below.
            nh_man, nh_woman = pickle.load(f)
        return nh_man, nh_woman
    except Exception:
        # Best-effort cache: any load failure means "recompute". Unpickling
        # can raise many exception types, so Exception is caught rather than a
        # short list, but KeyboardInterrupt/SystemExit now propagate.
        print('Could not load baseline gender by show from cache.')

    # Same grouping as run_analysis uses. These were previously undefined in
    # this scope, so every cache miss raised NameError.
    distinct_columns = []
    group_by_columns = ['canonical_show_id']

    # Non-host rows only; built once and reused for both genders.
    non_host_faces = FACE_GENDERS.where(FACE_GENDERS.host_probability <= 0.25)

    def screentime_by_show(probability_column):
        # Aggregate 'duration' per canonical show, weighted by the given
        # probability column, keeping only the major shows.
        totals = sum_distinct_over_column(
            non_host_faces,
            'duration', distinct_columns, group_by_columns,
            probability_column=probability_column
        )
        by_show = {}
        for key, value in totals.items():
            show_name = CANONICAL_SHOW_MAP[key[0]]
            if show_name in MAJOR_CANONICAL_SHOWS:
                by_show[show_name] = (timedelta(seconds=value[0]), value[1])
        return by_show

    nh_woman = screentime_by_show('female_probability')
    nh_man = screentime_by_show('male_probability')

    with open(CACHE_BASELINE_NO_HOST_FILE, 'wb') as f:
        pickle.dump([nh_man, nh_woman], f)
    return nh_man, nh_woman

# Lookup tables are defined BEFORE the baseline is computed:
# get_base_screentime_by_show() reads CANONICAL_SHOW_MAP when the cache file
# is missing, so calling it first raised NameError in a fresh kernel.

# Canonical show id -> canonical show name.
CANONICAL_SHOW_MAP = { c.id : c.name for c in CanonicalShow.objects.all() }

# Channel name -> colour name used for that channel in the plot.
CHANNEL_NAME_CMAP = { 
    'CNN': 'DarkBlue', 
    'FOXNEWS': 'DarkRed', 
    'MSNBC': 'DarkGreen'
}

# Canonical show name -> colour of the channel of one video of that show.
# NOTE(review): a channel name outside CHANNEL_NAME_CMAP raises KeyError here.
CANONICAL_SHOW_CMAP = { 
    v['show__canonical_show__name'] : CHANNEL_NAME_CMAP[v['channel__name']]
    for v in Video.objects.distinct(
        'show__canonical_show'
    ).values('show__canonical_show__name', 'channel__name')
}

# Baseline (all-topic) non-host screen time per show, loaded from cache or
# recomputed; used as the secondary series in run_analysis.
BASE_SCREENTIME_NH_MAN_BY_SHOW, BASE_SCREENTIME_NH_WOMAN_BY_SHOW = \
    get_base_screentime_by_show()

def _topic_screentime_by_show(face_genders, value_column, probability_column,
                              distinct_columns, group_by_columns):
    """Aggregate ``value_column`` per canonical show for the major shows.

    Calls ``sum_distinct_over_column`` on ``face_genders`` and converts each
    ``(key, value)`` result into ``show name -> (timedelta, value[1])``, where
    the show name is looked up from ``key[0]`` via ``CANONICAL_SHOW_MAP`` and
    the timedelta is built from ``value[0]`` seconds. Shows that are not in
    ``MAJOR_CANONICAL_SHOWS`` are dropped.
    """
    totals = sum_distinct_over_column(
        face_genders,
        value_column, distinct_columns, group_by_columns,
        probability_column=probability_column
    )
    screentime_by_show = {}
    for key, value in totals.items():
        show_name = CANONICAL_SHOW_MAP[key[0]]
        if show_name in MAJOR_CANONICAL_SHOWS:
            screentime_by_show[show_name] = (
                timedelta(seconds=value[0]), value[1])
    return screentime_by_show


def run_analysis(topic):
    """Plot gendered screen-time proportions by show for one topic.

    Builds a lexicon for ``topic``, finds matching segments, keeps the
    ``FACE_GENDERS`` rows that overlap those segments, and aggregates the
    overlapping seconds per canonical show for women and men, both including
    and excluding hosts (``host_probability <= 0.25``). The non-host series is
    plotted against the module-level baseline series.

    Args:
        topic: Topic string passed to ``mutual_info`` and shown in the title.

    Returns:
        None. Output is the plot plus progress messages on stdout.
    """
    print('Building the topic lexicon')
    lexicon = mutual_info(topic)
    print('Searching for segments')
    segments = find_segments(lexicon, window_size=500, 
                             threshold=100, merge_overlaps=True)

    # Group segment intervals by video. Each segment is a 5-tuple whose first
    # field is the video id and whose third field is the interval.
    intervals_by_video = defaultdict(list)
    for video_id, _, interval, _, _ in segments:
        intervals_by_video[video_id].append(interval)

    # Keep only rows that actually overlap a topic segment.
    face_genders_with_topic_overlap = annotate_interval_overlap(
        FACE_GENDERS, intervals_by_video)
    face_genders_with_topic_overlap = face_genders_with_topic_overlap.where(
        face_genders_with_topic_overlap.overlap_seconds > 0)
    
    distinct_columns = []
    group_by_columns = ['canonical_show_id']
    overlap_field = 'overlap_seconds'

    def screentime(face_genders, probability_column):
        # Thin wrapper that fixes the aggregation settings shared by all four
        # series computed below.
        return _topic_screentime_by_show(
            face_genders, overlap_field, probability_column,
            distinct_columns, group_by_columns)
    
    print('Computing screen times with gender')
    topic_screentime_with_woman_by_show = screentime(
        face_genders_with_topic_overlap, 'female_probability')
    print('[Topic] Woman on screen: done')
    topic_screentime_with_man_by_show = screentime(
        face_genders_with_topic_overlap, 'male_probability')
    print('[Topic] Man on screen: done')

    # Non-host subset, built once and reused for both genders.
    non_host_face_genders = face_genders_with_topic_overlap.where(
        face_genders_with_topic_overlap.host_probability <= 0.25)
    topic_screentime_with_nh_woman_by_show = screentime(
        non_host_face_genders, 'female_probability')
    print('[Topic] Woman (non-host) on screen: done')
    topic_screentime_with_nh_man_by_show = screentime(
        non_host_face_genders, 'male_probability')
    print('[Topic] Man (non-host) on screen: done')
    
    plot_binary_screentime_proportion_comparison(
        ['Male (non-host)', 'Female (non-host)'], 
        [
            topic_screentime_with_nh_man_by_show, 
            topic_screentime_with_nh_woman_by_show
        ],
        'Proportion of gendered screen time by show for topic "{}"'.format(topic),
        'Show name',
        'Proportion of screen time',
        secondary_series_names=['Baseline Male (non-host)', 'Baseline Female (non-host)'],
        secondary_data=[BASE_SCREENTIME_NH_MAN_BY_SHOW, 
                        BASE_SCREENTIME_NH_WOMAN_BY_SHOW],
        tertiary_series_names=['Male (incl-host)', 'Female (incl-host)'],
        tertiary_data=[topic_screentime_with_man_by_show, 
                       topic_screentime_with_woman_by_show],
        category_color_map=CANONICAL_SHOW_CMAP
    )
    # Legend for the x-axis label colours (one colour per channel).
    print('X-axis color map: {}'.format(', '.join('{}: {}'.format(x, y) 
                                                  for x, y in CHANNEL_NAME_CMAP.items())))

In [None]:
# Prompt for a topic, strip surrounding whitespace, and run the full analysis.
run_analysis(input('Input a topic: ').strip())