<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Choose-a-List-of-Topics" data-toc-modified-id="Choose-a-List-of-Topics-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Choose a List of Topics</a></span></li><li><span><a href="#Analysis" data-toc-modified-id="Analysis-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Analysis</a></span><ul class="toc-item"><li><span><a href="#Compare-mention-count-by-show" data-toc-modified-id="Compare-mention-count-by-show-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Compare mention count by show</a></span></li></ul></li><li><span><a href="#Compare-mentions-by-topic-by-show" data-toc-modified-id="Compare-mentions-by-topic-by-show-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Compare mentions by topic by show</a></span></li><li><span><a href="#Compare-screentime-by-topic-on-all-shows" data-toc-modified-id="Compare-screentime-by-topic-on-all-shows-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Compare screentime by topic on all shows</a></span></li></ul></div>

In [None]:
from esper.prelude import *
from esper.widget import *
from esper.topics import *
from esper.spark_util import *
from esper.plot_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

import os
from datetime import timedelta
from collections import defaultdict
import _pickle as pickle

# Choose a List of Topics

In [None]:
# Keyword phrases to look up in the captions, one phrase per line.
# Blank lines are ignored. Each phrase must appear only once: every entry in
# `topics` triggers its own search and its own aggregation in later cells.
topic_lines = """
harassment
abortion
breast cancer
cervical cancer
mom
children
back to school
birth control
contraceptive
bullying
weight loss
nutrition
women
pregnancy
teens
sexism
stem
prostitution
parenting
feminism
cooking
fashion
terrorism
isis
dad
circumcision
business
men's health
erectile dysfunction
prostate cancer
bombing
evangelicals
national security
nuclear
congress
violence
opioids
hiv
aids
medicine
drugs
obamacare
affordable care act
influenza
autism
cholesterol
obesity
lgbt
sex education
racism
diabetes
marriage
millennial
murder
marijuana
massage
trumpcare
american health care act
cancer
islam
christianity
immigration
hurricane
divorce
cryptocurrency
social security
"""
# Going through a set guards against a phrase being pasted in twice; sorting
# keeps the display order (and any order-dependent output) stable.
topics = sorted({line.strip() for line in topic_lines.splitlines() if line.strip()})
topics

In [None]:
def get_segments(phrase):
    """Run a caption search for a single phrase.

    The phrase is upper-cased and submitted to `caption_search` as a
    one-element list; the first element of what comes back is returned.
    A progress line naming the phrase is printed first.
    """
    print('Searching: {}'.format(phrase))
    query = [phrase.upper()]
    matches = caption_search(query)
    return matches[0]

# Caption-search result for every topic, keyed by the topic phrase.
topic_to_segments = {}
for topic_phrase in topics:
    topic_to_segments[topic_phrase] = get_segments(topic_phrase)

# Analysis

In [None]:
# Start from every face-gender row, then keep only rows that are not in a
# commercial, sit at or above the 25th size percentile, and whose gender is
# not the Gender record named 'U'.
face_genders = get_face_genders()
not_in_commercial = face_genders.in_commercial == False
large_enough = face_genders.size_percentile >= 25
excluded_gender_id = Gender.objects.get(name='U').id
gender_is_kept = face_genders.gender_id != excluded_gender_id
face_genders = face_genders.where(not_in_commercial & large_enough & gender_is_kept)

In [None]:
def segments_to_overlapping_face_genders(topic, segments):
    """Select the face-gender rows that overlap a topic's caption segments.

    Args:
        topic: Topic phrase; only used in the progress message.
        segments: Mapping of video id -> iterable of intervals. Each interval
            is indexed as interval[0] / interval[1]; intervals whose first
            bound is greater than the second are discarded.

    Returns:
        The output of `annotate_interval_overlap` applied to the notebook-level
        `face_genders`, restricted to rows where `overlap_seconds > 0`.
    """
    print('Computing overlap for: {}'.format(topic))
    intervals_by_video = defaultdict(list)
    for video_id, intervals in segments.items():
        well_formed = [iv for iv in intervals if not iv[0] > iv[1]]
        # Only touch the defaultdict when something survived, so a video with
        # no usable interval gets no entry at all.
        if well_formed:
            intervals_by_video[video_id].extend(well_formed)
    annotated = annotate_interval_overlap(face_genders, intervals_by_video)
    has_overlap = annotated.overlap_seconds > 0
    return annotated.where(has_overlap)

# Overlapping face-gender rows for every topic, keyed by the topic phrase.
topic_to_face_genders = {}
for topic_phrase, topic_segments in topic_to_segments.items():
    topic_to_face_genders[topic_phrase] = segments_to_overlapping_face_genders(
        topic_phrase, topic_segments)

## Compare mention count by show

In [None]:
# Lookup from canonical show id to canonical show name.
canoncal_show_map = {}
for canonical_show in CanonicalShow.objects.all():
    canoncal_show_map[canonical_show.id] = canonical_show.name

# Column lists handed to the aggregation helpers in the cells below.
distinct_columns = ['face_id']
group_by_columns = ['canonical_show_id']

In [None]:
CACHE_BASELINE_NO_HOST_FILE = '/tmp/base_screentime_gender_no_host_by_show.pkl'


def _base_non_host_screentime_by_show(probability_column):
    """Sum non-host 'duration' per major canonical show.

    Args:
        probability_column: Column name forwarded to `sum_distinct_over_column`
            as `probability_column` (this cell passes 'female_probability' and
            'male_probability').

    Returns:
        dict of canonical show name -> (timedelta(seconds=v[0]), v[1]) for
        each aggregate value v, limited to shows in MAJOR_CANONICAL_SHOWS.
    """
    aggregates = sum_distinct_over_column(
        face_genders.where(face_genders.host_probability <= 0.25),
        'duration',
        # Pinned here instead of reading the notebook-level `distinct_columns`:
        # later cells rebind that name to [], so re-running this cell after
        # them would silently change the aggregation.
        ['face_id'],
        group_by_columns,
        probability_column=probability_column
    )
    return {
        canoncal_show_map[k[0]]: (timedelta(seconds=v[0]), v[1])
        for k, v in aggregates.items()
        if canoncal_show_map[k[0]] in MAJOR_CANONICAL_SHOWS
    }


try:
    # NOTE(review): pickle.load can execute arbitrary code from the file and
    # this cache lives under /tmp; only safe while that file is trusted.
    with open(CACHE_BASELINE_NO_HOST_FILE, 'rb') as f:
        base_screentime_with_nh_man_by_show, base_screentime_with_nh_woman_by_show = pickle.load(f)
    print('[Base] loaded from cache')
except Exception as cache_error:
    # The cache is best-effort: a missing or unreadable file falls back to
    # recomputing. Catching Exception (not a bare except) lets
    # KeyboardInterrupt / SystemExit propagate, and the reason is reported.
    print('[Base] cache not used ({!r}); recomputing'.format(cache_error))

    base_screentime_with_nh_woman_by_show = _base_non_host_screentime_by_show(
        'female_probability')
    print('[Base] Woman (non-host) on screen: done')
    base_screentime_with_nh_man_by_show = _base_non_host_screentime_by_show(
        'male_probability')
    print('[Base] Man (non-host) on screen: done')

    # Stored as [man, woman] to match the unpacking order used when loading.
    with open(CACHE_BASELINE_NO_HOST_FILE, 'wb') as f:
        pickle.dump([base_screentime_with_nh_man_by_show, base_screentime_with_nh_woman_by_show], f)

In [None]:
# Rebinds the notebook-level `distinct_columns` to an empty list; the function
# below reads it at call time.
distinct_columns = []

def aggregate_face_genders_by_show(topic, face_genders_with_topic_overlap):
    """Count a topic's overlapping faces per major canonical show.

    Args:
        topic: Topic phrase; only used in the progress messages.
        face_genders_with_topic_overlap: Frame of face-gender rows for the
            topic; must expose `host_probability` and `where`.

    Returns:
        dict with keys 'woman', 'man', 'woman_nh', 'man_nh'. Each value maps a
        canonical show name (only names in MAJOR_CANONICAL_SHOWS) to whatever
        `count_distinct_over_column` produced for that show. The '_nh'
        variants are computed on rows with `host_probability <= 0.25`.
    """
    def count_by_show(frame, probability_column):
        # One grouped count, re-keyed from show id to show name and limited
        # to the major shows.
        grouped = count_distinct_over_column(
            frame,
            distinct_columns, group_by_columns=group_by_columns,
            probability_column=probability_column
        )
        by_show = {}
        for key, value in grouped.items():
            show_name = canoncal_show_map[key[0]]
            if show_name in MAJOR_CANONICAL_SHOWS:
                by_show[show_name] = value
        return by_show

    aggregates = {}

    aggregates['woman'] = count_by_show(
        face_genders_with_topic_overlap, 'female_probability')
    print('[{}] Woman on screen: done'.format(topic))

    aggregates['man'] = count_by_show(
        face_genders_with_topic_overlap, 'male_probability')
    print('[{}] Man on screen: done'.format(topic))

    non_host_faces = face_genders_with_topic_overlap.where(
        face_genders_with_topic_overlap.host_probability <= 0.25)

    aggregates['woman_nh'] = count_by_show(non_host_faces, 'female_probability')
    print('[{}] Woman (non-host) on screen: done'.format(topic))

    aggregates['man_nh'] = count_by_show(non_host_faces, 'male_probability')
    print('[{}] Man (non-host) on screen: done'.format(topic))

    return aggregates

# Per-show aggregates for every topic, keyed by the topic phrase.
topic_to_aggregates = {}
for topic_phrase, topic_faces in topic_to_face_genders.items():
    topic_to_aggregates[topic_phrase] = aggregate_face_genders_by_show(
        topic_phrase, topic_faces)

In [None]:
# One colour per channel; each canonical show takes the colour of the channel
# found on its Video row.
channel_name_cmap = {'CNN': 'DarkBlue', 'FOXNEWS': 'DarkRed', 'MSNBC': 'DarkGreen'}
show_channel_rows = Video.objects.distinct(
    'show__canonical_show'
).values('show__canonical_show__name', 'channel__name')
canonical_show_cmap = {}
for row in show_channel_rows:
    canonical_show_cmap[row['show__canonical_show__name']] = (
        channel_name_cmap[row['channel__name']])

# One chart per topic, in alphabetical topic order.
for t in sorted(topic_to_aggregates):
    agg = topic_to_aggregates[t]
    primary_names = ['Male (Excl. Hosts)', 'Female (Excl. Hosts)']
    primary_data = [agg['man_nh'], agg['woman_nh']]
    chart_title = 'Proportion of faces on screen during keyword mentions of "{}"'.format(t)
    plot_binary_proportion_comparison(
        primary_names,
        primary_data,
        chart_title,
        'Show name',
        'Proportion of keyword mentions',
        secondary_series_names=[
            'Baseline Male (Screen Time Excl. Hosts)',
            'Baseline Female (Screen Time Excl. Hosts)',
        ],
        secondary_data=[
            base_screentime_with_nh_man_by_show,
            base_screentime_with_nh_woman_by_show,
        ],
        tertiary_series_names=['Male (Incl. Hosts)', 'Female (Incl. Hosts)'],
        tertiary_data=[agg['man'], agg['woman']],
        category_color_map=canonical_show_cmap,
        raw_data_to_label_fn=lambda x: '{:d}'.format(int(x))
    )

# Compare mentions by topic by show

In [None]:
# Rebinds the notebook-level `distinct_columns` to an empty list; the function
# below reads it at call time.
distinct_columns = []
# 1.96 is the two-sided 95% quantile of the standard normal distribution; it
# scales the +/- margin printed below.
z_score = 1.96

def aggregate_face_genders_by_topic(topic, face_genders_with_topic_overlap,
                                    max_host_probability=0.5):
    """Count a topic's non-host faces across all shows, split by gender.

    Args:
        topic: Topic phrase (currently unused by the body).
        face_genders_with_topic_overlap: Frame of face-gender rows for the
            topic; must expose `host_probability` and `where`.
        max_host_probability: Rows with `host_probability` above this value
            are excluded. Defaults to 0.5, the value this function has always
            used.
            NOTE(review): every other "Excl. Hosts" series in this notebook
            filters at 0.25, and this function's output is plotted against
            them as the per-topic baseline. Confirm which threshold is
            intended and pass 0.25 here if they are meant to match.

    Returns:
        dict with keys 'man_nh' and 'woman_nh'; each value is the 2-tuple of
        the first two elements returned by `count_distinct_over_column`.
    """
    # Imported locally so the function does not rely on a star-import
    # happening to provide `math`.
    import math

    # Build the non-host filter once and reuse it for both genders.
    non_host_faces = face_genders_with_topic_overlap.where(
        face_genders_with_topic_overlap.host_probability <= max_host_probability
    )

    topic_mentions_with_nh_woman = count_distinct_over_column(
        non_host_faces,
        distinct_columns,
        probability_column='female_probability'
    )
    # The second element is square-rooted for the margin, i.e. it is treated
    # as a variance (presumably what the helper returns; verify).
    print('Woman (non-host) on screen: {:0.2f} +/- {:0.02f}'.format(
        topic_mentions_with_nh_woman[0],
        z_score * math.sqrt(topic_mentions_with_nh_woman[1])))

    topic_mentions_with_nh_man = count_distinct_over_column(
        non_host_faces,
        distinct_columns,
        probability_column='male_probability'
    )
    print('Man (non-host) on screen: {:0.2f}  +/- {:0.02f}'.format(
        topic_mentions_with_nh_man[0],
        z_score * math.sqrt(topic_mentions_with_nh_man[1])))

    return {
        'man_nh': (
            topic_mentions_with_nh_man[0],
            topic_mentions_with_nh_man[1]
        ),
        'woman_nh': (
            topic_mentions_with_nh_woman[0],
            topic_mentions_with_nh_woman[1]
        )
    }

# All-show non-host aggregates for every topic, keyed by the topic phrase.
topic_to_base_aggregates = {}
for topic_phrase, topic_faces in topic_to_face_genders.items():
    topic_to_base_aggregates[topic_phrase] = aggregate_face_genders_by_topic(
        topic_phrase, topic_faces)

# Split the aggregates into one dict per gender for use as plot baselines.
base_topic_man_nh = {}
base_topic_woman_nh = {}
for topic_phrase, aggregate in topic_to_base_aggregates.items():
    base_topic_man_nh[topic_phrase] = aggregate['man_nh']
    base_topic_woman_nh[topic_phrase] = aggregate['woman_nh']

In [None]:
MIN_TOPIC_MENTIONS = 0


def _topic_counts_for_show(series_key, show_name):
    """Return {topic: per-show value} for one series, using (0, 0) when the show is absent."""
    return {
        topic_phrase: per_show[series_key].get(show_name, (0, 0))
        for topic_phrase, per_show in topic_to_aggregates.items()
    }


# One chart per major show, in alphabetical show order.
for show in sorted(MAJOR_CANONICAL_SHOWS):
    topic_man = _topic_counts_for_show('man', show)
    topic_woman = _topic_counts_for_show('woman', show)
    topic_man_nh = _topic_counts_for_show('man_nh', show)
    topic_woman_nh = _topic_counts_for_show('woman_nh', show)

    # Only the male series is thresholded; the female series is passed whole.
    frequent_topic_man_nh = {}
    for topic_phrase, counts in topic_man_nh.items():
        if counts[0] >= MIN_TOPIC_MENTIONS:
            frequent_topic_man_nh[topic_phrase] = counts

    plot_binary_proportion_comparison(
        ['Male (Excl. Hosts)', 'Female (Excl. Hosts)'],
        [frequent_topic_man_nh, topic_woman_nh],
        'Proportion of faces on screen during keyword mentions on "{}"'.format(show),
        'Topic name',
        'Proportion of keyword mentions',
        secondary_series_names=[
            'Baseline Male (Excl. Hosts) for Topic',
            'Baseline Female (Excl. Hosts) for Topic',
        ],
        secondary_data=[base_topic_man_nh, base_topic_woman_nh],
        tertiary_series_names=['Male (Incl. Hosts)', 'Female (Incl. Hosts)'],
        tertiary_data=[topic_man, topic_woman],
        raw_data_to_label_fn=lambda x: '{:d}'.format(int(x))
    )

# Compare screentime by topic on all shows

In [None]:
# All-show mention counts per topic, one dict per gender. A topic whose
# counting raises is reported and left out of both dicts.
topic_to_male_mentions = {}
topic_to_female_mentions = {}
for topic in topics:
    print(topic)
    distinct_columns = []
    try:
        topic_faces = topic_to_face_genders[topic]
        female_mentions = count_distinct_over_column(
            topic_faces, distinct_columns,
            probability_column='female_probability')
        male_mentions = count_distinct_over_column(
            topic_faces, distinct_columns,
            probability_column='male_probability')
    except Exception as e:
        print(e)
    else:
        # Stored only once both counts exist, so the two dicts share keys.
        topic_to_female_mentions[topic] = female_mentions
        topic_to_male_mentions[topic] = male_mentions

In [None]:
# Show both per-topic mention dicts as this cell's output for inspection.
topic_to_male_mentions, topic_to_female_mentions

In [None]:
BASELINE_MALE_PROPORTION = 0.6
MIN_TOPIC_MENTIONS = 50

# Only the male series is thresholded on its first element; the female series
# is passed whole.
frequent_male_mentions = {}
for keyword, mentions in topic_to_male_mentions.items():
    if mentions[0] >= MIN_TOPIC_MENTIONS:
        frequent_male_mentions[keyword] = mentions

# Male baseline and its complement for the female baseline.
baseline_proportions = [BASELINE_MALE_PROPORTION, 1 - BASELINE_MALE_PROPORTION]

plot_binary_proportion_comparison(
    ['Male', 'Female'],
    [frequent_male_mentions, topic_to_female_mentions],
    'Proportion of faces shown on screen during keyword mentions',
    'Keyword',
    'Proportion of faces',
    raw_data_to_label_fn=lambda x: '{:d}'.format(int(x)),
    baseline_series_names=['Baseline Male (Screentime)', 'Baseline Female (Screentime)'],
    baseline_data=baseline_proportions
)