<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Choose-a-List-of-Topics" data-toc-modified-id="Choose-a-List-of-Topics-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Choose a List of Topics</a></span></li><li><span><a href="#Analysis" data-toc-modified-id="Analysis-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Analysis</a></span><ul class="toc-item"><li><span><a href="#Compare-screen-time-by-show" data-toc-modified-id="Compare-screen-time-by-show-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Compare screen time by show</a></span></li><li><span><a href="#Annotate-Color-By-Opinion/Talk-vs.-News-Show" data-toc-modified-id="Annotate-Color-By-Opinion/Talk-vs.-News-Show-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Annotate Color By Opinion/Talk vs. News Show</a></span></li></ul></li><li><span><a href="#Compare-screentime-by-topic-by-show" data-toc-modified-id="Compare-screentime-by-topic-by-show-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Compare screentime by topic by show</a></span></li><li><span><a href="#Compare-screentime-by-topic-on-all-shows" data-toc-modified-id="Compare-screentime-by-topic-on-all-shows-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Compare screentime by topic on all shows</a></span></li></ul></div>

In [None]:
from esper.prelude import *
from esper.widget import *
from esper.topics import *
from esper.spark_util import *
from esper.plot_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

# Explicit stdlib imports stay after the wildcard imports so that these names
# cannot be shadowed by anything the esper modules happen to export.
import math
import os
from datetime import timedelta
from collections import defaultdict
import _pickle as pickle

# Choose a List of Topics

In [None]:
# Topic phrases to analyse. Each string is passed verbatim to the lexicon
# builder and is also used as the stem of the per-topic cache file names, so
# spelling matters: 'opioid' and 'republican' were previously misspelled
# ('opiod', 'republcian'). Cache files written under the old spellings are
# simply no longer read.
topics = [
    'terrorism', 'isis', 'syria', 'refugee',
    'collusion', 'russia',
    'shooting', 'black lives',
    'san bernardino', 'pulse nightclub', 'vegas shooter', 'charleston church',
    'charlie hebdo', 'paris attacks',
    'trayvon martin', 'walter scott',
    'immigration', 'travel ban', 'border wall',
    'roy moore', 'harassment', 'email scandal', 'billy bush',
    'global warming', 'paris climate',
    'autism',
    'planned parenthood', 'abortion',
    'gay marriage', 'lgbt',
    'fashion', 'wedding',
    'facebook',
    'irs', 'taxes',
    'school', 'preschool',
    'nutrition', 'healthcare',
    'yoga', 'asthma', 'flu',
    'public transportation',
    'travel',
    'vacation',
    'wall street', 'economy', 'trade',
    'national security', 'north korea',
    'guns', 'education',
    'supreme court', 'social security',
    'racism', 'afghanistan', 'iraq',
    'england', 'europe', 'france',
    'football', 'soccer', 'fifa',
    'asia', 'africa', 'brazil',
    'mexico', 'canada', 'violence',
    'internet', 'technology', 'baseball',
    'olympics', 'iran', 'crime',
    'food', 'disease', 'cancer',
    'drugs', 'college', 'police',
    'oscars', 'rifle', 'apple',
    'blockchain', 'congress',
    'president', 'america', 'veteran',
    'music', 'film', 'dance',
    'book', 'ebola', 'census', 'cars',
    'import', 'export', 'christmas',
    'july 4th', 'india', 'wine', 'wildfire',
    'earthquake', 'flood', 'hurricane', 'jobs',
    'luxury goods', 'golf', 'doping',
    'ipo', 'bankruptcy', 'literature',
    'millennials', 'fiction', 'greece',
    'italy', 'liberals', 'conservatives',
    'catholic', 'evangelical', 'nafta',
    'opioid', 'cats', 'dogs', 'wildlife',
    'pets', 'democrat', 'republican',
    'media', 'death penalty', 'meme',
    'twitter', 'nasa', 'sports', 'gym', 'medicine',
    'affair', 'banks', 'agriculture', 'coal', 'oil',
    'renewable energy', 'google', 'gaming', 'artificial intelligence',
    'spy', 'wiretap', 'obamacare', 'obesity',
    'cyberbullying', 'netflix', 'basketball',
    'native american', 'african american', 'hispanic',
    'tennis', 'inequality', 'foreign policy', 'charity',
    'marijuana', 'childcare', 'insurance', 'vaccine',
    'christianity', 'constitution', 'slavery', 'confederate',
    'poverty', 'homeless', 'prison', 'patents', 'teens',
    'elderly', 'stock market', 'consumers',
    'cybersecurity', 'illegal drugs', 'prescription drugs',
    'nato', 'freedom of speech', 'freedom of information', 'patriot act',
    'due process', 'flint water crisis', 'government shutdown'
]

In [None]:
# Directory holding one pickled lexicon per topic; created if it is missing.
LEXICON_CACHE_PATH = '/tmp/topic_lexicons'
os.makedirs(LEXICON_CACHE_PATH, exist_ok=True)

def get_lexicon(topic):
    """Return the lexicon for `topic`, computing and caching it on a cache miss.

    The lexicon is read from `<LEXICON_CACHE_PATH>/<topic>.pkl` when that file
    can be opened and unpickled. Otherwise it is computed with
    `mutual_info(topic)`, pickled to that same path, and returned.

    Args:
        topic: Topic string; used verbatim as the cache file stem and as the
            argument to `mutual_info`.

    Returns:
        The previously pickled value, or the result of `mutual_info(topic)`.
    """
    cache_path = os.path.join(LEXICON_CACHE_PATH, '{}.pkl'.format(topic))
    try:
        # NOTE(review): unpickling executes arbitrary code and this cache
        # lives under /tmp; only run this where /tmp is trusted.
        with open(cache_path, 'rb') as f:
            print('Loading {} lexicon from cache'.format(topic))
            return pickle.load(f)
    except Exception:
        # Missing or unreadable cache -> recompute. `Exception` (not a bare
        # `except:`) so that KeyboardInterrupt/SystemExit still stop the cell
        # instead of silently triggering an expensive recomputation.
        print('Could not load {} lexicon from cache'.format(topic))
    lexicon = mutual_info(topic)
    with open(cache_path, 'wb') as f:
        print('Caching lexicon: {}'.format(cache_path))
        pickle.dump(lexicon, f)
    return lexicon

# Lexicon for every topic, keyed by topic string (cached ones load from disk).
topic_to_lexicon = dict((t, get_lexicon(t)) for t in topics)

In [None]:
# Directory holding one pickled segment list per topic; created if missing.
SEGMENT_CACHE_PATH = '/tmp/topic_segments'
os.makedirs(SEGMENT_CACHE_PATH, exist_ok=True)
    
def get_segments(topic, lexicon):
    """Return the segments for `topic`, computing and caching them on a miss.

    Segments are read from `<SEGMENT_CACHE_PATH>/<topic>.pkl` when that file
    can be opened and unpickled. Otherwise they are computed with
    `find_segments(lexicon, window_size=200, threshold=50,
    merge_overlaps=True)`, pickled to that same path, and returned.

    Args:
        topic: Topic string; only used for the cache file stem and log lines.
        lexicon: Lexicon passed straight through to `find_segments`.

    Returns:
        The previously pickled value, or the result of `find_segments`.
    """
    cache_path = os.path.join(SEGMENT_CACHE_PATH, '{}.pkl'.format(topic))
    try:
        # NOTE(review): unpickling executes arbitrary code and this cache
        # lives under /tmp; only run this where /tmp is trusted.
        with open(cache_path, 'rb') as f:
            print('Loading {} segments from cache'.format(topic))
            return pickle.load(f)
    except Exception:
        # Missing or unreadable cache -> recompute. `Exception` (not a bare
        # `except:`) so that KeyboardInterrupt/SystemExit still stop the cell.
        print('Could not load {} segments from cache'.format(topic))
    segments = find_segments(lexicon, window_size=200, threshold=50,
                             merge_overlaps=True)
    with open(cache_path, 'wb') as f:
        print('Caching segments: {}'.format(cache_path))
        pickle.dump(segments, f)
    return segments

# Segments for every topic, keyed by topic string.
topic_to_segments = dict(
    (t, get_segments(t, lex)) for t, lex in topic_to_lexicon.items()
)

# Analysis

In [None]:
# Keep only faces that are outside commercials, at or above the 25th size
# percentile, and whose gender is not the 'U' gender record.
all_face_genders = get_face_genders()
# `== False` is intentional: these look like dataframe column expressions
# (they are combined with `&` and passed to `.where`), not Python booleans.
outside_commercial = all_face_genders.in_commercial == False
large_enough = all_face_genders.size_percentile >= 25
unknown_gender_id = Gender.objects.get(name='U').id
has_known_gender = all_face_genders.gender_id != unknown_gender_id
face_genders = all_face_genders.where(
    outside_commercial & large_enough & has_known_gender
)

In [None]:
def segments_to_overlapping_face_genders(topic, segments):
    """Return the rows of `face_genders` that overlap the topic's segments.

    Args:
        topic: Topic string; only used in the progress message.
        segments: Iterable of 5-tuples whose first element is a video id and
            whose third element is an interval; the other fields are ignored.

    Returns:
        The result of `annotate_interval_overlap` on the global `face_genders`,
        restricted to rows with `overlap_seconds > 0`.
    """
    print('Computing overlap for: {}'.format(topic))
    # Group the segment intervals by the video they belong to.
    video_to_intervals = defaultdict(list)
    for video_id, _, interval, _, _ in segments:
        video_to_intervals[video_id].append(interval)
    annotated = annotate_interval_overlap(face_genders, video_to_intervals)
    return annotated.where(annotated.overlap_seconds > 0)

# Face/gender rows overlapping each topic's segments, keyed by topic string.
topic_to_face_genders = dict(
    (t, segments_to_overlapping_face_genders(t, segs))
    for t, segs in topic_to_segments.items()
)

## Compare screen time by show

In [None]:
# Canonical show id -> canonical show name.
canoncal_show_map = dict((c.id, c.name) for c in CanonicalShow.objects.all())

# Shared arguments for the `sum_distinct_over_column` calls that follow.
overlap_field = 'overlap_seconds'
group_by_columns = ['canonical_show_id']
distinct_columns = ['face_id']

In [None]:
# Pickle holding [man_by_show, woman_by_show] baseline (non-host) screen time.
CACHE_BASELINE_NO_HOST_FILE = '/tmp/base_screentime_gender_no_host_by_show.pkl'


def _base_nonhost_screentime_by_show(probability_column):
    """Sum `duration` per major canonical show over non-host faces.

    Non-host means `host_probability <= 0.25` on the global `face_genders`.
    The globals `distinct_columns` and `group_by_columns` are read at call
    time, so the result depends on their values when this cell runs.

    Args:
        probability_column: Column name forwarded to
            `sum_distinct_over_column` (e.g. 'female_probability').

    Returns:
        Dict mapping show name -> (timedelta built from the first element of
        each aggregate, second element passed through unchanged), limited to
        shows in `MAJOR_CANONICAL_SHOWS`.
    """
    non_host_face_genders = face_genders.where(
        face_genders.host_probability <= 0.25)
    return {
        canoncal_show_map[k[0]] : (timedelta(seconds=v[0]), v[1])
        for k, v in sum_distinct_over_column(
            non_host_face_genders,
            'duration', distinct_columns, group_by_columns,
            probability_column=probability_column
        ).items() if canoncal_show_map[k[0]] in MAJOR_CANONICAL_SHOWS
    }


try:
    # NOTE(review): unpickling executes arbitrary code and this cache lives
    # under /tmp; only run this where /tmp is trusted.
    with open(CACHE_BASELINE_NO_HOST_FILE, 'rb') as f:
        base_screentime_with_nh_man_by_show, base_screentime_with_nh_woman_by_show = pickle.load(f)
    print('[Base] loaded from cache')
except Exception:
    # Missing or unreadable cache -> recompute. `Exception` (not a bare
    # `except:`) so that KeyboardInterrupt/SystemExit still stop the cell.
    base_screentime_with_nh_woman_by_show = _base_nonhost_screentime_by_show(
        'female_probability')
    print('[Base] Woman (non-host) on screen: done')
    base_screentime_with_nh_man_by_show = _base_nonhost_screentime_by_show(
        'male_probability')
    print('[Base] Man (non-host) on screen: done')

    with open(CACHE_BASELINE_NO_HOST_FILE, 'wb') as f:
        pickle.dump([base_screentime_with_nh_man_by_show, base_screentime_with_nh_woman_by_show], f)

In [None]:
# Directory holding one pickled per-show aggregate per topic; created if missing.
CACHE_AGGREGATES_DIR = '/tmp/topic_aggregates'
os.makedirs(CACHE_AGGREGATES_DIR, exist_ok=True)

# From here on the aggregations use no distinct columns.
distinct_columns = []

def _topic_screentime_by_show(face_genders_df, probability_column):
    """Sum `overlap_field` per major canonical show for one probability column.

    Reads the globals `overlap_field`, `distinct_columns` and
    `group_by_columns` at call time.

    Returns:
        Dict mapping show name -> (timedelta built from the first element of
        each aggregate, second element passed through unchanged), limited to
        shows in `MAJOR_CANONICAL_SHOWS`.
    """
    return {
        canoncal_show_map[k[0]] : (timedelta(seconds=v[0]), v[1])
        for k, v in sum_distinct_over_column(
            face_genders_df,
            overlap_field, distinct_columns, group_by_columns,
            probability_column=probability_column
        ).items() if canoncal_show_map[k[0]] in MAJOR_CANONICAL_SHOWS
    }


def aggregate_face_genders_by_show(topic, face_genders_with_topic_overlap):
    """Aggregate a topic's gendered screen time per show, with disk caching.

    The result is read from `<CACHE_AGGREGATES_DIR>/<topic>.pkl` when that
    file can be opened and unpickled; otherwise it is computed, pickled to
    that path, and returned.

    Args:
        topic: Topic string; used for the cache file stem and log lines.
        face_genders_with_topic_overlap: Face/gender rows for the topic; must
            expose `host_probability` and support `.where`.

    Returns:
        Dict with keys 'woman', 'man', 'woman_nh', 'man_nh'. Each value maps
        show name -> (timedelta, second aggregate element). The '_nh'
        variants only count rows with `host_probability <= 0.25`.
    """
    cache_path = os.path.join(CACHE_AGGREGATES_DIR, '{}.pkl'.format(topic))
    try:
        # NOTE(review): unpickling executes arbitrary code and this cache
        # lives under /tmp; only run this where /tmp is trusted.
        with open(cache_path, 'rb') as f:
            print('Loading {} aggregates from cache'.format(topic))
            return pickle.load(f)
    except Exception:
        # Missing or unreadable cache -> recompute. `Exception` (not a bare
        # `except:`) so that KeyboardInterrupt/SystemExit still stop the cell.
        print('Could not load {} aggregates from cache'.format(topic))

    topic_screentime_with_woman_by_show = _topic_screentime_by_show(
        face_genders_with_topic_overlap, 'female_probability')
    print('[{}] Woman on screen: done'.format(topic))
    topic_screentime_with_man_by_show = _topic_screentime_by_show(
        face_genders_with_topic_overlap, 'male_probability')
    print('[{}] Man on screen: done'.format(topic))

    # Same aggregation again, restricted to faces unlikely to be the host.
    non_host_face_genders = face_genders_with_topic_overlap.where(
        face_genders_with_topic_overlap.host_probability <= 0.25)
    topic_screentime_with_nh_woman_by_show = _topic_screentime_by_show(
        non_host_face_genders, 'female_probability')
    print('[{}] Woman (non-host) on screen: done'.format(topic))
    topic_screentime_with_nh_man_by_show = _topic_screentime_by_show(
        non_host_face_genders, 'male_probability')
    print('[{}] Man (non-host) on screen: done'.format(topic))

    result = {
        'woman': topic_screentime_with_woman_by_show,
        'man': topic_screentime_with_man_by_show,
        'woman_nh': topic_screentime_with_nh_woman_by_show,
        'man_nh': topic_screentime_with_nh_man_by_show
    }

    with open(cache_path, 'wb') as f:
        print('Caching aggregates: {}'.format(cache_path))
        pickle.dump(result, f)
    return result

# Per-show aggregates for every topic, keyed by topic string.
topic_to_aggregates = dict(
    (t, aggregate_face_genders_by_show(t, fg))
    for t, fg in topic_to_face_genders.items()
)

In [None]:
# Colour each canonical show by the channel of one of its videos.
channel_name_cmap = dict(CNN='DarkBlue', FOXNEWS='DarkRed', MSNBC='DarkGreen')
show_channel_rows = Video.objects.distinct(
    'show__canonical_show'
).values('show__canonical_show__name', 'channel__name')
canonical_show_cmap = dict(
    (row['show__canonical_show__name'], channel_name_cmap[row['channel__name']])
    for row in show_channel_rows
)

# One chart per topic, in alphabetical topic order.
for topic_name in sorted(topic_to_aggregates):
    topic_agg = topic_to_aggregates[topic_name]
    plot_binary_screentime_proportion_comparison(
        ['Male (non-host)', 'Female (non-host)'],
        [topic_agg['man_nh'], topic_agg['woman_nh']],
        'Proportion of gendered screen time by show for topic "{}"'.format(topic_name),
        'Show name',
        'Proportion of screen time',
        secondary_series_names=[
            'Baseline Male (non-host)', 'Baseline Female (non-host)'
        ],
        secondary_data=[
            base_screentime_with_nh_man_by_show,
            base_screentime_with_nh_woman_by_show
        ],
        tertiary_series_names=['Male (incl-host)', 'Female (incl-host)'],
        tertiary_data=[topic_agg['man'], topic_agg['woman']],
        category_color_map=canonical_show_cmap
    )

## Annotate Color By Opinion/Talk vs. News Show

In [None]:
# Hand-maintained classification of canonical shows, keyed by show name:
# True marks an opinion/talk show, False a news show. The keys are compared
# against show names used as plot categories, so they must match exactly.
# NOTE(review): the labels are judgement calls (see the inline remarks);
# confirm any entry before drawing conclusions from the colouring.
IS_OPINION_TALK_SHOW = {
    'MSNBC Live': False,
    'New Day': False,
    'The Rachel Maddow Show': True,
    'Anderson Cooper 360': False,
    'Hannity': True,
    'All In With Chris Hayes': True,
    'CNN Tonight': False,
    'Early Start': False, # morning show
    'Happening Now': True,
    'Hardball': True,
    'The Kelly File': True, # Spun off oreilly factor
    'The OReilly Factor': True,
    'The Last Word With Lawrence ODonnell': True,
    'CNN Newsroom With Brooke Baldwin': False,
    'Americas News HQ': False,
    'CNN Newsroom': False,
    'Special Report With Bret Baier': True,
    'The Five': True, 
    'On the Record': False,
    'Situation Room With Wolf Blitzer': False,
    'CNN Newsroom With Fredricka Whitfield': False,
    'FOX and Friends First': True,
    'Outnumbered': True,
    'FOX and Friends': True,
    'Shepard Smith Reporting': False,
    'Morning Joe': True, # morning show
    'Wolf': False,
    'Americas Newsroom': True,
    'CNN Newsroom With Carol Costello': False,
    'The Lead With Jake Tapper': True,
    'Your World With Neil Cavuto': True, # business focus?
    'First Look': True, # Morning Joe : First Look
    'At This Hour': False, # morning show
    'Erin Burnett OutFront': False, 
    'Red Eye': True,
    'The 11th Hour With Brian Williams': True, # news and politics television program 
    'Tucker Carlson Tonight': True,
    'FOX Report': False,
    'CNN Newsroom With Poppy Harlow': False,
    'MTP Daily': True, # Public affairs/news analysis program
}

In [None]:
# Blue for opinion/talk shows, red for news shows, black for unlisted shows.
opinion_cmap = defaultdict(lambda: 'Black')
opinion_cmap.update(
    (show_name, 'Blue' if is_opinion else 'Red')
    for show_name, is_opinion in IS_OPINION_TALK_SHOW.items()
)

# One chart per topic, in alphabetical topic order.
for topic_name in sorted(topic_to_aggregates):
    topic_agg = topic_to_aggregates[topic_name]
    plot_binary_screentime_proportion_comparison(
        ['Male (non-host)', 'Female (non-host)'],
        [topic_agg['man_nh'], topic_agg['woman_nh']],
        'Proportion of gendered screen time by show for topic "{}"'.format(topic_name),
        'Show name',
        'Proportion of screen time',
        secondary_series_names=[
            'Baseline Male (non-host)', 'Baseline Female (non-host)'
        ],
        secondary_data=[
            base_screentime_with_nh_man_by_show,
            base_screentime_with_nh_woman_by_show
        ],
        tertiary_series_names=['Male (incl-host)', 'Female (incl-host)'],
        tertiary_data=[topic_agg['man'], topic_agg['woman']],
        category_color_map=opinion_cmap
    )

# Compare screentime by topic by show

In [None]:
# Directory holding the pickled per-topic aggregates; created if missing.
CACHE_AGGREGATES_DIR = '/tmp/topic_aggregates'
os.makedirs(CACHE_AGGREGATES_DIR, exist_ok=True)

# Settings for the per-topic (all shows) aggregation.
overlap_field = 'overlap_seconds'
distinct_columns = []
# Multiplier applied to the square root of the second aggregate element when
# printing the "+/-" figure; 1.96 is presumably the two-sided 95% z value.
z_score = 1.96

def aggregate_face_genders_by_topic(topic, face_genders_with_topic_overlap):
    """Aggregate a topic's non-host gendered screen time over all shows.

    The result is read from `<CACHE_AGGREGATES_DIR>/<topic>.base.pkl` when
    that file can be opened and unpickled; otherwise it is computed, pickled
    to that path, and returned.

    Args:
        topic: Topic string; used for the cache file stem and log lines.
        face_genders_with_topic_overlap: Face/gender rows for the topic; must
            expose `host_probability` and support `.where`.

    Returns:
        Dict with keys 'man_nh' and 'woman_nh'; each value is a
        (timedelta, second aggregate element) pair.
    """
    cache_path = os.path.join(CACHE_AGGREGATES_DIR, '{}.base.pkl'.format(topic))
    try:
        # NOTE(review): unpickling executes arbitrary code and this cache
        # lives under /tmp; only run this where /tmp is trusted.
        with open(cache_path, 'rb') as f:
            print('Loading {} base aggregates from cache'.format(topic))
            return pickle.load(f)
    except Exception:
        # Missing or unreadable cache -> recompute. `Exception` (not a bare
        # `except:`) so that KeyboardInterrupt/SystemExit still stop the cell.
        print('Could not load {} base aggregates from cache'.format(topic))

    # NOTE(review): the non-host cutoff here is 0.5, while every other
    # non-host filter in this notebook uses 0.25, and the two are plotted
    # side by side. Value kept as-is; confirm which cutoff is intended (and
    # clear the '.base.pkl' caches if it changes).
    non_host_threshold = 0.5
    non_host_face_genders = face_genders_with_topic_overlap.where(
        face_genders_with_topic_overlap.host_probability <= non_host_threshold
    )

    topic_screentime_with_nh_woman = sum_distinct_over_column(
        non_host_face_genders,
        overlap_field, distinct_columns,
        probability_column='female_probability'
    )
    # First element is divided by 3600 to print hours; the "+/-" figure is
    # z_score * sqrt(second element), scaled the same way.
    print('Woman (non-host) on screen: {:0.2f}h +/- {:0.02f}'.format(
        topic_screentime_with_nh_woman[0] / 3600,
        z_score * math.sqrt(topic_screentime_with_nh_woman[1]) / 3600))
    topic_screentime_with_nh_man = sum_distinct_over_column(
        non_host_face_genders,
        overlap_field, distinct_columns,
        probability_column='male_probability'
    )
    print('Man (non-host) on screen: {:0.2f}h  +/- {:0.02f}'.format(
        topic_screentime_with_nh_man[0] / 3600,
        z_score * math.sqrt(topic_screentime_with_nh_man[1]) / 3600))

    result = {
        'man_nh': (
            timedelta(seconds=topic_screentime_with_nh_man[0]),
            topic_screentime_with_nh_man[1]
        ),
        'woman_nh': (
            timedelta(seconds=topic_screentime_with_nh_woman[0]),
            topic_screentime_with_nh_woman[1]
        )
    }

    with open(cache_path, 'wb') as f:
        print('Caching base aggregates: {}'.format(cache_path))
        pickle.dump(result, f)
    return result

# All-show (non-host) aggregates for every topic, keyed by topic string.
topic_to_base_aggregates = dict(
    (t, aggregate_face_genders_by_topic(t, fg))
    for t, fg in topic_to_face_genders.items()
)

# Split the aggregates into one topic -> value dict per gender.
base_topic_man_nh = {}
base_topic_woman_nh = {}
for topic_name, base_agg in topic_to_base_aggregates.items():
    base_topic_man_nh[topic_name] = base_agg['man_nh']
    base_topic_woman_nh[topic_name] = base_agg['woman_nh']

In [None]:
# Topics with less male non-host screen time than this on a show are dropped
# from that show's primary series.
MIN_TOPIC_SECONDS = 30 * 60 # 1/2 hr

# One chart per major canonical show, with topics as the categories.
for show in sorted(MAJOR_CANONICAL_SHOWS):
    topic_man = {}
    topic_woman = {}
    topic_man_nh = {}
    topic_woman_nh = {}

    # Pull this show's entry out of every topic's per-show aggregates; a
    # topic with no data for the show gets a zero (timedelta(0), 0) entry.
    for t, agg_by_show in topic_to_aggregates.items():
        topic_man[t] = agg_by_show['man'].get(show, (timedelta(0), 0))
        topic_woman[t] = agg_by_show['woman'].get(show, (timedelta(0), 0))
        topic_man_nh[t] = agg_by_show['man_nh'].get(show, (timedelta(0), 0))
        topic_woman_nh[t] = agg_by_show['woman_nh'].get(show, (timedelta(0), 0))

    plot_binary_screentime_proportion_comparison(
        ['Male (non-host)', 'Female (non-host)'],
        [
            # Only the male series is filtered by MIN_TOPIC_SECONDS.
            # NOTE(review): presumably the plot takes its categories from
            # the first series — confirm against the plot helper.
            { k : v for k, v in topic_man_nh.items() if v[0].total_seconds() >= MIN_TOPIC_SECONDS},
            topic_woman_nh
        ],
        # The categories are topics and the chart is for one show, so the
        # title says "by topic for show" (it previously had them swapped).
        'Proportion of gendered screen time by topic for show "{}"'.format(show),
        'Topic name',
        'Proportion of screen time',
        secondary_series_names=[
            'Baseline Male (non-host) for Topic',
            'Baseline Female (non-host) for Topic'],
        secondary_data=[base_topic_man_nh, base_topic_woman_nh],
        tertiary_series_names=['Male (incl-host)', 'Female (incl-host)'],
        tertiary_data=[topic_man, topic_woman],
#         baseline_series_names=[
#             'Baseline Male (non-host) on "{}"'.format(show), 
#             'Baseline Female (non-host) on "{}"'.format(show)],
#         baseline_data=[
#             base_screentime_with_nh_man_by_show[show][0].total_seconds(),
#             base_screentime_with_nh_woman_by_show[show][0].total_seconds()
#         ],
    )

# Compare screentime by topic on all shows

In [None]:
# Result stores filled in by the next cell; re-running this cell clears them.
topic_to_male_screentime, topic_to_female_screentime = {}, {}

In [None]:
distinct_columns = []
overlap_field = 'overlap_seconds'

# Sum overlap seconds per topic across all shows. Topics already present in
# the result dicts are skipped, so the cell can be re-run after a failure.
for topic, topic_face_genders in topic_to_face_genders.items():
    print(topic)
    if topic not in topic_to_female_screentime:
        try:
            female_topic = sum_distinct_over_column(
                topic_face_genders,
                overlap_field, distinct_columns,
                probability_column='female_probability'
            )
            male_topic = sum_distinct_over_column(
                topic_face_genders,
                overlap_field, distinct_columns,
                probability_column='male_probability'
            )
            # Store both together so a topic is never half-recorded.
            topic_to_female_screentime[topic] = female_topic
            topic_to_male_screentime[topic] = male_topic
        except Exception as e:
            # Best effort: report the failure and move on to the next topic.
            print(e)

In [None]:
# Show both result dicts as the cell output.
(topic_to_male_screentime, topic_to_female_screentime)

In [None]:
# Gendered screen time per topic across all shows. The per-topic sums were
# computed without a host filter, so the series are labelled "(incl-host)"
# (they were previously mislabelled "(non-host)").
plot_binary_screentime_proportion_comparison(
    ['Male (incl-host)', 'Female (incl-host)'],
    [
        # Values here are raw seconds, so v[0] compares directly against
        # MIN_TOPIC_SECONDS; only the male series is filtered.
        { k : v for k, v in topic_to_male_screentime.items() if v[0] >= MIN_TOPIC_SECONDS },
        topic_to_female_screentime
    ],
    'Proportion of gendered screen time by topic',
    'Topic name',
    'Proportion of screen time'
)