<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Shows" data-toc-modified-id="Shows-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Shows</a></span><ul class="toc-item"><li><span><a href="#Show-Info" data-toc-modified-id="Show-Info-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Show Info</a></span></li><li><span><a href="#Gender-By-Show" data-toc-modified-id="Gender-By-Show-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Gender By Show</a></span></li><li><span><a href="#Identity-by-Show" data-toc-modified-id="Identity-by-Show-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Identity by Show</a></span></li></ul></li><li><span><a href="#Topics" data-toc-modified-id="Topics-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Topics</a></span><ul class="toc-item"><li><span><a href="#Topic-Lexicons" data-toc-modified-id="Topic-Lexicons-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Topic Lexicons</a></span></li><li><span><a href="#Gender-By-Topic" data-toc-modified-id="Gender-By-Topic-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Gender By Topic</a></span></li><li><span><a href="#Identity-by-Topic" data-toc-modified-id="Identity-by-Topic-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Identity by Topic</a></span></li></ul></li></ul></div>

In [None]:
from esper.widget import *
from esper.prelude import *
from esper.spark_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

import pyspark.sql.functions as func
from datetime import timedelta, datetime
from collections import defaultdict
import random
import json
import os

In [None]:
# Export guard: when False, each export cell below raises 'File exists!' instead
# of overwriting an output file that is already on disk.
OVERWRITE = True

In [None]:
# Face gender rows, minus everything produced by the 'handlabeled-gender'
# labeler, with a 'date' column (yyyy-MM-dd) formatted from 'time'.
all_face_genders = get_face_genders()
handlabeled_gender_labeler_id = Labeler.objects.get(name='handlabeled-gender').id
face_genders = (
    all_face_genders
    .where(all_face_genders.labeler_id != handlabeled_gender_labeler_id)
    .withColumn('date', func.date_format('time', 'yyyy-MM-dd'))
)

In [None]:
# Face identity rows restricted to labelers whose name contains 'face-identity',
# with a 'date' column (yyyy-MM-dd) formatted from 'time'.
all_face_identities = get_face_identities()
face_identity_labeler_ids = [
    labeler.id
    for labeler in Labeler.objects.filter(name__contains='face-identity')
]
face_identities = (
    all_face_identities
    .where(all_face_identities.labeler_id.isin(face_identity_labeler_ids))
    .withColumn('date', func.date_format('time', 'yyyy-MM-dd'))
)

In [None]:
def json_keys(d):
    """Return a new dict with each tuple key of ``d`` flattened into a string.

    Every element of a key is converted with ``str`` and the pieces are joined
    with ':' (e.g. ``('a', 1)`` becomes ``'a:1'``). Values are passed through
    unchanged. Used below to prepare tuple-keyed results for ``json.dump``.
    """
    flattened = {}
    for key, value in d.items():
        flattened[':'.join(map(str, key))] = value
    return flattened

def capitalize_name(s):
    """Apply ``str.capitalize`` to every single-space-separated word of ``s``.

    The string is split on the literal ' ' character (not on arbitrary
    whitespace), so runs of spaces are preserved in the result.
    """
    words = s.split(' ')
    return ' '.join(word.capitalize() for word in words)

# Identity id -> display name (each word capitalized).
identity_map = dict(
    (identity.id, capitalize_name(identity.name))
    for identity in Identity.objects.all()
)

# Canonical show id -> name, restricted to the shows in MAJOR_CANONICAL_SHOWS.
canonical_show_map = dict(
    (canonical_show.id, canonical_show.name)
    for canonical_show in CanonicalShow.objects.all()
    if canonical_show.name in MAJOR_CANONICAL_SHOWS
)

# Channel id -> channel name.
channel_map = dict(
    (channel.id, channel.name)
    for channel in Channel.objects.all()
)

# Shows

## Show Info

In [None]:
# For every major canonical show: the channel name of the first video returned
# for that show, and the names of all Show rows pointing at it, sorted by name.
# NOTE(review): the video query has no explicit ordering and `[0]` will fail for
# a show with no videos — confirm every major show has at least one video.
canonical_show_to_info = {
    show_name: {
        'channel': Video.objects.filter(
            show__canonical_show__name=show_name
        )[0].channel.name,
        'aliases': [
            alias.name
            for alias in Show.objects.filter(
                canonical_show__name=show_name
            ).order_by('name')
        ]
    }
    for show_name in MAJOR_CANONICAL_SHOWS
}

SHOW_INFO_PATH = 'widget_data/show_info.json'
if not OVERWRITE:
    # Refuse to clobber an existing export unless OVERWRITE is set.
    if os.path.exists(SHOW_INFO_PATH):
        raise Exception('File exists!')

with open(SHOW_INFO_PATH, 'w') as show_info_file:
    json.dump(canonical_show_to_info, show_info_file)

## Gender By Show

In [None]:
def get_show_date(k):
    """Translate a ``(canonical_show_id, channel_id, date)`` key to ``(show, date)``.

    ``show`` is the name from ``canonical_show_map`` when the canonical show id
    is present there, otherwise the fallback ``'Other (<channel name>)'``.
    The channel name is looked up unconditionally, so a ``channel_id`` missing
    from ``channel_map`` raises ``KeyError`` even for a known show.
    """
    cshow_id, channel_id, date = k
    fallback_show = 'Other ({})'.format(channel_map[channel_id])
    if cshow_id in canonical_show_map:
        return (canonical_show_map[cshow_id], date)
    return (fallback_show, date)

def _sum_by_show_date(df, probability_column):
    """Sum 'duration' per (canonical show, channel, date) and re-key by (show, date).

    Group keys are translated with ``get_show_date``.
    NOTE(review): every canonical show that is not in ``canonical_show_map`` is
    mapped to the same 'Other (<channel>)' key, so when several such shows share
    a channel and date the later value overwrites the earlier one rather than
    being added to it — confirm whether these should be accumulated.
    """
    grouped = sum_over_column(
        df, 'duration', ['canonical_show_id', 'channel_id', 'date'],
        probability_column=probability_column
    )
    by_show_date = {}
    for group_key, total in grouped.items():
        by_show_date[get_show_date(group_key)] = total
    return by_show_date

# Including host
screen_time_male_by_show_date = _sum_by_show_date(face_genders, 'male_probability')
screen_time_female_by_show_date = _sum_by_show_date(face_genders, 'female_probability')

# Exclude hosts
face_genders_nh = face_genders.where(face_genders.host_probability < 0.5)

screen_time_male_nh_by_show_date = _sum_by_show_date(face_genders_nh, 'male_probability')
screen_time_female_nh_by_show_date = _sum_by_show_date(face_genders_nh, 'female_probability')

In [None]:
GENDER_BY_SHOW_PATH = 'widget_data/gender_by_show.json'
if not OVERWRITE:
    # Refuse to clobber an existing export unless OVERWRITE is set.
    if os.path.exists(GENDER_BY_SHOW_PATH):
        raise Exception('File exists!')

with open(GENDER_BY_SHOW_PATH, 'w') as gender_by_show_file:
    # Tuple keys are flattened to 'show:date' strings before serialization.
    gender_by_show = {
        'all': {
            'male': json_keys(screen_time_male_by_show_date),
            'female': json_keys(screen_time_female_by_show_date)
        },
        'nonhost': {
            'male': json_keys(screen_time_male_nh_by_show_date),
            'female': json_keys(screen_time_female_nh_by_show_date)
        }
    }
    json.dump(gender_by_show, gender_by_show_file)

## Identity by Show

In [None]:
# identity name -> {(show, date): summed 'duration'}; identities that are not in
# identity_map are skipped.
# NOTE(review): get_show_date maps every show outside canonical_show_map to the
# same 'Other (<channel>)' key, so colliding groups overwrite each other here
# rather than being added — confirm whether they should be accumulated.
screen_time_identity_by_show_date = defaultdict(dict)
identity_show_date_totals = sum_over_column(
    face_identities, 'duration', ['identity_id', 'canonical_show_id', 'channel_id', 'date'],
    probability_column='probability'
)
for group_key, total in identity_show_date_totals.items():
    identity_id, show_key = group_key[0], group_key[1:]
    if identity_id not in identity_map:
        continue
    identity_name = identity_map[identity_id]
    screen_time_identity_by_show_date[identity_name][get_show_date(show_key)] = total

In [None]:
IDENTITY_BY_SHOW_PATH = 'widget_data/identity_by_show.json'
if not OVERWRITE:
    # Refuse to clobber an existing export unless OVERWRITE is set.
    if os.path.exists(IDENTITY_BY_SHOW_PATH):
        raise Exception('File exists!')

with open(IDENTITY_BY_SHOW_PATH, 'w') as identity_by_show_file:
    # One entry per identity name; inner tuple keys become 'show:date' strings.
    identity_by_show = {
        identity_name: json_keys(by_show_date)
        for identity_name, by_show_date in screen_time_identity_by_show_date.items()
    }
    json.dump(identity_by_show, identity_by_show_file)

# Topics

In [None]:
# Topic names to export. Each string is used verbatim as the file stem of the
# cached segment and lexicon pickles ('<topic>.pkl') and as the topic key in the
# exported JSON.
# NOTE(review): 'opiod' and 'republcian' look like misspellings of 'opioid' and
# 'republican'. They are left untouched because renaming a topic changes which
# cache file is looked up — confirm against the cache contents before fixing.
topics = [
    'terrorism', 'isis', 'syria', 'refugee',
    'collusion', 'russia',
    'shooting', 'black lives', 
    'san bernardino', 'pulse nightclub', 'vegas shooter', 'charleston church',
    'charlie hebdo', 'paris attacks',
    'trayvon martin', 'walter scott',
    'immigration', 'travel ban', 'border wall',
    'roy moore', 'harassment', 'email scandal', 'billy bush',
    'global warming', 'paris climate',
    'autism', 
    'planned parenthood', 'abortion',
    'gay marriage', 'lgbt',
    'fashion', 'wedding',
    'facebook', 
    'irs', 'taxes',
    'school', 'preschool',
    'nutrition', 'healthcare',
    'yoga', 'asthma', 'flu',
    'public transportation',
    'travel',
    'vacation',
    'wall street', 'economy', 'trade',
    'national security', 'north korea',
    'guns', 'education',
    'supreme court', 'social security',
    'racism', 'afghanistan', 'iraq',
    'england', 'europe', 'france',
    'football', 'soccer', 'fifa',
    'asia', 'africa', 'brazil',
    'mexico', 'canada', 'violence',
    'internet', 'technology', 'baseball',
    'olympics', 'iran', 'crime',
    'food', 'disease', 'cancer',
    'drugs', 'college', 'police',
    'oscars', 'rifle', 'apple',
    'blockchain', 'congress',
    'president', 'america', 'veteran',
    'music', 'film', 'dance',
    'book', 'ebola', 'census', 'cars',
    'import', 'export', 'christmas', 
    'july 4th', 'india', 'wine', 'wildfire',
    'earthquake', 'flood', 'hurricane', 'jobs',
    'luxury goods', 'golf', 'doping',
    'ipo', 'bankruptcy', 'literature',
    'millennials', 'fiction', 'greece',
    'italy', 'liberals', 'conservatives',
    'catholic', 'evangelical', 'nafta', 
    'opiod', 'cats', 'dogs', 'wildlife',
    'pets', 'democrat', 'republcian', 
    'media', 'death penalty', 'meme',
    'twitter', 'nasa', 'sports', 'gym', 'medicine',
    'affair', 'banks', 'agriculture', 'coal', 'oil',
    'renewable energy', 'google', 'gaming', 'artificial intelligence',
    'spy', 'wiretap', 'obamacare', 'obesity',
    'cyberbullying', 'netflix', 'basketball',
    'native american', 'african american', 'hispanic', 
    'tennis', 'inequality', 'foreign policy', 'charity',
    'marijuana', 'childcare', 'insurance', 'vaccine',
    'christianity', 'constitution', 'slavery', 'confederate',
    'poverty', 'homeless', 'prison', 'patents', 'teens',
    'elderly', 'stock market', 'bonds', 'consumers',
    'cybersecurity', 'illegal drugs', 'prescription drugs',
    'nato', 'freedom of speech', 'freedom of information', 'patriot act',
    'due process', 'flint water crisis', 'government shutdown'
]

SEGMENT_CACHE_PATH = '/tmp/topic_segments'

def get_segments(topic):
    """Load the pickled segments for ``topic`` from ``SEGMENT_CACHE_PATH``.

    Reads ``<SEGMENT_CACHE_PATH>/<topic>.pkl`` and returns the unpickled object.
    There is no fallback: a missing cache file propagates the error from ``open``.
    The cache is unpickled as-is, so it must come from a trusted source.
    """
    pickle_name = '{}.pkl'.format(topic)
    with open(os.path.join(SEGMENT_CACHE_PATH, pickle_name), 'rb') as cache_file:
        print('Loading {} segments from cache'.format(topic))
        segments = pickle.load(cache_file)
    return segments

# Load the cached segments of every topic, then report topics with none.
topic_to_segments = {topic_name: get_segments(topic_name) for topic_name in topics}
empty_topics = [
    topic_name
    for topic_name, topic_segments in topic_to_segments.items()
    if len(topic_segments) == 0
]
for topic_name in empty_topics:
    print('Warning: {} has no segments'.format(topic_name))

## Topic Lexicons

In [None]:
# Load the pickled lexicon of every topic; topics whose lexicon is empty are
# reported and left out of the export.
# NOTE(review): LEXICON_CACHE_PATH is not defined in the visible cells —
# presumably it comes from one of the star imports; confirm.
topic_to_lexicon = {}
for topic_name in topics:
    lexicon_path = os.path.join(LEXICON_CACHE_PATH, '{}.pkl'.format(topic_name))
    with open(lexicon_path, 'rb') as lexicon_file:
        lexicon = pickle.load(lexicon_file)
    if len(lexicon) == 0:
        print('{} has an empty lexicon'.format(topic_name))
        continue
    topic_to_lexicon[topic_name] = lexicon

TOPIC_LEXICON_PATH = 'widget_data/topic_lexicons.json'
if not OVERWRITE:
    # Refuse to clobber an existing export unless OVERWRITE is set.
    if os.path.exists(TOPIC_LEXICON_PATH):
        raise Exception('File exists!')

with open(TOPIC_LEXICON_PATH, 'w') as topic_lexicon_file:
    json.dump(topic_to_lexicon, topic_lexicon_file)

## Gender By Topic

In [None]:
def segments_to_overlapping_face_genders(topic, segments):
    """Return the ``face_genders`` rows that overlap any of ``segments``.

    Each segment is a 5-tuple whose first element is a video id and whose third
    element is an interval; the remaining elements are ignored. Intervals are
    grouped per video, passed to ``annotate_interval_overlap`` along with the
    module-level ``face_genders`` dataframe, and only rows with
    ``overlap_seconds > 0`` are kept. ``topic`` is accepted but not used.
    """
    video_to_intervals = defaultdict(list)
    for video_id, _, interval, _, _ in segments:
        video_to_intervals[video_id].append(interval)
    annotated = annotate_interval_overlap(face_genders, video_to_intervals)
    return annotated.where(annotated.overlap_seconds > 0)

# Per-topic dataframe of overlapping face genders; topics without any segments
# are left out.
topic_to_face_genders = {
    topic_name: segments_to_overlapping_face_genders(topic_name, topic_segments)
    for topic_name, topic_segments in topic_to_segments.items()
    if len(topic_segments) > 0
}

In [None]:
def _sum_by_topic_date(df, probability_column):
    """Sum 'duration' grouped by ('topic', 'date'), weighted via ``probability_column``.

    Returns a dict keyed by ``(topic, date)`` tuples holding whatever value
    ``sum_over_column`` produced for that group.
    """
    grouped = sum_over_column(
        df, 'duration', ['topic', 'date'],
        probability_column=probability_column
    )
    return {(topic, date): total for (topic, date), total in grouped.items()}

# Tag every per-topic dataframe with its topic name and union them into one.
topic_face_genders_all = None
for topic_name, topic_df in topic_to_face_genders.items():
    tagged_df = topic_df.withColumn('topic', func.lit(topic_name))
    topic_face_genders_all = (
        tagged_df if topic_face_genders_all is None
        else topic_face_genders_all.union(tagged_df)
    )

# Including hosts
print('Computing male screen time by topic')
screen_time_male_by_topic_date = _sum_by_topic_date(
    topic_face_genders_all, 'male_probability')

print('Computing female screen time by topic')
screen_time_female_by_topic_date = _sum_by_topic_date(
    topic_face_genders_all, 'female_probability')

# Excluding hosts
topic_face_genders_nh_all = topic_face_genders_all.where(
    topic_face_genders_all.host_probability < 0.5)

print('Computing male (non-host) screen time by topic')
screen_time_male_nh_by_topic_date = _sum_by_topic_date(
    topic_face_genders_nh_all, 'male_probability')

print('Computing female (non-host) screen time by topic')
screen_time_female_nh_by_topic_date = _sum_by_topic_date(
    topic_face_genders_nh_all, 'female_probability')

In [None]:
# screen_time_male_by_topic_date = {}
# screen_time_female_by_topic_date = {}
# screen_time_male_nh_by_topic_date = {}
# screen_time_female_nh_by_topic_date = {}

# for i, topic in enumerate(topics):
#     print(i, topic)
#     topic_face_genders = topic_to_face_genders[topic]
#     for k, v in sum_over_column(
#         topic_face_genders, 'duration', ['date'],
#         probability_column='male_probability'
#     ).items():
#         screen_time_male_by_topic_date[(topic, k[0])] = v

#     for k, v in sum_over_column(
#         topic_face_genders, 'duration', ['date'],
#         probability_column='female_probability'
#     ).items():
#         screen_time_female_by_topic_date[(topic, k[0])] = v
        
#     topic_face_genders_nh = topic_face_genders.where(topic_face_genders.host_probability < 0.5)
#     for k, v in sum_over_column(
#         topic_face_genders_nh, 'duration', ['date'],
#         probability_column='male_probability'
#     ).items():
#         screen_time_male_nh_by_topic_date[(topic, k[0])] = v

#     for k, v in sum_over_column(
#         topic_face_genders_nh, 'duration', ['date'],
#         probability_column='female_probability'
#     ).items():
#         screen_time_female_nh_by_topic_date[(topic, k[0])] = v

In [None]:
GENDER_BY_TOPIC_PATH = 'widget_data/gender_by_topic.json'
if not OVERWRITE:
    # Refuse to clobber an existing export unless OVERWRITE is set.
    if os.path.exists(GENDER_BY_TOPIC_PATH):
        raise Exception('File exists!')

with open(GENDER_BY_TOPIC_PATH, 'w') as gender_by_topic_file:
    # Tuple keys are flattened to 'topic:date' strings before serialization.
    gender_by_topic = {
        'all': {
            'male': json_keys(screen_time_male_by_topic_date),
            'female': json_keys(screen_time_female_by_topic_date)
        },
        'nonhost': {
            'male': json_keys(screen_time_male_nh_by_topic_date),
            'female': json_keys(screen_time_female_nh_by_topic_date)
        }
    }
    json.dump(gender_by_topic, gender_by_topic_file)

## Identity by Topic

In [None]:
def segments_to_overlapping_face_identities(topic, segments):
    """Return the ``face_identities`` rows that overlap any of ``segments``.

    Each segment is a 5-tuple whose first element is a video id and whose third
    element is an interval; the remaining elements are ignored. Intervals are
    grouped per video, passed to ``annotate_interval_overlap`` along with the
    module-level ``face_identities`` dataframe, and only rows with
    ``overlap_seconds > 0`` are kept. ``topic`` is accepted but not used.
    """
    video_to_intervals = defaultdict(list)
    for video_id, _, interval, _, _ in segments:
        video_to_intervals[video_id].append(interval)
    annotated = annotate_interval_overlap(face_identities, video_to_intervals)
    return annotated.where(annotated.overlap_seconds > 0)

# Per-topic dataframe of overlapping face identities; topics without any
# segments are left out.
topic_to_face_identities = {
    topic_name: segments_to_overlapping_face_identities(topic_name, topic_segments)
    for topic_name, topic_segments in topic_to_segments.items()
    if len(topic_segments) > 0
}

In [None]:
# OOM on 140 GB!

# topic_face_identities_all = None
# for t, df in topic_to_face_identities.items():
#     df = df.withColumn('topic', func.lit(t))
#     if topic_face_identities_all is None:
#         topic_face_identities_all = df
#     else:
#         topic_face_identities_all = topic_face_identities_all.union(df) 

# screen_time_identity_by_topic_year_month = defaultdict(dict)
# for k, v in sum_over_column(
#     topic_face_identities_all, 'duration', ['identity_id', 'topic', 'year', 'month'],
#     probability_column='probability'
# ).items():
#     id_id, topic, year, month = k
#     if id_id in identity_map:
#         screen_time_identity_by_topic_year_month[identity_map[id_id]][(topic, year, month)] = v

In [None]:
# Accumulators for the per-topic identity loop: results keyed by identity name,
# plus the set of topics already processed. Presumably kept in a separate cell
# so the loop can be re-run and skip finished topics — re-running this cell
# resets that progress.
screen_time_identity_by_topic_date = defaultdict(dict)
topics_done = set()

In [None]:
# Sum identity screen time one topic at a time. Topics already recorded in
# topics_done are skipped, so this cell can be re-run after an interruption
# without recomputing finished topics.
for i, topic in enumerate(sorted(topic_to_face_identities)):
    print(i, topic)
    if topic in topics_done:
        continue
    topic_face_identities = topic_to_face_identities[topic]
    for k, v in sum_over_column(
        topic_face_identities, 'duration', ['identity_id', 'date'],
        probability_column='probability'
    ).items():
        id_id, date = k
        # Identities missing from identity_map are skipped. The map is only
        # indexed inside this guard, so an unmapped id cannot raise KeyError.
        if id_id in identity_map:
            screen_time_identity_by_topic_date[identity_map[id_id]][(topic, date)] = v
    # Mark the topic finished only after all of its rows have been stored.
    topics_done.add(topic)

In [None]:
IDENTITY_BY_TOPIC_PATH = 'widget_data/identity_by_topic.json'
if not OVERWRITE:
    # Refuse to clobber an existing export unless OVERWRITE is set.
    if os.path.exists(IDENTITY_BY_TOPIC_PATH):
        raise Exception('File exists!')

with open(IDENTITY_BY_TOPIC_PATH, 'w') as identity_by_topic_file:
    # One entry per identity name; inner tuple keys become 'topic:date' strings.
    identity_by_topic = {
        identity_name: json_keys(by_topic_date)
        for identity_name, by_topic_date in screen_time_identity_by_topic_date.items()
    }
    json.dump(identity_by_topic, identity_by_topic_file)