<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [None]:
from esper.widget import *
from esper.prelude import *
from esper.spark_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

import pyspark.sql.functions as func
from collections import defaultdict
import random
import pickle
import json
from datetime import datetime
from pytz import timezone
from pathlib import Path

In [None]:
# Controls the "File exists!" guards in the export cells below: when False,
# a cell raises instead of replacing an output file that is already on disk;
# when True, existing output files are overwritten without any check.
OVERWRITE = True

In [None]:
# Gender labels for faces, keeping only rows that are (a) not produced by the
# 'handlabeled-gender' labeler and (b) not inside a commercial.
face_genders = get_face_genders()
handlabeled_gender_labeler_id = Labeler.objects.get(name='handlabeled-gender').id
face_genders = face_genders.filter(
    (face_genders['labeler_id'] != handlabeled_gender_labeler_id)
    & (face_genders['in_commercial'] == False)
)

In [None]:
# Identity labels for faces, keeping only rows outside commercials whose
# labeler name contains 'face-identity'.
face_identities = get_face_identities()
face_identity_labeler_ids = [
    labeler.id
    for labeler in Labeler.objects.filter(name__contains='face-identity')
]
face_identities = face_identities.filter(
    (face_identities['in_commercial'] == False)
    & face_identities['labeler_id'].isin(face_identity_labeler_ids)
)

In [None]:
class ShotInfo(object):
    """Running per-shot totals of gender and identity probabilities.

    Every face added contributes its probabilities to expected counts and a
    p * (1 - p) term to the matching variance accumulator. The "nh" fields
    hold the same quantities weighted by (1 - host_probability).
    """

    def __init__(self, start, end):
        """Create an empty accumulator for a shot spanning [start, end]."""
        self.start = start
        self.end = end
        # Totals over every face in the shot. A single variance is kept for
        # both genders: the original author noted the male and female values
        # are equal (presumably because female_probability is
        # 1 - male_probability -- TODO confirm against the labeler output).
        self.male_cnt = self.female_cnt = self.gender_var = 0.
        # Same totals, weighted by each face's probability of not being a host.
        self.male_nh_cnt = self.female_nh_cnt = self.gender_nh_var = 0.
        # identity_id -> (summed probability, summed p * (1 - p))
        self.identities = {}

    def add_face_gender(self, fg):
        """Fold one face-gender record into the totals.

        fg must support lookup by the keys 'female_probability',
        'male_probability' and 'host_probability'.
        """
        p_female = fg['female_probability']
        p_male = fg['male_probability']
        non_host_weight = 1. - fg['host_probability']
        # Variance term p * (1 - p), computed from the male probability only.
        indicator_var = (1. - p_male) * p_male

        self.male_cnt += p_male
        self.female_cnt += p_female
        self.gender_var += indicator_var

        self.male_nh_cnt += p_male * non_host_weight
        self.female_nh_cnt += p_female * non_host_weight
        # The non-host weight enters squared, i.e. it is treated as a
        # constant scale factor on the indicator variable.
        self.gender_nh_var += indicator_var * (non_host_weight ** 2)

    def add_face_identity(self, fi):
        """Fold one face-identity record into the per-identity totals.

        fi must support lookup by the keys 'identity_id' and 'probability'.
        """
        key = fi['identity_id']
        prob = fi['probability']
        prev_cnt, prev_var = self.identities.get(key, (0., 0.))
        self.identities[key] = (prev_cnt + prob, prev_var + prob * (1. - prob))

    def get(self):
        """Return the accumulated state as a flat 9-tuple.

        Order: start, end, male_cnt, female_cnt, gender_var, male_nh_cnt,
        female_nh_cnt, gender_nh_var, identities. The identities dict is
        returned by reference, not copied.
        """
        all_faces = (self.male_cnt, self.female_cnt, self.gender_var)
        non_host = (self.male_nh_cnt, self.female_nh_cnt, self.gender_nh_var)
        return (self.start, self.end) + all_faces + non_host + (self.identities,)


In [None]:
# video_id -> {shot_id -> ShotInfo}, seeded from the face-gender rows.
video_id_to_shots_to_info = {}

gender_rows = face_genders.select(
    'video_id', 'shot_id', 'min_frame', 'max_frame', 'fps',
    'male_probability', 'female_probability', 'host_probability'
).collect()

for fg in gender_rows:
    shots_to_info = video_id_to_shots_to_info.setdefault(fg['video_id'], {})
    shot_id = fg['shot_id']
    shot_info = shots_to_info.get(shot_id)
    if shot_info is None:
        # First face seen for this shot: convert its frame bounds to seconds.
        fps = fg['fps']
        shot_info = ShotInfo(fg['min_frame'] / fps, fg['max_frame'] / fps)
        shots_to_info[shot_id] = shot_info
    shot_info.add_face_gender(fg)

In [None]:
# Fold the face-identity rows into the per-shot accumulators built from the
# gender rows; a shot seen here for the first time is reported and created.
identity_rows = face_identities.select(
    'video_id', 'shot_id', 'min_frame', 'max_frame', 'fps',
    'identity_id', 'probability'
).collect()

for fi in identity_rows:
    shots_to_info = video_id_to_shots_to_info.setdefault(fi['video_id'], {})
    shot_id = fi['shot_id']
    shot_info = shots_to_info.get(shot_id)
    if shot_info is None:
        print('Weird: {} has no gender but has identities'.format(shot_id))
        # Convert the shot's frame bounds to seconds.
        fps = fi['fps']
        shot_info = ShotInfo(fi['min_frame'] / fps, fi['max_frame'] / fps)
        shots_to_info[shot_id] = shot_info
    shot_info.add_face_identity(fi)

In [None]:
# video_id -> list of (shot_id, *ShotInfo.get()) tuples ordered by shot start
# time (tuple index 1). sorted() is stable, so shots with equal start times
# keep their dict iteration order.
output_video_id_to_shots = {
    video_id: sorted(
        ((shot_id, *shot_info.get()) for shot_id, shot_info in s2i.items()),
        key=lambda shot: shot[1]
    )
    for video_id, s2i in video_id_to_shots_to_info.items()
}

In [None]:
# Pickle the full per-video shot table.
SHOT_TABLE_PATH = 'widget_data/shot_table.pkl'

if not OVERWRITE:
    # Only touch the filesystem check when overwriting is disallowed.
    if os.path.exists(SHOT_TABLE_PATH):
        raise Exception('File exists!')

with open(SHOT_TABLE_PATH, 'wb') as shot_table_file:
    pickle.dump(output_video_id_to_shots, shot_table_file)

In [None]:
# Pickle a truncated copy of the shot table containing only the first
# SMALL_SHOT_TABLE_MAX_VIDEOS videos (in dict iteration order).
SMALL_SHOT_TABLE_PATH = 'widget_data/shot_table.small.pkl'
SMALL_SHOT_TABLE_MAX_VIDEOS = 10000

if not OVERWRITE and os.path.exists(SMALL_SHOT_TABLE_PATH):
    raise Exception('File exists!')

# Build the subset BEFORE opening the output file: opening with 'wb'
# truncates any existing file, so a failure while building the subset must
# not leave an empty pickle behind.
small_output_video_id_to_shots = {}
for small_video_id, small_shot_list in output_video_id_to_shots.items():
    # Check the cap before inserting so that at most
    # SMALL_SHOT_TABLE_MAX_VIDEOS entries are kept (checking after the insert
    # with '>' let one extra video through).
    if len(small_output_video_id_to_shots) >= SMALL_SHOT_TABLE_MAX_VIDEOS:
        break
    small_output_video_id_to_shots[small_video_id] = small_shot_list

with open(SMALL_SHOT_TABLE_PATH, 'wb') as f:
    pickle.dump(small_output_video_id_to_shots, f)

# The subset is only needed for the dump; release the reference.
del small_output_video_id_to_shots

In [None]:
def get_video_name(p):
    """Return the final path component of p, cut at its first '.'.

    Everything from the first dot onward is dropped, so 'a/b/x.tar.gz'
    yields 'x'.
    """
    file_name = Path(p).name
    stem, _, _ = file_name.partition('.')
    return stem

UTC = timezone('UTC')
# NOTE(review): the tz database zone 'EST' is a fixed UTC-05:00 offset with no
# daylight saving. If US Eastern wall-clock time is what the widget should
# show, 'US/Eastern' may be the intended zone -- confirm before changing,
# since it would shift every summer timestamp by one hour.
EST = timezone('EST')
DATE_FORMAT = '%Y-%m-%d'
def get_date_hour_from_name(p):
    """Extract the air date and time of day from a video name.

    p is expected to look like '<channel>_<YYYYMMDD>_<HHMMSS>[_<rest>]'. The
    embedded timestamp is interpreted as UTC and converted to the EST zone
    defined above.

    Returns a tuple (date, minute_of_day): the date formatted with
    DATE_FORMAT, and the time of day as minutes since midnight
    (hour * 60 + minute) -- despite the "hour" in the function name.

    Raises ValueError if the name has fewer than three '_'-separated fields
    or the date/time fields do not parse.
    """
    # Only the 2nd and 3rd fields are used. Slicing instead of unpacking four
    # values also accepts names that have no trailing '_<rest>' segment.
    _, ymd, hms = p.split('_', 3)[:3]
    timestamp = datetime.strptime(ymd + hms, '%Y%m%d%H%M%S')
    timestamp_est = timestamp.replace(tzinfo=UTC).astimezone(tz=EST)
    # Sanity check that the timezone conversion actually shifted the hour.
    assert timestamp.hour != timestamp_est.hour
    return timestamp_est.strftime(DATE_FORMAT), timestamp_est.hour * 60 + timestamp_est.minute


# video id -> (video name, canonical show id, date string, minute of day)
# for every video flagged threeyears_dataset, pickled to VIDEO_TABLE_PATH.
video_id_to_info = {}
threeyears_videos = Video.objects.filter(threeyears_dataset=True)
for video_row in threeyears_videos.values('id', 'path', 'show__canonical_show__id'):
    name = get_video_name(video_row['path'])
    air_date, air_minute = get_date_hour_from_name(name)
    canonical_show_id = video_row['show__canonical_show__id']
    video_id_to_info[video_row['id']] = (name, canonical_show_id, air_date, air_minute)

VIDEO_TABLE_PATH = 'widget_data/video_table.pkl'

if not OVERWRITE:
    # Only touch the filesystem check when overwriting is disallowed.
    if os.path.exists(VIDEO_TABLE_PATH):
        raise Exception('File exists!')

with open(VIDEO_TABLE_PATH, 'wb') as video_table_file:
    pickle.dump(video_id_to_info, video_table_file)

In [None]:
# identity id -> identity name for every Identity row, pickled to
# IDENTITY_TABLE_PATH.
identity_id_to_name = {}
for identity in Identity.objects.all():
    identity_id_to_name[identity.id] = identity.name

IDENTITY_TABLE_PATH = 'widget_data/identity_table.pkl'

if not OVERWRITE:
    # Only touch the filesystem check when overwriting is disallowed.
    if os.path.exists(IDENTITY_TABLE_PATH):
        raise Exception('File exists!')

with open(IDENTITY_TABLE_PATH, 'wb') as identity_table_file:
    pickle.dump(identity_id_to_name, identity_table_file)

In [None]:
def get_canonical_show_info(c):
    """Return (display name, channel name) for canonical show c.

    The channel is taken from the first video the query returns for this
    show. Shows listed in MAJOR_CANONICAL_SHOWS keep their own name; every
    other show is labelled 'Other (<channel name>)'.
    """
    # NOTE(review): indexing [0] presumably raises if the show has no videos
    # at all -- confirm every CanonicalShow has at least one video.
    first_video = Video.objects.filter(show__canonical_show=c)[0]
    channel_name = first_video.channel.name
    if c.name not in MAJOR_CANONICAL_SHOWS:
        return 'Other ({})'.format(channel_name), channel_name
    return c.name, channel_name

# canonical show id -> (display name, channel name) for every CanonicalShow,
# pickled to CANON_SHOW_TABLE_PATH.
canonical_show_id_to_info = {}
for canonical_show in CanonicalShow.objects.all():
    canonical_show_id_to_info[canonical_show.id] = get_canonical_show_info(canonical_show)

CANON_SHOW_TABLE_PATH = 'widget_data/canonical_show_table.pkl'

if not OVERWRITE:
    # Only touch the filesystem check when overwriting is disallowed.
    if os.path.exists(CANON_SHOW_TABLE_PATH):
        raise Exception('File exists!')

with open(CANON_SHOW_TABLE_PATH, 'wb') as canon_show_table_file:
    pickle.dump(canonical_show_id_to_info, canon_show_table_file)