<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Write-out-the-frame-table" data-toc-modified-id="Write-out-the-frame-table-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Write out the frame table</a></span></li><li><span><a href="#Write-out-the-video-table" data-toc-modified-id="Write-out-the-video-table-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Write out the video table</a></span></li><li><span><a href="#Write-out-the-identity-table" data-toc-modified-id="Write-out-the-identity-table-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Write out the identity table</a></span></li></ul></div>

In [1]:
import pyspark.sql.functions as func
from collections import defaultdict
import random
import pickle
import json
from datetime import datetime
from pytz import timezone
from pathlib import Path
from tqdm import tqdm

from esper.spark_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

# Root directory under which this notebook writes its widget data tables.
WIDGET_DATA_DIR = '/app/data/widget-data'
# exist_ok=True creates the directory only when it is missing, which avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(WIDGET_DATA_DIR, exist_ok=True)

In [2]:
# When False, the video-table and identity-table cells raise instead of
# replacing an output file that already exists.
OVERWRITE = True

In [3]:
# Load the face gender rows and keep only those produced by the labeler named
# 'knn-gender', whose height column is at least 0.2, and which are not flagged
# as being inside a commercial.
face_genders = get_face_genders()
face_genders = face_genders.where(
    (face_genders.labeler_id == Labeler.objects.get(name='knn-gender').id) &
    (face_genders.height >= 0.2) &
    (face_genders.in_commercial == False)
)
# Turn the frame span into times by dividing by fps. The span begins at
# min_frame and finishes at max_frame, so start_time must come from min_frame
# (otherwise every span would have start_time > end_time).
face_genders = face_genders.withColumn('start_time', face_genders.min_frame / face_genders.fps)
face_genders = face_genders.withColumn('end_time', face_genders.max_frame / face_genders.fps)

Constructing host probability udf...
  148 canonical shows have hosts
  channel_id=1 has 36 unique hosts
  channel_id=2 has 50 unique hosts
  channel_id=3 has 91 unique hosts


In [4]:
# Load the face identity rows and keep only those not flagged as being inside
# a commercial and whose height column is at least 0.2.
face_identities = get_face_identities()
face_identities = face_identities.where(
    (face_identities.in_commercial == False) &
    (face_identities.height >= 0.2)
)
# Turn the frame span into times by dividing by fps. The span begins at
# min_frame and finishes at max_frame, so start_time must come from min_frame
# (otherwise every span would have start_time > end_time).
face_identities = face_identities.withColumn('start_time', face_identities.min_frame / face_identities.fps)
face_identities = face_identities.withColumn('end_time', face_identities.max_frame / face_identities.fps)

In [7]:
class FrameInfo(object):
    """Running sums of gender and identity probabilities for one frame span.

    Attributes:
        start, end: the span boundaries passed to the constructor.
        male_cnt, female_cnt: sums of the per-face gender probabilities.
        gender_var: sum of male_prob * (1 - male_prob) over the added faces.
        male_nh_cnt, female_nh_cnt, gender_nh_var: the same three sums with
            each face weighted by (1 - host_probability).
        identities: dict mapping identity_id -> (probability sum, variance sum).
    """

    def __init__(self, start, end):
        self.start, self.end = start, end

        # Sums over every face added to this span.
        # A single variance accumulator is kept for both genders; it would
        # equal the female term when the two probabilities sum to 1
        # (assumption, not checked here).
        self.male_cnt = self.female_cnt = self.gender_var = 0.

        # Same sums, weighted by the probability that the face is not a host.
        self.male_nh_cnt = self.female_nh_cnt = self.gender_nh_var = 0.

        self.identities = {}

    def add_face_gender(self, fg):
        """Fold one face gender record into the gender accumulators.

        fg must support lookup by the keys 'female_probability',
        'male_probability' and 'host_probability'.
        """
        p_female = fg['female_probability']
        p_male = fg['male_probability']
        non_host_weight = 1. - fg['host_probability']

        # Variance of an indicator variable with success probability p_male.
        indicator_var = (1. - p_male) * p_male

        self.male_cnt += p_male
        self.female_cnt += p_female
        self.gender_var += indicator_var

        # Scaling an indicator by a weight scales its variance by weight^2.
        self.male_nh_cnt += p_male * non_host_weight
        self.female_nh_cnt += p_female * non_host_weight
        self.gender_nh_var += indicator_var * (non_host_weight ** 2)

    def add_face_identity(self, fi):
        """Fold one face identity record into the per-identity accumulators.

        fi must support lookup by the keys 'identity_id' and 'probability'.
        """
        key = fi['identity_id']
        prob = fi['probability']
        running_cnt, running_var = self.identities.get(key, (0., 0.))
        self.identities[key] = (running_cnt + prob, running_var + prob * (1. - prob))

    def get(self):
        """Return all accumulators as one tuple, identities dict last."""
        gender_totals = (self.male_cnt, self.female_cnt, self.gender_var)
        non_host_totals = (self.male_nh_cnt, self.female_nh_cnt, self.gender_nh_var)
        return (self.start, self.end) + gender_totals + non_host_totals + (self.identities,)

In [8]:
# Nested mapping: video_id -> {min_frame -> FrameInfo}.
video_id_to_frames_to_info = {}

gender_columns = [
    'video_id', 'min_frame', 'start_time', 'end_time',
    'male_probability', 'female_probability', 'host_probability',
]
fg_query = face_genders.select(*gender_columns)
# Uncomment to work on a small sample while debugging:
# fg_query = fg_query.limit(1000)

# Pull every row to the driver and fold it into the FrameInfo for its
# (video_id, min_frame) pair, creating that FrameInfo on first sight.
for fg in fg_query.collect():
    per_video_frames = video_id_to_frames_to_info.setdefault(fg.video_id, {})
    if fg.min_frame not in per_video_frames:
        per_video_frames[fg.min_frame] = FrameInfo(fg.start_time, fg.end_time)
    per_video_frames[fg.min_frame].add_face_gender(fg)

In [9]:
identity_columns = [
    'video_id', 'min_frame', 'start_time', 'end_time',
    'identity_id', 'probability',
]
fi_query = face_identities.select(*identity_columns)
# Uncomment to work on a small sample while debugging:
# fi_query = fi_query.limit(1000)

# Fold each identity row into the FrameInfo for its (video_id, min_frame)
# pair. A span that the gender pass did not create is reported and then
# created here so its identities are still kept.
for fi in fi_query.collect():
    per_video_frames = video_id_to_frames_to_info.setdefault(fi.video_id, {})
    if fi.min_frame not in per_video_frames:
        print('Weird: {} has no gender but has identities'.format(fi.min_frame))
        per_video_frames[fi.min_frame] = FrameInfo(fi.start_time, fi.end_time)
    per_video_frames[fi.min_frame].add_face_identity(fi)

In [10]:
# Flatten every FrameInfo into a plain tuple prefixed with its min_frame, and
# order each video's tuples by start time (element 1 of the tuple).
output_video_id_to_frames = {}
for video_id, f2i in video_id_to_frames_to_info.items():
    frame_rows = [
        (min_frame, *frame_info.get())
        for min_frame, frame_info in f2i.items()
    ]
    output_video_id_to_frames[video_id] = sorted(frame_rows, key=lambda row: row[1])

# Write out the frame table

In [11]:
# FRAME_TABLE_PATH = os.path.join(WIDGET_DATA_DIR, 'frame_table.pkl')
# if not OVERWRITE and os.path.exists(FRAME_TABLE_PATH):
#     raise Exception('File exists!')
    
# with open(FRAME_TABLE_PATH, 'wb') as f:
#     pickle.dump(output_video_id_to_frames, f)

In [13]:
# The frame table is written as one JSON file per video, named
# '<video_id>.json', inside the 'frame_table' directory under WIDGET_DATA_DIR.
FRAME_PER_VIDEO_DIR = os.path.join(WIDGET_DATA_DIR, 'frame_table')
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(FRAME_PER_VIDEO_DIR, exist_ok=True)

for video_id, video_frames in tqdm(output_video_id_to_frames.items()):
    video_file_path = os.path.join(FRAME_PER_VIDEO_DIR, '{}.json'.format(video_id))
    # Honor OVERWRITE the same way the video and identity table cells do,
    # instead of silently replacing an existing file.
    if not OVERWRITE and os.path.exists(video_file_path):
        raise Exception('File exists!')
    with open(video_file_path, 'w') as f:
        json.dump(video_frames, f)

HBox(children=(IntProgress(value=0, max=181726), HTML(value='')))




# Write out the video table

In [3]:
from datetime import datetime
from django.db.models import F, ExpressionWrapper, FloatField

def get_video_name(p):
    """Return the final component of path p, cut off at its first '.'."""
    file_name = Path(p).name
    name_before_dot, _, _ = file_name.partition('.')
    return name_before_dot

UTC = timezone('UTC')
# NOTE(review): 'EST' looks like a fixed-offset zone with no daylight saving;
# confirm that this is intended rather than a DST-aware zone such as 'US/Eastern'.
EST = timezone('EST')
DATE_FORMAT = '%Y-%m-%d'
def get_date_minute_from_name(p):
    """Parse a video name into (date string, minute of the day) in the EST zone.

    p is split on '_' into four parts; the second and third parts are read as
    '%Y%m%d' and '%H%M%S' digits of a UTC timestamp. Raises ValueError when p
    has fewer than four '_'-separated parts or the digits do not parse.
    """
    _channel, date_digits, time_digits, _rest = p.split('_', 3)
    utc_naive = datetime.strptime(date_digits + time_digits, '%Y%m%d%H%M%S')
    local_time = utc_naive.replace(tzinfo=UTC).astimezone(tz=EST)
    # Sanity check that the conversion actually shifted the hour.
    assert utc_naive.hour != local_time.hour
    date_str = local_time.strftime(DATE_FORMAT)
    minute_of_day = local_time.hour * 60 + local_time.minute
    return date_str, minute_of_day


# One tuple per video that is neither a duplicate nor corrupted. The date and
# minute-of-day are derived from the file name rather than read from the database.
video_rows = Video.objects.filter(
    duplicate=False, corrupted=False,
).values(
    'id', 'path', 'show__canonical_show__name', 'channel__name', 'num_frames', 'fps', 'width', 'height'
)

video_data = []
for v in video_rows:
    video_name = get_video_name(v['path'])
    video_date, video_minute = get_date_minute_from_name(video_name)
    row = (
        v['id'], video_name,
        v['show__canonical_show__name'], v['channel__name'],
        video_date, video_minute,
        v['num_frames'], v['fps'], v['width'], v['height'],
    )
    video_data.append(row)
                      
# Serialize the video rows to videos.json, refusing to replace an existing
# file unless OVERWRITE is set.
VIDEO_TABLE_PATH = os.path.join(WIDGET_DATA_DIR, 'videos.json')
if not OVERWRITE:
    if os.path.exists(VIDEO_TABLE_PATH):
        raise Exception('File exists!')

with open(VIDEO_TABLE_PATH, 'w') as video_table_file:
    json.dump(video_data, video_table_file)

# Write out the identity table

In [17]:
# Collect an (id, name) pair for every Identity row.
identity_data = []
for identity in Identity.objects.all():
    identity_data.append((identity.id, identity.name))

# Serialize the pairs to identity.json, refusing to replace an existing file
# unless OVERWRITE is set.
IDENTITY_TABLE_PATH = os.path.join(WIDGET_DATA_DIR, 'identity.json')
if not OVERWRITE:
    if os.path.exists(IDENTITY_TABLE_PATH):
        raise Exception('File exists!')

with open(IDENTITY_TABLE_PATH, 'w') as identity_table_file:
    json.dump(identity_data, identity_table_file)