<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><span><a href="#Write-out-the-frame-table" data-toc-modified-id="Write-out-the-frame-table-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Write out the frame table</a></span></li><li><span><a href="#Write-out-the-video-table" data-toc-modified-id="Write-out-the-video-table-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Write out the video table</a></span></li><li><span><a href="#Write-out-the-identity-table" data-toc-modified-id="Write-out-the-identity-table-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Write out the identity table</a></span></li><li><span><a href="#Write-out-interval-sets" data-toc-modified-id="Write-out-interval-sets-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Write out interval sets</a></span><ul class="toc-item"><li><span><a href="#Commercials" data-toc-modified-id="Commercials-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Commercials</a></span></li><li><span><a href="#Faces" data-toc-modified-id="Faces-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Faces</a></span></li><li><span><a href="#Identity" data-toc-modified-id="Identity-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Identity</a></span></li></ul></li></ul></div>

In [None]:
import pyspark.sql.functions as func
import os
from collections import defaultdict
import random
import pickle
import json
from pytz import timezone
from pathlib import Path
from tqdm import tqdm

from django.db.models import F, ExpressionWrapper, FloatField, IntegerField

from esper.spark_util import *
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

# Root directory for the artifacts this notebook writes (JSON tables and
# binary interval sets).
WIDGET_DATA_DIR = '/app/data/widget-data'
# exist_ok=True replaces the racy "check, then create" pair and keeps the cell
# safe to re-run.
os.makedirs(WIDGET_DATA_DIR, exist_ok=True)

In [None]:
# When False, the cells that write videos.json and identity.json raise instead
# of replacing an existing file. The other outputs of this notebook do not
# consult this flag.
OVERWRITE = True

In [None]:
# Keep only the gender labels produced by the 'knn-gender' labeler for faces
# with height >= 0.2. The in_commercial filter is currently disabled, so faces
# that fall inside commercials are kept.
face_genders = get_face_genders()
knn_gender_labeler_id = Labeler.objects.get(name='knn-gender').id
is_knn_gender = face_genders['labeler_id'] == knn_gender_labeler_id
is_tall_enough = face_genders['height'] >= 0.2
face_genders = face_genders.where(is_knn_gender & is_tall_enough)

# Express the frame span of every face in seconds (frame index / fps).
face_genders = face_genders.withColumn(
    'start_time', face_genders['min_frame'] / face_genders['fps'])
face_genders = face_genders.withColumn(
    'end_time', face_genders['max_frame'] / face_genders['fps'])

In [None]:
# Keep only identity labels for faces with height >= 0.2. The in_commercial
# filter is currently disabled, so faces inside commercials are kept.
face_identities = get_face_identities()
face_identities = face_identities.where(face_identities['height'] >= 0.2)

# Express the frame span of every face in seconds (frame index / fps).
face_identities = face_identities.withColumn(
    'start_time', face_identities['min_frame'] / face_identities['fps'])
face_identities = face_identities.withColumn(
    'end_time', face_identities['max_frame'] / face_identities['fps'])

In [None]:
class FrameInfo(object):
    """Running gender and identity totals for the faces attached to one frame.

    Every face adds its probabilities as expected counts, plus the matching
    Bernoulli variance ``p * (1 - p)``.
    """

    def __init__(self, start, end):
        """Create an empty record for the span from ``start`` to ``end``."""
        self.start, self.end = start, end
        # Expected number of male / female faces.
        self.male_cnt, self.female_cnt = 0., 0.
        # A single variance is tracked for both genders; it is computed from
        # male_probability only (presumably female = 1 - male -- confirm).
        self.gender_var = 0.
        # The same three totals, weighted by (1 - host_probability).
        self.male_nh_cnt, self.female_nh_cnt = 0., 0.
        self.gender_nh_var = 0.
        # identity_id -> (expected count, variance)
        self.identities = {}

    def add_face_gender(self, fg):
        """Fold one gender-labelled face into the totals.

        ``fg`` is indexed by 'female_probability', 'male_probability' and
        'host_probability'.
        """
        p_female = fg['female_probability']
        p_male = fg['male_probability']
        p_not_host = 1. - fg['host_probability']
        bernoulli_var = (1. - p_male) * p_male

        # Totals over all faces.
        self.male_cnt += p_male
        self.female_cnt += p_female
        self.gender_var += bernoulli_var

        # Non-host totals: scaling a count by w scales its variance by w ** 2.
        self.male_nh_cnt += p_male * p_not_host
        self.female_nh_cnt += p_female * p_not_host
        self.gender_nh_var += bernoulli_var * (p_not_host ** 2)

    def add_face_identity(self, fi):
        """Fold one identity-labelled face into the per-identity totals.

        ``fi`` is indexed by 'identity_id' and 'probability'.
        """
        key = fi['identity_id']
        prob = fi['probability']
        count, variance = self.identities.get(key, (0., 0.))
        self.identities[key] = (count + prob, variance + prob * (1. - prob))

    def get(self):
        """Return the record as a flat tuple.

        Layout: (start, end, male_cnt, female_cnt, gender_var, male_nh_cnt,
        female_nh_cnt, gender_nh_var, identities).
        """
        gender_totals = (self.male_cnt, self.female_cnt, self.gender_var)
        non_host_totals = (
            self.male_nh_cnt, self.female_nh_cnt, self.gender_nh_var)
        return (
            (self.start, self.end)
            + gender_totals
            + non_host_totals
            + (self.identities,)
        )

In [None]:
# Group gender-labelled faces as video_id -> {min_frame -> FrameInfo}.
video_id_to_frames_to_info = {}

fg_query = face_genders.select(
    'video_id', 'min_frame', 'start_time', 'end_time',
    'male_probability', 'female_probability', 'host_probability'
)
# For a quick debugging run, cap the query: fg_query = fg_query.limit(1000)

for fg in fg_query.collect():
    frames_to_info = video_id_to_frames_to_info.setdefault(fg.video_id, {})
    # The first face seen on a frame fixes that frame's start/end times.
    if fg.min_frame not in frames_to_info:
        frames_to_info[fg.min_frame] = FrameInfo(fg.start_time, fg.end_time)
    frames_to_info[fg.min_frame].add_face_gender(fg)

In [None]:
# Add identity labels to the per-video, per-frame records that already hold
# the gender totals.
fi_query = face_identities.select(
    'video_id', 'min_frame', 'start_time', 'end_time',
    'identity_id', 'probability'
)
# For a quick debugging run, cap the query: fi_query = fi_query.limit(1000)

for fi in fi_query.collect():
    frames_to_info = video_id_to_frames_to_info.setdefault(fi.video_id, {})
    if fi.min_frame not in frames_to_info:
        # Unexpected: the frame was absent from the gender pass. Report it and
        # create the record anyway so the identity is not lost.
        print('Weird: {} has no gender but has identities'.format(fi.min_frame))
        frames_to_info[fi.min_frame] = FrameInfo(fi.start_time, fi.end_time)
    frames_to_info[fi.min_frame].add_face_identity(fi)

In [None]:
# Flatten every FrameInfo into a (min_frame, *FrameInfo.get()) row and order
# each video's rows by start time (index 1 of the row).
output_video_id_to_frames = {
    video_id: sorted(
        ((min_frame, *frame_info.get()) for min_frame, frame_info in f2i.items()),
        key=lambda row: row[1],
    )
    for video_id, f2i in video_id_to_frames_to_info.items()
}

# Write out the frame table

In [None]:
# One JSON file per video, named <video_id>.json, holding that video's frame rows.
FRAME_PER_VIDEO_DIR = os.path.join(WIDGET_DATA_DIR, 'frame_table')
# exist_ok=True replaces the racy "check, then create" pair and keeps the cell
# safe to re-run.
os.makedirs(FRAME_PER_VIDEO_DIR, exist_ok=True)

for video_id, frame_list in tqdm(output_video_id_to_frames.items()):
    video_file_path = os.path.join(FRAME_PER_VIDEO_DIR, '{}.json'.format(video_id))
    with open(video_file_path, 'w') as f:
        # Tuples are serialised as JSON arrays; non-string dict keys (the
        # identity ids) are written as JSON strings.
        json.dump(frame_list, f)

# Write out the video table

In [None]:
def get_video_name(p):
    """Return the final component of path ``p`` up to (not including) its first dot."""
    file_name = Path(p).name
    stem, _, _ = file_name.partition('.')
    return stem

UTC = timezone('UTC')
# NOTE(review): 'EST' is looked up as given; if pytz treats it as a fixed
# offset (no daylight saving), summer broadcasts are shifted by one hour
# relative to US Eastern local time -- confirm this is intended.
EST = timezone('EST')
DATE_FORMAT = '%Y-%m-%d'
def get_date_minute_from_name(p):
    """Derive the EST air date and minute-of-day from a video name.

    ``p`` must have the form ``<channel>_<YYYYMMDD>_<HHMMSS>_<rest>``; the
    timestamp is interpreted as UTC and converted to ``EST``.

    Returns a ``(date, minute)`` pair: the date formatted with ``DATE_FORMAT``
    and the number of minutes since midnight, both in ``EST``.

    Raises ValueError when the name has fewer than four '_'-separated fields
    or the date/time fields do not parse.
    """
    # Imported locally so the function does not depend on `datetime` leaking
    # into the namespace through a star import.
    import datetime

    # Only the date and time fields are needed; the channel and the trailing
    # part of the name are ignored.
    _, ymd, hms, _ = p.split('_', 3)
    timestamp = datetime.datetime.strptime(ymd + hms, '%Y%m%d%H%M%S')
    timestamp_est = timestamp.replace(tzinfo=UTC).astimezone(tz=EST)
    # Sanity check that the conversion actually shifted the clock.
    assert timestamp.hour != timestamp_est.hour
    return timestamp_est.strftime(DATE_FORMAT), timestamp_est.hour * 60 + timestamp_est.minute

VIDEO_TABLE_PATH = os.path.join(WIDGET_DATA_DIR, 'videos.json')
# Fail fast: refuse to clobber an existing table *before* running the query.
if not OVERWRITE and os.path.exists(VIDEO_TABLE_PATH):
    raise Exception('File exists!')

# One row per video that is neither a duplicate nor corrupted:
# (id, name, canonical show, channel, date, minute of day, num_frames, fps,
#  width, height). Date and minute are parsed from the file name.
video_data = []
for v in Video.objects.filter(
    duplicate=False, corrupted=False,
).values(
    'id', 'path', 'show__canonical_show__name', 'channel__name', 'num_frames', 'fps', 'width', 'height'
):
    video_name = get_video_name(v['path'])
    video_date, video_minute = get_date_minute_from_name(video_name)
    video_data.append((
        v['id'],
        video_name,
        v['show__canonical_show__name'],
        v['channel__name'],
        video_date,
        video_minute,
        v['num_frames'],
        v['fps'],
        v['width'],
        v['height']
    ))

with open(VIDEO_TABLE_PATH, 'w') as f:
    json.dump(video_data, f)

print('Done!')

# Write out the identity table

In [None]:
IDENTITY_TABLE_PATH = os.path.join(WIDGET_DATA_DIR, 'identity.json')
# Fail fast: refuse to clobber an existing table *before* running the query.
if not OVERWRITE and os.path.exists(IDENTITY_TABLE_PATH):
    raise Exception('File exists!')

# (id, name) pairs for every identity. values_list fetches just these two
# columns instead of instantiating a full model object per row.
identity_data = list(Identity.objects.values_list('id', 'name'))

with open(IDENTITY_TABLE_PATH, 'w') as f:
    json.dump(identity_data, f)

print('Done!')

# Write out interval sets

In [None]:
class IntervalSetMappingWriter(object):
    """Writes ``id -> intervals`` records to a binary file.

    Record layout (every field an unsigned 32-bit little-endian integer):
    id, number of intervals, then a (start, end) pair per interval.
    Usable as a context manager; the file is closed on exit.
    """

    def __init__(self, path):
        """Open ``path`` for binary writing, truncating any existing file."""
        self._fp = open(path, 'wb')
        self._path = path

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Exceptions are not suppressed; the file is just closed.
        self.close()

    def __fmt_u32(self, v):
        """Encode ``v`` as 4 little-endian bytes (OverflowError if it does not fit)."""
        return v.to_bytes(4, byteorder='little')

    def write(self, id_, intervals):
        """Append one record: ``id_``, the interval count, then each (start, end)."""
        emit = self._fp.write
        encode = self.__fmt_u32
        emit(encode(id_))
        emit(encode(len(intervals)))
        for bounds in intervals:
            lower, upper = bounds
            emit(encode(lower))
            emit(encode(upper))

    def close(self):
        """Close the underlying file; calling it again is a no-op."""
        if self._fp is None:
            return
        self._fp.close()
        self._fp = None

    @property
    def path(self):
        """Path the writer was opened with."""
        return self._path
    
class IntervalAccumulator(object):
    """Merges a stream of intervals, each widened by ``fuzz`` on both sides.

    Intervals must be added in non-decreasing order of start; an interval that
    touches or overlaps the most recent one is merged into it.
    """

    def __init__(self, fuzz=250):
        """``fuzz`` is the amount subtracted from each start and added to each end."""
        self._intervals = None
        self._fuzz = fuzz

    def add(self, start, end):
        """Widen ``[start, end]`` by the fuzz and append or merge it."""
        assert start <= end
        lower = max(0, start - self._fuzz)  # never widen below zero
        upper = end + self._fuzz

        if not self._intervals:
            self._intervals = [(lower, upper)]
            return

        prev_lower, prev_upper = self._intervals[-1]
        if lower > prev_upper:
            # Disjoint from the most recent interval: start a new one.
            self._intervals.append((lower, upper))
        elif upper > prev_upper:
            # Overlaps and reaches further right: extend the most recent one.
            assert lower >= prev_lower
            assert prev_lower <= upper
            self._intervals[-1] = (prev_lower, upper)
        # Otherwise the new interval is already covered; nothing to do.

    def get(self):
        """Return the merged (start, end) list, or None if nothing was added."""
        return self._intervals

## Commercials

In [None]:
COMMERCIAL_INTERVAL_FILE = '/app/data/widget-data/commercials.bin'

# video id -> [(start_ms, end_ms), ...] for the 'haotian-commercials' labeler,
# skipping duplicate and corrupted videos. Frame numbers are converted to
# milliseconds by the database (frame / fps * 1000, cast to integer).
commercials_by_video_id = defaultdict(list)
for c in Commercial.objects.filter(
    labeler__name='haotian-commercials',
    video__duplicate=False, video__corrupted=False
).annotate(
    start_ms=ExpressionWrapper(F('min_frame') / F('video__fps') * 1000, output_field=IntegerField()),
    end_ms=ExpressionWrapper(F('max_frame') / F('video__fps') * 1000, output_field=IntegerField())
).values('video__id', 'start_ms', 'end_ms'):
    commercials_by_video_id[c['video__id']].append((c['start_ms'], c['end_ms']))

# The path is already absolute. The previous os.path.join(path, path) only
# worked because join discards everything before an absolute component.
with IntervalSetMappingWriter(COMMERCIAL_INTERVAL_FILE) as COMM_INTS:
    # Records are written in ascending video id, intervals in ascending order.
    for video_id in sorted(commercials_by_video_id):
        COMM_INTS.write(video_id, sorted(commercials_by_video_id[video_id]))
print('Done!')

## Faces

In [None]:
FACE_INTERVAL_DIR = '/app/data/widget-data/face'
os.makedirs(FACE_INTERVAL_DIR, exist_ok=True)

# One interval file per category; the category name is the file stem
# (e.g. 'male_host' -> male_host.bin).
FACE_CATEGORIES = (
    'all', 'male', 'female', 'host', 'nonhost',
    'male_host', 'male_nonhost', 'female_host', 'female_nonhost',
)


def _face_categories(male_probability, host_probability):
    """Return the categories one face contributes to (both thresholds at 0.5)."""
    gender = 'male' if male_probability >= 0.5 else 'female'
    role = 'host' if host_probability >= 0.5 else 'nonhost'
    return ('all', gender, role, '{}_{}'.format(gender, role))


def _flush_face_accumulators(video_id, accumulators, writers):
    """Write each non-empty accumulator of one video to its category file."""
    for category in FACE_CATEGORIES:
        intervals = accumulators[category].get()
        if intervals:
            writers[category].write(video_id, intervals)


face_writers = {}
try:
    for category in FACE_CATEGORIES:
        face_writers[category] = IntervalSetMappingWriter(
            os.path.join(FACE_INTERVAL_DIR, '{}.bin'.format(category)))

    # Millisecond, integer versions of the per-face start/end times.
    face_genders_int = face_genders
    face_genders_int = face_genders_int.withColumn(
        'start_ms', (face_genders_int.start_time * 1000).cast('int'))
    face_genders_int = face_genders_int.withColumn(
        'end_ms', (face_genders_int.end_time * 1000).cast('int'))

    # For a quick debugging run, restrict the input:
    # face_genders_int = face_genders_int.where(face_genders_int.video_id < 10)

    # Sorted so that each video's faces arrive together and in start order,
    # which is what IntervalAccumulator requires.
    fg_query = face_genders_int.select(
        'video_id', 'start_ms', 'end_ms',
        'male_probability', 'host_probability'
    ).sort('video_id', 'start_ms')

    n_videos_done = 0
    curr_video_id = None
    face_accumulators = None
    for fg in fg_query.collect():
        if fg.video_id != curr_video_id:
            # A new video starts: emit the finished one, then reset.
            if curr_video_id is not None:
                n_videos_done += 1
                if n_videos_done % 1000 == 0:
                    print('Processed {} videos'.format(n_videos_done))
                _flush_face_accumulators(
                    curr_video_id, face_accumulators, face_writers)

            curr_video_id = fg.video_id
            face_accumulators = {
                category: IntervalAccumulator() for category in FACE_CATEGORIES
            }

        for category in _face_categories(fg.male_probability, fg.host_probability):
            face_accumulators[category].add(fg.start_ms, fg.end_ms)

    # The last video is never followed by a change of video id.
    if curr_video_id is not None:
        _flush_face_accumulators(curr_video_id, face_accumulators, face_writers)
finally:
    # Close every file that was opened, even if the loop failed part-way.
    for face_writer in face_writers.values():
        face_writer.close()
print('Done!')

## Identity

In [None]:
IDENTITY_INTERVAL_DIR = '/app/data/widget-data/identity'
os.makedirs(IDENTITY_INTERVAL_DIR, exist_ok=True)

# Only confident identity labels contribute intervals.
face_identities_int = face_identities.where(face_identities.probability >= 0.5)
# Millisecond, integer versions of the per-face start/end times.
face_identities_int = face_identities_int.withColumn(
    'start_ms', (face_identities_int.start_time * 1000).cast('int'))
face_identities_int = face_identities_int.withColumn(
    'end_ms', (face_identities_int.end_time * 1000).cast('int'))

# For a quick debugging run, restrict the input:
# face_identities_int = face_identities_int.where(face_identities_int.video_id < 10)

# Sorted so that each (video, identity) group arrives together and in start
# order, which is what IntervalAccumulator requires.
fi_query = face_identities_int.select(
    'video_id', 'identity_id', 'start_ms', 'end_ms'
).sort('video_id', 'identity_id', 'start_ms')

# identity_id -> writer, opened lazily the first time an identity has intervals.
identity_writers = {}
def flush_idenity_accumulators(video_id, accumulators):
    """Write each identity's non-empty intervals for ``video_id`` to its file.

    The file is named after the lower-cased identity name.
    NOTE(review): two identities whose names differ only by case, or a name
    containing a path separator, would clobber or misplace a file -- confirm
    names are unique and path-safe.
    """
    for identity_id, identity_acc in accumulators.items():
        intervals = identity_acc.get()
        if intervals:
            if identity_id not in identity_writers:
                identity_writers[identity_id] = IntervalSetMappingWriter(
                    os.path.join(
                        IDENTITY_INTERVAL_DIR,
                        '{}.bin'.format(
                            Identity.objects.get(id=identity_id).name.lower()
                        )))
            identity_writers[identity_id].write(video_id, intervals)

n_videos_done = 0
curr_video_id = None
try:
    for fi in fi_query.collect():
        if fi.video_id != curr_video_id:
            # A new video starts: emit the finished one, then reset.
            if curr_video_id is not None:
                n_videos_done += 1
                if n_videos_done % 1000 == 0:
                    print('Processed {} videos'.format(n_videos_done))
                flush_idenity_accumulators(curr_video_id, curr_accumulators)

            curr_video_id = fi.video_id
            curr_accumulators = defaultdict(IntervalAccumulator)
        curr_accumulators[fi.identity_id].add(fi.start_ms, fi.end_ms)

    # The last video is never followed by a change of video id.
    if curr_video_id is not None:
        flush_idenity_accumulators(curr_video_id, curr_accumulators)
finally:
    # Close every file that was opened, even if the loop failed part-way.
    for iw in identity_writers.values():
        iw.close()
del identity_writers
print('Done!')

In [None]:
# Display the labelers whose name contains 'haotian' (the commercials cell
# filters on labeler__name='haotian-commercials').
Labeler.objects.filter(name__contains='haotian')