# We need to dump data for face genders and identities

In [6]:
import json
import os

from esper.spark_util import *

# Directory that receives the JSON files written by this notebook.
STORIES_DIR = '/app/data/stories-data/'

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(STORIES_DIR, exist_ok=True)

# Output files: one JSON document per data set, keyed by video id.
IDENTS_PATH = os.path.join(STORIES_DIR, 'identities_by_video.json')
GENDERS_PATH = os.path.join(STORIES_DIR, 'genders_by_video.json')

In [2]:
# gender_data = {
#     video_id: [
#         (gender_id, start_time, is_host)
#     ]
# }

# identity_data = {
#     video_id: [
#         (identity_id, start_time, is_host)
#     ]
# }

# Genders

In [3]:
# Load the per-face gender rows, then narrow them down to the ones to export.
all_face_genders = get_face_genders()
knn_gender_labeler_id = Labeler.objects.get(name='knn-gender').id

# A row is kept only when all three conditions hold:
#   * it was produced by the 'knn-gender' labeler,
#   * its height is at least 0.2 (units not shown here; presumably a
#     fraction of the frame height -- TODO confirm),
#   * it is not inside a commercial.
from_knn_labeler = all_face_genders.labeler_id == knn_gender_labeler_id
tall_enough = all_face_genders.height >= 0.2
# NOTE(review): `== False` is deliberate, assuming this is a Spark column
# expression (Python's `not` cannot be applied to one).
outside_commercial = all_face_genders.in_commercial == False
face_genders = all_face_genders.where(
    from_knn_labeler & tall_enough & outside_commercial
)

# Derived columns used by the export below:
#   is_host2   -- True when host_probability exceeds 0.5
#   start_time -- min_frame divided by fps (presumably seconds -- confirm)
host_flag = face_genders.host_probability > 0.5
face_genders = face_genders.withColumn('is_host2', host_flag)
first_frame_time = face_genders.min_frame / face_genders.fps
face_genders = face_genders.withColumn('start_time', first_frame_time)

Constructing host probability udf...
  148 canonical shows have hosts
  channel_id=1 has 36 unique hosts
  channel_id=2 has 50 unique hosts
  channel_id=3 has 91 unique hosts


In [7]:
# Pull the four exported columns to the driver, already ordered by
# (video_id, start_time) so each video's list comes out in time order.
gender_rows = (
    face_genders
    .select('video_id', 'gender_id', 'start_time', 'is_host2')
    .sort('video_id', 'start_time')
    .collect()
)

# video_id -> list of (gender_id, start_time, is_host) tuples.
video_to_fg = {}
for row in gender_rows:
    # The host flag is stored as 1/0 rather than as a boolean.
    entry = (row.gender_id, row.start_time, 1 if row.is_host2 else 0)
    video_to_fg.setdefault(row.video_id, []).append(entry)

# Serialize the whole mapping as a single JSON document.
print('Writing:', GENDERS_PATH)
with open(GENDERS_PATH, 'w') as out_file:
    json.dump(video_to_fg, out_file)
print('Done!')

# Identities

In [8]:
# Load the per-face identity rows, then narrow them down to the ones to export.
all_face_idents = get_face_identities()

# A row is kept only when its height is at least 0.2 (units not shown here;
# presumably a fraction of the frame height -- TODO confirm) and it is not
# inside a commercial.
tall_enough = all_face_idents.height >= 0.2
# NOTE(review): `== False` is deliberate, assuming this is a Spark column
# expression (Python's `not` cannot be applied to one).
outside_commercial = all_face_idents.in_commercial == False
face_idents = all_face_idents.where(tall_enough & outside_commercial)

# Derived columns used by the export below:
#   start_time -- min_frame divided by fps (presumably seconds -- confirm)
#   is_host2   -- True when host_probability exceeds 0.5
first_frame_time = face_idents.min_frame / face_idents.fps
face_idents = face_idents.withColumn('start_time', first_frame_time)
host_flag = face_idents.host_probability > 0.5
face_idents = face_idents.withColumn('is_host2', host_flag)

In [10]:
# Pull the four exported columns to the driver, already ordered by
# (video_id, start_time) so each video's list comes out in time order.
identity_rows = (
    face_idents
    .select('video_id', 'identity_id', 'start_time', 'is_host2')
    .sort('video_id', 'start_time')
    .collect()
)

# video_id -> list of (identity_id, start_time, is_host) tuples.
video_to_fi = {}
for row in identity_rows:
    # The host flag is stored as 1/0 rather than as a boolean.
    entry = (row.identity_id, row.start_time, 1 if row.is_host2 else 0)
    video_to_fi.setdefault(row.video_id, []).append(entry)

# Serialize the whole mapping as a single JSON document.
print('Writing:', IDENTS_PATH)
with open(IDENTS_PATH, 'w') as out_file:
    json.dump(video_to_fi, out_file)
print('Done!')

Writing: /app/data/stories-data/identities_by_video.json
