In [None]:
import csv
import random
from collections import OrderedDict
from datetime import timedelta, datetime

import pyspark.sql.functions as func

from esper.widget import *
from esper.prelude import *
from esper.spark_util import *
from esper.validation import *
from esper.plot_util import *
from esper.spark_identity import get_screen_time_by_canonical_show_spark
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

In [None]:
faces = get_faces()

# Restrict the gender labels to the ones produced by the 'rudecarnie' labeler.
face_genders = get_face_genders()
rudecarnie_labeler_id = Labeler.objects.get(name='rudecarnie').id
face_genders = face_genders.where(face_genders['labeler_id'] == rudecarnie_labeler_id)

# Derived grouping columns:
#   is_host2 - True when host_probability is above 0.5
#   height2  - height * 100, rounded up to the next multiple of 10
face_genders = face_genders.withColumn(
    'is_host2', face_genders['host_probability'] > 0.5
)
face_genders = face_genders.withColumn(
    'height2', func.ceil(face_genders['height'] * 100 / 10) * 10
)

# Old Gender Code

In [None]:
# Non-host subset: faces whose host_probability is below 0.5.
# NOTE(review): 'is_host2' uses "> 0.5", so a face at exactly 0.5 is not a
# host there but is also excluded here - confirm this is intended.
face_genders_nh = face_genders.where(face_genders['host_probability'] < 0.5)

# Result containers keyed by series name (only 'All Channels' is filled here).
screen_time_male = OrderedDict()
screen_time_female = OrderedDict()
screen_time_nh_male = OrderedDict()
screen_time_nh_female = OrderedDict()

print('Computing screen time across all channels')
screen_time_male['All Channels'] = sum_over_column(
    face_genders, 'duration', probability_column='male_probability')
screen_time_female['All Channels'] = sum_over_column(
    face_genders, 'duration', probability_column='female_probability')

print('Computing screen time across all channels (non-hosts)')
screen_time_nh_male['All Channels'] = sum_over_column(
    face_genders_nh, 'duration', probability_column='male_probability')
screen_time_nh_female['All Channels'] = sum_over_column(
    face_genders_nh, 'duration', probability_column='female_probability')

# print('Computing screen time by channel')
# channel_id_map = {c.id : c.name for c in Channel.objects.all()}
# for k, v in sum_over_column(face_genders, 'duration', ['channel_id'], 
#                             probability_column='male_probability').items():
#     screen_time_male[channel_id_map[k[0]]] = v
# for k, v in sum_over_column(face_genders, 'duration', ['channel_id'], 
#                             probability_column='female_probability').items():
#     screen_time_female[channel_id_map[k[0]]] = v
    
# print('Computing screen time by channel (non-host)')
# for k, v in sum_over_column(face_genders_nh, 'duration', ['channel_id'], 
#                             probability_column='male_probability').items():
#     screen_time_nh_male[channel_id_map[k[0]]] = v
# for k, v in sum_over_column(face_genders_nh, 'duration', ['channel_id'], 
#                             probability_column='female_probability').items():
#     screen_time_nh_female[channel_id_map[k[0]]] = v

In [None]:
# sort_order = ['All Channels'] + [c.name for c in Channel.objects.all().order_by('name')]
# plot_binary_proportion_comparison(
#     ['Male', 'Female'], [screen_time_male, screen_time_female],
#     '', '', 'Proportion of Screen Time',
#     raw_data_to_label_fn=None, figsize=(5,5), sort_order=sort_order,
#     legend_loc=4,
#     save_path='figures/gender-10y-screen-time.pdf'
# )
# plot_binary_proportion_comparison(
#     ['Male', 'Female'], [screen_time_nh_male, screen_time_nh_female],
#     '', '', 'Proportion of Screen Time (Excluding Hosts)',
#     raw_data_to_label_fn=None, figsize=(5,5), sort_order=sort_order,
#     legend_loc=4,
#     save_path='figures/gender-10y-screen-time-no-host.pdf'
# )

In [None]:
# id -> name, restricted to the shows listed in MAJOR_CANONICAL_SHOWS.
canonical_show_map = {
    show.id: show.name
    for show in CanonicalShow.objects.all()
    if show.name in MAJOR_CANONICAL_SHOWS
}

# Per-show totals, keyed by show name. The grouped result is keyed by a tuple
# whose first element is the canonical_show_id; other shows are dropped.
screen_time_male_by_show = {
    canonical_show_map[group_key[0]]: total
    for group_key, total in sum_over_column(
        face_genders, 'duration', ['canonical_show_id'],
        probability_column='male_probability'
    ).items()
    if group_key[0] in canonical_show_map
}
screen_time_female_by_show = {
    canonical_show_map[group_key[0]]: total
    for group_key, total in sum_over_column(
        face_genders, 'duration', ['canonical_show_id'],
        probability_column='female_probability'
    ).items()
    if group_key[0] in canonical_show_map
}

In [None]:
# Male vs. female screen time per major show, with the dataset-wide totals
# drawn as baselines. Element [0] of each 'All Channels' entry is passed as
# the baseline value (presumably the summed screen time - confirm against
# sum_over_column).
plot_binary_proportion_comparison(
    ['Male', 'Female'],
    [screen_time_male_by_show, screen_time_female_by_show],
    'Gender Distribution of Screen Time vs. Show',
    '',
    'Proportion of Screen Time',
    baseline_data=[
        screen_time_male['All Channels'][0],
        screen_time_female['All Channels'][0],
    ],
    baseline_series_names=[
        'Baseline Male (Entire Dataset)',
        'Baseline Female (Entire Dataset)',
    ],
    raw_data_to_label_fn=None,
    legend_loc=4,
    figsize=(30, 10),
    save_path='figures/gender-10y-screen-time-by-show.pdf',
)

In [None]:
# Same per-show totals as for all faces, but computed over the non-host
# subset (face_genders_nh). Shows missing from canonical_show_map are dropped.
screen_time_nh_male_by_show = {
    canonical_show_map[group_key[0]]: total
    for group_key, total in sum_over_column(
        face_genders_nh, 'duration', ['canonical_show_id'],
        probability_column='male_probability'
    ).items()
    if group_key[0] in canonical_show_map
}
screen_time_nh_female_by_show = {
    canonical_show_map[group_key[0]]: total
    for group_key, total in sum_over_column(
        face_genders_nh, 'duration', ['canonical_show_id'],
        probability_column='female_probability'
    ).items()
    if group_key[0] in canonical_show_map
}

In [None]:
# Per-show gender split excluding hosts, with the host-inclusive numbers
# supplied as the tertiary series for comparison.
plot_binary_proportion_comparison(
    ['Male (Excl. Hosts)', 'Female (Excl. Hosts)'],
    [screen_time_nh_male_by_show, screen_time_nh_female_by_show],
    'Gender Distribution of Screen Time (Excluding Hosts) vs. Show',
    '',
    'Proportion of Screen Time',
    tertiary_data=[screen_time_male_by_show, screen_time_female_by_show],
    tertiary_series_names=['Male (Incl. Hosts)', 'Female (Incl. Hosts)'],
    # Dataset-wide baselines are currently disabled for this figure:
    #     baseline_data=[
    #         screen_time_male['All Channels'][0],
    #         screen_time_female['All Channels'][0]
    #     ],
    #     baseline_series_names=[
    #         'Baseline Male (Entire Dataset)',
    #         'Baseline Female (Entire Dataset)'
    #     ],
    raw_data_to_label_fn=None,
    legend_loc=4,
    figsize=(30, 10),
    save_path='figures/gender-10y-screen-time-by-show-no-host.pdf',
)

# Save Gender CSVs

In [None]:
# In-memory lookup tables (primary key -> display value) used when writing
# the CSV rows, so that no per-row database query is needed.
channel_id_to_name = {
    channel.id: channel.name for channel in Channel.objects.all()
}
cshow_id_to_name = {
    cshow.id: cshow.name for cshow in CanonicalShow.objects.all()
}
video_id_to_path = {
    video.id: video.path for video in Video.objects.all()
}
color_id_to_name = {
    color.id: color.name for color in HairColorName.objects.all()
}
clothing_id_to_name = {
    clothing.id: clothing.name for clothing in ClothingName.objects.all()
}

def get_date_hour_from_path(t):
    """Extract the air date and hour encoded in a video path.

    The file name is expected to have '_'-separated fields of the form
    ``<prefix>_<YYYYMMDD>_<HHMMSS>_<rest>``, e.g.
    ``MSNBCW_20180821_230000_Hardball_With_Chris_Matthews.mp4``.

    Only the final path component is parsed, so underscores in directory
    names (e.g. ``/data/tv_news/...``) cannot shift the date/time fields.

    Args:
        t: Video path, with or without leading directories ('/'-separated).

    Returns:
        A ``(date, hour)`` tuple: ``date`` is a 'YYYY-MM-DD' string and
        ``hour`` is the integer hour taken from the HHMMSS field.

    Raises:
        IndexError: If the file name has fewer than three '_' fields.
        ValueError: If the hour digits are not an integer.
    """
    # Drop any directory prefix before tokenizing.
    filename = t.rsplit('/', 1)[-1]
    tokens = filename.split('_')
    yyyymmdd = tokens[1]
    hhmmss = tokens[2]
    hour = int(hhmmss[:2])
    date = yyyymmdd[:4] + '-' + yyyymmdd[4:6] + '-' + yyyymmdd[6:]
    return date, hour

def get_isoweekday(date):
    """Return the ISO weekday (Monday=1 ... Sunday=7) of a 'YYYY-MM-DD' date string."""
    return datetime.strptime(date, '%Y-%m-%d').isoweekday()

# Quick sanity checks for the path/date helpers (run whenever the cell runs).
assert get_date_hour_from_path(
    'tvnews/videos/MSNBCW_20180821_230000_Hardball_With_Chris_Matthews.mp4'
) == ('2018-08-21', 23)
assert get_isoweekday('2018-09-07') == 5

In [None]:
# Both sums use the same grouping; the resulting key tuples follow this order
# and are unpacked in the same order when the CSV is written.
gender_group_columns = [
    'video_id', 'channel_id', 'canonical_show_id', 'in_commercial',
    'is_host2', 'height2', 'haircolor_id', 'clothing_id',
]

print('Summing over male faces')
male_st = sum_over_column(
    face_genders, 'duration', list(gender_group_columns),
    probability_column='male_probability')

print('Summing over female faces')
female_st = sum_over_column(
    face_genders, 'duration', list(gender_group_columns),
    probability_column='female_probability')

In [None]:
gender_outfile = '/app/data/gender_10y.csv'

# Writes one CSV row per (gender, group key) from male_st / female_st.
# newline='' leaves line endings to csv.writer, which terminates rows with
# '\r\n' (the same terminator this cell wrote by hand before).
with open(gender_outfile, 'w', newline='') as f:
    # csv.writer quotes any field containing a comma, quote or newline, so a
    # channel/show name with a comma can no longer shift the columns.
    writer = csv.writer(f)
    writer.writerow([
        'gender',
        'video_id',
#         'video_path',
        'channel',
        'canonical_show',
        'date',
        'hour',
        'isoweekday',
        'in_commercial',
        'is_host',
        'bbox_height_leq',
        'hair_color',
        'clothing',
        'screentime_seconds',
        'variance_seconds_sq'
    ])

    def write_row(g, k, v):
        """Write one CSV row.

        g: gender tag ('m' or 'f').
        k: group key tuple, in the order of the grouping columns used to
           build male_st / female_st.
        v: indexable value; v[0] goes to 'screentime_seconds' and v[1] to
           'variance_seconds_sq'.
        """
        video_id, channel_id, cshow_id, in_comm, is_host, bbox_height, color_id, clothing_id = k
        video_path = video_id_to_path[video_id]
        date, hour = get_date_hour_from_path(video_path)
        week_day = get_isoweekday(date)
        writer.writerow([
            g,
            str(video_id),
#             video_path,
            channel_id_to_name[channel_id],
            cshow_id_to_name[cshow_id],
            date,
            str(hour),
            str(week_day),
            str(in_comm),
            str(is_host),
            str(bbox_height),
            # Ids without a known name are written as an empty field.
            color_id_to_name.get(color_id, ''),
            clothing_id_to_name.get(clothing_id, ''),
            str(v[0]),
            str(v[1])
        ])

    def sort_fn(k):
        """Sort key: (date, hour, video_id) derived from the key's video."""
        video_id = k[0]
        date, hour = get_date_hour_from_path(video_id_to_path[video_id])
        return date, hour, video_id

    row_count = 0
    for k in sorted(set(female_st) | set(male_st), key=sort_fn):
        # A key may yield two rows (male and female); count every row that
        # is actually written so the summary below is accurate.
        if k in male_st:
            write_row('m', k, male_st[k])
            row_count += 1
        if k in female_st:
            write_row('f', k, female_st[k])
            row_count += 1

    print('Wrote {} rows'.format(row_count))

# Identities

In [None]:
# Face identities with the identity name attached.
face_idents = get_face_identities(include_name=True)
# face_idents = face_idents.withColumn('date', func.date_format('time', 'yyyy-MM-dd'))
# face_idents = face_idents.withColumn('is_host2', face_idents.host_probability > 0.5)

# height2: height * 100, rounded up to the next multiple of 10.
face_idents = face_idents.withColumn(
    'height2', func.ceil(face_idents['height'] * 100 / 10) * 10
)

# The key tuples of ident_st follow the order of these grouping columns.
identity_group_columns = [
    'name', 'video_id', 'channel_id', 'canonical_show_id', 'in_commercial',
    'height2', 'haircolor_id', 'clothing_id',
]
ident_st = sum_over_column(
    face_idents, 'duration', identity_group_columns,
    probability_column='probability')

In [None]:
ident_outfile = '/app/data/identity_10y.csv'

# Writes one CSV row per group key in ident_st.
# newline='' leaves line endings to csv.writer, which terminates rows with
# '\r\n' (the same terminator this cell wrote by hand before).
with open(ident_outfile, 'w', newline='') as f:
    # csv.writer quotes any field containing a comma, quote or newline, so a
    # person/show name with a comma can no longer shift the columns.
    writer = csv.writer(f)
    # The header order must match the row order in write_row():
    # bbox_height_leq comes BEFORE hair_color and clothing (same layout as
    # the gender CSV). Previously the header listed it after them, which
    # mislabeled these three columns.
    writer.writerow([
        'name',
        'video_id',
#         'video_path',
        'channel',
        'canonical_show',
        'date',
        'hour',
        'isoweekday',
        'in_commercial',
        'bbox_height_leq',
        'hair_color',
        'clothing',
        'screentime_seconds',
        'variance_seconds_sq'
    ])

    def write_row(k, v):
        """Write one CSV row.

        k: group key tuple, in the order of the grouping columns used to
           build ident_st.
        v: indexable value; v[0] goes to 'screentime_seconds' and v[1] to
           'variance_seconds_sq'.
        """
        name, video_id, channel_id, cshow_id, in_comm, bbox_height, color_id, clothing_id = k
        video_path = video_id_to_path[video_id]
        date, hour = get_date_hour_from_path(video_path)
        week_day = get_isoweekday(date)
        writer.writerow([
            name,
            str(video_id),
#             video_path,
            channel_id_to_name[channel_id],
            cshow_id_to_name[cshow_id],
            date,
            str(hour),
            str(week_day),
            str(in_comm),
            str(bbox_height),
            # Ids without a known name are written as an empty field.
            color_id_to_name.get(color_id, ''),
            clothing_id_to_name.get(clothing_id, ''),
            str(v[0]),
            str(v[1])
        ])

    def sort_fn(k):
        """Sort key: (name, date, video_path)."""
        name, video_id = k[0], k[1]
        video_path = video_id_to_path[video_id]
        date, _hour = get_date_hour_from_path(video_path)
        return name, date, video_path

    print('Summing over identities')
    row_count = 0
    for k in sorted(ident_st, key=sort_fn):
        write_row(k, ident_st[k])
        row_count += 1

    print('Wrote {} rows'.format(row_count))

# Videos

In [None]:
print('Computing video id to commercials')
# video_id -> list of (min_frame, max_frame) commercial intervals, collected
# to the driver so the UDF can look them up.
VIDEO_ID_TO_COMMERCIALS = defaultdict(list)
commercial_rows = get_commercials().select(
    'video_id', 'min_frame', 'max_frame'
).collect()
for commercial in commercial_rows:
    frame_interval = (commercial.min_frame, commercial.max_frame)
    VIDEO_ID_TO_COMMERCIALS[commercial.video_id].append(frame_interval)
    
def _annotate_frame_in_commercial(df):
    """Return `df` with an added boolean 'in_commercial' column.

    A row is marked True when its 'number' falls inside (inclusive on both
    ends) any (min_frame, max_frame) interval recorded for its 'video_id' in
    VIDEO_ID_TO_COMMERCIALS, and False otherwise.

    Args:
        df: Spark DataFrame that has 'video_id' and 'number' columns.

    Returns:
        The DataFrame with the extra 'in_commercial' column.
    """
    for required_column in ('video_id', 'number'):
        assert (required_column in df.columns)

    def frame_in_commercial(video_id, frame_no):
        # .get() avoids inserting empty entries into the defaultdict.
        intervals = VIDEO_ID_TO_COMMERCIALS.get(video_id, ())
        return any(
            first <= frame_no <= last for first, last in intervals
        )

    in_commercial_udf = func.udf(frame_in_commercial, BooleanType())
    return df.withColumn(
        'in_commercial', in_commercial_udf('video_id', 'number')
    )

print('Computing frames with host')
if True:  # always runs; presumably a manual switch for skipping the recompute
    # Distinct ids of frames with at least one face above 0.5 host probability.
    host_faces = faces.where(faces['host_probability'] > 0.5)
    frames_w_host_df = host_faces.select('frame_id').distinct()
    spark.save('frames_w_host', frames_w_host_df)

print('Computing frames to face count')
if True:  # always runs; presumably a manual switch for skipping the recompute
    # Number of face rows per frame.
    faces_by_frame = faces.groupBy('frame_id')
    frames_face_count_df = faces_by_frame.agg(
        func.count(func.lit(1)).alias('face_count')
    )
    spark.save('frames_face_count', frames_face_count_df)

print('Done')

In [None]:
# Build the per-frame table used for the video-level CSV: sampled frames of
# non-duplicate, non-corrupted videos, annotated with channel, show, host
# presence, face count and commercial membership.
videos = get_videos()
# Keep only videos that are neither duplicates nor corrupted.
videos = videos.where((videos.duplicate == False) & (videos.corrupted == False))
frames = get_frames()
# Side tables saved under these names by the frame-statistics cell; every
# frame listed in 'frames_w_host' gets has_host=True.
frames_with_host = spark.load('frames_w_host').withColumn('has_host', func.lit(True))
frames_face_count = spark.load('frames_face_count')
frames = frames.join(
    # Inner join: frames of filtered-out videos are dropped.
    videos, frames.video_id == videos.id
).join(
    # Left outer joins: frames without a match keep a null has_host /
    # face_count (handled when the rows are written out).
    frames_with_host, frames.id == frames_with_host.frame_id, 'left_outer'
).join(
    frames_face_count, frames.id == frames_face_count.frame_id, 'left_outer'
).select(
    # NOTE(review): 'frames.*' presumably relies on get_frames() returning a
    # DataFrame aliased as 'frames' - confirm.
    'frames.*',
    videos.channel_id,
    videos.canonical_show_id,
    frames_with_host.has_host,
    frames_face_count.face_count
).where(
    # Keep frames whose number is a multiple of fps * 3, i.e. roughly one
    # frame every 3 seconds. The rounding of fps * 3 depends on the dataset:
    # floor for threeyears_dataset videos, ceil for the others.
    ((videos.threeyears_dataset == True) & (frames.number % func.floor(videos.fps * 3) == 0)) | \
    ((videos.threeyears_dataset == False) & (frames.number % func.ceil(videos.fps * 3) == 0))
)
# Adds the boolean 'in_commercial' column.
frames = _annotate_frame_in_commercial(frames)

In [None]:
video_outfile = '/app/data/video_10y.csv'

# Each kept frame stands for this many seconds of screen time: the frame
# filter keeps frames whose number is a multiple of fps * 3.
seconds_per_sampled_frame = 3

print('Summing over frames')
# newline='' leaves line endings to csv.writer, which terminates rows with
# '\r\n' (the same terminator this cell wrote by hand before).
with open(video_outfile, 'w', newline='') as f:
    # csv.writer quotes any field containing a comma, quote or newline, so a
    # channel/show name with a comma can no longer shift the columns.
    writer = csv.writer(f)
    writer.writerow([
        'video_id',
#         'video_path',
        'channel',
        'canonical_show',
        'date',
        'hour',
        'isoweekday',
        'in_commercial',
        'host_onscreen',
        'num_faces_onscreen',
        'screentime_seconds'
    ])

    # One output row per (video, channel, show, in_commercial, has_host,
    # face_count) group, with the number of sampled frames in that group.
    grouped_rows = frames.groupBy(
        'video_id', 'channel_id', 'canonical_show_id', 'in_commercial', 'has_host', 'face_count'
    ).agg(
        func.count(func.lit(1)).alias('frame_count')
    ).sort(
        'channel_id', 'canonical_show_id', 'video_id', 'in_commercial', 'has_host', 'face_count'
    ).collect()

    row_count = 0
    for row in grouped_rows:
        video_id = row.video_id
        channel_id = row.channel_id
        cshow_id = row.canonical_show_id
        in_comm = row.in_commercial
        # has_host is null for frames absent from the host table -> False.
        has_host = row.has_host == True
        # face_count is null for frames with no entry in the count table -> 0.
        face_count = row.face_count
        if face_count is None:
            face_count = 0
        video_path = video_id_to_path[video_id]
        date, hour = get_date_hour_from_path(video_path)
        week_day = get_isoweekday(date)
        writer.writerow([
            str(video_id),
#             video_path,
            channel_id_to_name[channel_id],
            cshow_id_to_name[cshow_id],
            date,
            str(hour),
            str(week_day),
            str(in_comm),
            str(has_host),
            str(face_count),
            str(row.frame_count * seconds_per_sampled_frame)
        ])
        row_count += 1
    print('Wrote {} rows'.format(row_count))