In [None]:
import pyspark.sql.functions as func
from datetime import timedelta, datetime
from collections import OrderedDict
import random

from esper.widget import *
from esper.prelude import *
from esper.spark_util import *
from esper.validation import *
from esper.plot_util import *
from esper.spark_identity import get_screen_time_by_canonical_show_spark
from esper.major_canonical_shows import MAJOR_CANONICAL_SHOWS

In [None]:
# Load per-face scores from CSV and add a boolean `is_black` column that is
# True when the score is at least 0.3.
# NOTE(review): presumably `score` is a classifier confidence and 0.3 a tuned
# threshold -- confirm against whatever produced black_face_ids.csv.
black_faces = spark.load_csv('/app/data/black_face_ids.csv')
black_faces = black_faces.withColumn('is_black', black_faces.score >= 0.3)

In [None]:
from datetime import tzinfo, timedelta, datetime, timezone
import copy

ZERO = timedelta(0)
HOUR = timedelta(hours=1)
SECOND = timedelta(seconds=1)

# UTC offsets for the platform's idea of local time, read from the `time`
# module. (May result in wrong values on historical times in
#  timezones where UTC offset and/or the DST rules had
#  changed in the past.)
import time as _time

STDOFFSET = timedelta(seconds = -_time.timezone)
if _time.daylight:
    DSTOFFSET = timedelta(seconds = -_time.altzone)
else:
    DSTOFFSET = STDOFFSET

DSTDIFF = DSTOFFSET - STDOFFSET

# A complete implementation of current DST rules for major US time zones.

def first_sunday_on_or_after(dt):
    """Return ``dt`` moved forward to the nearest Sunday.

    If ``dt`` already falls on a Sunday it is returned unchanged. Only whole
    days are added, so the time-of-day fields are carried over.
    """
    # weekday() numbers Monday as 0 and Sunday as 6.
    offset_days = 6 - dt.weekday()
    return dt + timedelta(offset_days) if offset_days else dt


# US DST Rules
#
# This is a simplified (i.e., wrong for a few cases) set of rules for US
# DST start and end times. For a complete and up-to-date set of DST rules
# and timezone definitions, visit the Olson Database (or try pytz):
# http://www.twinsun.com/tz/tz-link.htm
# http://sourceforge.net/projects/pytz/ (might not be up-to-date)
#
# In the US, since 2007, DST starts at 2am (standard time) on the second
# Sunday in March, which is the first Sunday on or after Mar 8.
DSTSTART_2007 = datetime(1, 3, 8, 2)
# and ends at 2am (DST time) on the first Sunday of Nov.
DSTEND_2007 = datetime(1, 11, 1, 2)
# From 1987 to 2006, DST used to start at 2am (standard time) on the first
# Sunday in April and to end at 2am (DST time) on the last
# Sunday of October, which is the first Sunday on or after Oct 25.
DSTSTART_1987_2006 = datetime(1, 4, 1, 2)
DSTEND_1987_2006 = datetime(1, 10, 25, 2)
# From 1967 to 1986, DST used to start at 2am (standard time) on the last
# Sunday in April (the one on or after April 24) and to end at 2am (DST time)
# on the last Sunday of October, which is the first Sunday
# on or after Oct 25.
DSTSTART_1967_1986 = datetime(1, 4, 24, 2)
DSTEND_1967_1986 = DSTEND_1987_2006

def us_dst_range(year):
    """Return the naive ``(start, end)`` datetimes of US DST for ``year``.

    The rule set is picked by era (2007 onward, 1987-2006, 1967-1986). For
    years before 1967 there is no DST: both elements are Jan 1 of that year,
    giving an empty range.
    """
    if year >= 2007:
        rule_start, rule_end = DSTSTART_2007, DSTEND_2007
    elif year >= 1987:
        rule_start, rule_end = DSTSTART_1987_2006, DSTEND_1987_2006
    elif year >= 1967:
        rule_start, rule_end = DSTSTART_1967_1986, DSTEND_1967_1986
    else:
        no_dst = datetime(year, 1, 1)
        return (no_dst, no_dst)

    # The rule constants carry a placeholder year of 1; move them into the
    # requested year, then roll forward to the Sunday the rule refers to.
    return (
        first_sunday_on_or_after(rule_start.replace(year=year)),
        first_sunday_on_or_after(rule_end.replace(year=year)),
    )


class USTimeZone(tzinfo):
    """US time zone with a fixed standard offset and DST taken from ``us_dst_range``.

    Instances are meant to be attached to datetimes as their ``tzinfo``.
    """

    def __init__(self, hours, reprname, stdname, dstname):
        """Store the standard-time UTC offset (in hours) and the display names."""
        self.stdoffset = timedelta(hours=hours)
        self.reprname = reprname
        self.stdname = stdname
        self.dstname = dstname

    def __repr__(self):
        return self.reprname

    def tzname(self, dt):
        """Return the DST name while DST applies to ``dt``, else the standard name."""
        if self.dst(dt):
            return self.dstname
        else:
            return self.stdname

    def utcoffset(self, dt):
        """Return the total UTC offset: the standard offset plus any DST shift."""
        return self.stdoffset + self.dst(dt)

    def dst(self, dt):
        """Return the DST adjustment (``HOUR`` or ``ZERO``) for local time ``dt``.

        ``None`` and naive datetimes yield ``ZERO``. In the ambiguous hour at
        the end of DST and the non-existent hour at its start, ``dt.fold``
        picks the interpretation.
        """
        if dt is None or dt.tzinfo is None:
            # An exception may be sensible here, in one or both cases.
            # It depends on how you want to treat them.  The default
            # fromutc() implementation (called by the default astimezone()
            # implementation) passes a datetime with dt.tzinfo is self.
            return ZERO
        assert dt.tzinfo is self
        start, end = us_dst_range(dt.year)
        # datetime.fold only exists on Python >= 3.6; treat it as 0 elsewhere
        # instead of raising AttributeError in the fold/gap hours.
        fold = getattr(dt, 'fold', 0)
        # Can't compare naive to aware objects, so strip the timezone from
        # dt first.
        dt = dt.replace(tzinfo=None)
        if start + HOUR <= dt < end - HOUR:
            # DST is in effect.
            return HOUR
        if end - HOUR <= dt < end:
            # Fold (an ambiguous hour): use fold to disambiguate.
            return ZERO if fold else HOUR
        if start <= dt < start + HOUR:
            # Gap (a non-existent hour): reverse the fold rule.
            return HOUR if fold else ZERO
        # DST is off.
        return ZERO

    def fromutc(self, dt):
        """Convert ``dt`` (UTC wall time tagged with this tzinfo) to local time.

        During the repeated hour at the end of DST the result is the second,
        standard-time occurrence, marked with ``fold=1`` where the interpreter
        supports it so that ``utcoffset()``/``tzname()`` on the result report
        standard time.
        """
        assert dt.tzinfo is self
        start, end = us_dst_range(dt.year)
        start = start.replace(tzinfo=self)
        end = end.replace(tzinfo=self)
        std_time = dt + self.stdoffset
        dst_time = std_time + HOUR
        if end <= dst_time < end + HOUR:
            # Repeated hour. Without fold=1 the result is indistinguishable
            # from the first (DST) occurrence and converts back to the wrong
            # UTC instant.
            try:
                return std_time.replace(fold=1)
            except TypeError:
                # Python < 3.6: datetime has no fold; keep the old behaviour.
                return std_time
        if std_time < start or dst_time >= end:
            # Standard time
            return std_time
        if start <= std_time < end - HOUR:
            # Daylight saving time
            return dst_time


# Ready-made zones for the four major contiguous-US time zones. The first
# argument is the standard-time offset from UTC in hours; the remaining ones
# are the repr name and the standard/DST abbreviations.
Eastern  = USTimeZone(-5, "Eastern",  "EST", "EDT")
Central  = USTimeZone(-6, "Central",  "CST", "CDT")
Mountain = USTimeZone(-7, "Mountain", "MST", "MDT")
Pacific  = USTimeZone(-8, "Pacific",  "PST", "PDT")

In [None]:
# Lookup tables from database ids to display values, loaded once up front so
# the CSV-writing cells below do not query the database per row.
channel_id_to_name = {c.id: c.name for c in Channel.objects.all()}
cshow_id_to_name = {c.id: c.name for c in CanonicalShow.objects.all()}
video_id_to_path = {v.id: v.path for v in Video.objects.all()}
# Hair colors use a hard-coded table instead of the database one: ids 2, 4 and
# 5 all collapse onto 'dark' and id 6 maps to the empty string.
# color_id_to_name = {c.id: c.name for c in HairColorName.objects.all()}
color_id_to_name = {1: 'blond', 2: 'dark', 3: 'white', 4: 'dark', 5: 'dark', 6: ''}
length_id_to_name = {c.id: c.name for c in HairLengthName.objects.all()}
clothing_id_to_name = {c.id: c.name for c in ClothingName.objects.all()}
gender_id_to_name = {g.id: g.name for g in Gender.objects.all()}

# channel id -> set of host *names* (despite the variable name, h.name is
# stored, which is what get_identity_is_host compares identity names against).
# NOTE(review): distinct('show__canonical_show') presumably yields one Video
# per canonical show, so a show is credited to that one video's channel only
# -- confirm this is intended.
channel_id_to_host_ids = defaultdict(set)
for v in Video.objects.distinct('show__canonical_show'):
    channel_id_to_host_ids[v.channel.id].update([
        h.name for h in v.show.canonical_show.hosts.all()
    ])

def get_identity_is_host(channel_id, ident_name):
    """Return True if ``ident_name`` is one of the hosts recorded for ``channel_id``.

    Raises AssertionError when the channel has no entry in
    ``channel_id_to_host_ids``.
    """
    assert channel_id in channel_id_to_host_ids
    hosts_on_channel = channel_id_to_host_ids[channel_id]
    return ident_name in hosts_on_channel

def get_date_hour_from_path(t):
    """Return the US Eastern ``(date, hour)`` encoded in a video path.

    The file name is expected to look like
    ``<prefix>_<YYYYMMDD>_<HHMMSS>_<rest>``; the timestamp is interpreted as
    UTC and only its hour is used (minutes and seconds are ignored).

    Returns a ``('YYYY-MM-DD', hour)`` tuple in Eastern time.
    """
    # Parse only the file name so that an underscore in a directory name
    # cannot shift the token positions.
    filename = t.rsplit('/', 1)[-1]
    tokens = filename.split('_')
    yyyymmdd, hhmmss = tokens[1], tokens[2]
    utc_start = datetime(int(yyyymmdd[:4]), int(yyyymmdd[4:6]), int(yyyymmdd[6:]),
                         int(hhmmss[:2]), tzinfo=timezone.utc)
    eastern_start = utc_start.astimezone(Eastern)
    date = '{:04}-{:02}-{:02}'.format(
        eastern_start.year, eastern_start.month, eastern_start.day)
    return date, eastern_start.hour

def get_isoweekday(date):
    """Return the ISO weekday (Monday=1 ... Sunday=7) of a ``YYYY-MM-DD`` string."""
    return datetime.strptime(date, '%Y-%m-%d').isoweekday()

def get_height_udf(sat, max_val=100):
    """Build a Spark UDF that saturates a height value.

    The UDF returns its input unchanged while it is below ``sat`` and
    ``max_val`` otherwise; the declared return type is IntegerType.
    """
    def height_udf(h):
        if h < sat:
            return h
        return max_val

    saturating_udf = func.udf(height_udf, IntegerType())
    return saturating_udf

# Sanity checks: 2018-09-07 is a Friday (ISO weekday 5), and a file stamped
# 02:00 on 2019-01-09 maps to 21:00 on 2019-01-08 in Eastern time.
assert get_isoweekday('2018-09-07') == 5
assert get_date_hour_from_path('tvnews/videos/MSNBCW_20190109_020000_Hardball_With_Chris_Matthews.mp4') == ('2019-01-08', 21)

In [None]:
# Gender predictions per face, restricted to the 'knn-gender' labeler.
face_genders = get_face_genders()
face_genders = face_genders.where(
    face_genders.labeler_id == Labeler.objects.get(name='knn-gender').id)
      
# Derived boolean group keys: host when host_probability > 0.5, and
# height_gte20 when height >= 0.2.
# NOTE(review): presumably `height` is a fraction of the frame height -- confirm.
face_genders = face_genders.withColumn('is_host2', face_genders.host_probability > 0.5)
face_genders = face_genders.withColumn('height_gte20', face_genders.height >= 0.2)

# Attach is_black with a left outer join so faces missing from black_faces
# are kept. Both sides have a face_id column, so the select names the
# face_genders one explicitly and otherwise keeps every face_genders column.
# NOTE(review): the 'face_genders.face_id' string assumes the DataFrame
# returned by get_face_genders() carries the alias 'face_genders' -- confirm.
face_genders = face_genders.join(
    black_faces, black_faces.face_id == face_genders.face_id, 'left_outer'
).select(
    *[c if c != 'face_id' else 'face_genders.face_id' for c in face_genders.columns], 
    black_faces.is_black
)
# Faces without a match in black_faces come out of the join as null; treat
# them as not black.
face_genders = face_genders.na.fill({'is_black': False})

# Old Gender Code

In [None]:
# face_genders_nh = face_genders.where(face_genders.host_probability < 0.5)
# screen_time_male, screen_time_female = OrderedDict(), OrderedDict()
# screen_time_nh_male, screen_time_nh_female = OrderedDict(), OrderedDict()

# print('Computing screen time across all channels')
# screen_time_male['All Channels'] = sum_over_column(
#     face_genders, 'duration', probability_column='male_probability'
# )
# screen_time_female['All Channels'] = sum_over_column(
#     face_genders, 'duration', probability_column='female_probability'
# )

# print('Computing screen time across all channels (non-hosts)')
# screen_time_nh_male['All Channels'] = sum_over_column(
#     face_genders_nh, 'duration', probability_column='male_probability'
# )
# screen_time_nh_female['All Channels'] = sum_over_column(
#     face_genders_nh, 'duration', probability_column='female_probability'
# )

# print('Computing screen time by channel')
# channel_id_map = {c.id : c.name for c in Channel.objects.all()}
# for k, v in sum_over_column(face_genders, 'duration', ['channel_id'], 
#                             probability_column='male_probability').items():
#     screen_time_male[channel_id_map[k[0]]] = v
# for k, v in sum_over_column(face_genders, 'duration', ['channel_id'], 
#                             probability_column='female_probability').items():
#     screen_time_female[channel_id_map[k[0]]] = v
    
# print('Computing screen time by channel (non-host)')
# for k, v in sum_over_column(face_genders_nh, 'duration', ['channel_id'], 
#                             probability_column='male_probability').items():
#     screen_time_nh_male[channel_id_map[k[0]]] = v
# for k, v in sum_over_column(face_genders_nh, 'duration', ['channel_id'], 
#                             probability_column='female_probability').items():
#     screen_time_nh_female[channel_id_map[k[0]]] = v

In [None]:
# sort_order = ['All Channels'] + [c.name for c in Channel.objects.all().order_by('name')]
# plot_binary_proportion_comparison(
#     ['Male', 'Female'], [screen_time_male, screen_time_female],
#     '', '', 'Proportion of Screen Time',
#     raw_data_to_label_fn=None, figsize=(5,5), sort_order=sort_order,
#     legend_loc=4,
#     save_path='figures/gender-10y-screen-time.pdf'
# )
# plot_binary_proportion_comparison(
#     ['Male', 'Female'], [screen_time_nh_male, screen_time_nh_female],
#     '', '', 'Proportion of Screen Time (Excluding Hosts)',
#     raw_data_to_label_fn=None, figsize=(5,5), sort_order=sort_order,
#     legend_loc=4,
#     save_path='figures/gender-10y-screen-time-no-host.pdf'
# )

In [None]:
# canonical_show_map = {c.id : c.name for c in CanonicalShow.objects.all() if c.name in MAJOR_CANONICAL_SHOWS}
# screen_time_male_by_show, screen_time_female_by_show = {}, {}
# for k, v in sum_over_column(face_genders, 'duration', ['canonical_show_id'],
#                             probability_column='male_probability').items():
#     if k[0] in canonical_show_map:
#         screen_time_male_by_show[canonical_show_map[k[0]]] = v
# for k, v in sum_over_column(face_genders, 'duration', ['canonical_show_id'], 
#                             probability_column='female_probability').items():
#     if k[0] in canonical_show_map:
#         screen_time_female_by_show[canonical_show_map[k[0]]] = v

In [None]:
# plot_binary_proportion_comparison(
#     ['Male', 'Female'], [screen_time_male_by_show, screen_time_female_by_show],
#     'Gender Distribution of Screen Time vs. Show', '', 'Proportion of Screen Time',
#     raw_data_to_label_fn=None, legend_loc=4,
#     baseline_series_names=[
#         'Baseline Male (Entire Dataset)', 
#         'Baseline Female (Entire Dataset)'
#     ],
#     baseline_data=[
#         screen_time_male['All Channels'][0],
#         screen_time_female['All Channels'][0]
#     ],
#     save_path='figures/gender-10y-screen-time-by-show.pdf',
#     figsize=(30, 10)
# )

In [None]:
# screen_time_nh_male_by_show, screen_time_nh_female_by_show = {}, {}
# for k, v in sum_over_column(face_genders_nh, 'duration', ['canonical_show_id'],
#                             probability_column='male_probability').items():
#     if k[0] in canonical_show_map:
#         screen_time_nh_male_by_show[canonical_show_map[k[0]]] = v
# for k, v in sum_over_column(face_genders_nh, 'duration', ['canonical_show_id'], 
#                             probability_column='female_probability').items():
#     if k[0] in canonical_show_map:
#         screen_time_nh_female_by_show[canonical_show_map[k[0]]] = v

In [None]:
# plot_binary_proportion_comparison(
#     ['Male (Excl. Hosts)', 'Female (Excl. Hosts)'], [screen_time_nh_male_by_show, screen_time_nh_female_by_show],
#     'Gender Distribution of Screen Time (Excluding Hosts) vs. Show', '', 'Proportion of Screen Time',
#     tertiary_series_names=['Male (Incl. Hosts)', 'Female (Incl. Hosts)'],
#     tertiary_data=[screen_time_male_by_show, screen_time_female_by_show],
# #     baseline_data=[
# #         screen_time_male['All Channels'][0],
# #         screen_time_female['All Channels'][0]
# #     ],
# #     baseline_series_names=[
# #         'Baseline Male (Entire Dataset)', 
# #         'Baseline Female (Entire Dataset)'
# #     ],
#     raw_data_to_label_fn=None, legend_loc=4,
#     save_path='figures/gender-10y-screen-time-by-show-no-host.pdf',
#     figsize=(30, 10)
# )

# Save Gender CSVs

In [None]:
# Columns to group by. The order matters: write_row in the CSV cell below
# unpacks each result key positionally in exactly this order.
gender_group_keys = [
    'video_id', 
    'channel_id', 
    'canonical_show_id', 
    'in_commercial', 
    'is_host2', 
    'height_gte20', 
    'haircolor_id', 
#     'clothing_id', 
#     'hairlength_id', 
    'is_black'
]

# Sum `duration` per group, once with each gender's probability column.
# The results are indexed by key tuple below; each value is read as
# (screentime_seconds, variance_seconds_sq) when the CSV is written.
print('Summing over male faces')
male_st = sum_over_column(
    face_genders, 'duration', gender_group_keys,
    probability_column='male_probability')

print('Summing over female faces')
female_st = sum_over_column(
    face_genders, 'duration', gender_group_keys,
    probability_column='female_probability')
print('Done!')

In [None]:
gender_outfile = '/app/data/gender_10y.csv'

# Imported in this cell so the cell does not depend on edits to the import cell.
import csv

# csv.writer quotes any field that contains a comma, a quote or a line break
# (for example a show name with a comma), which plain ','.join(...) did not.
# newline='' leaves line endings to the writer, which emits '\r\n' as before.
with open(gender_outfile, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\r\n')
    writer.writerow([
        'gender',
        'video_id',
#         'video_path',
        'channel',
        'canonical_show',
        'date',
        'hour',
        'isoweekday',
        'in_commercial',
        'is_host',
        'height_gte20',
        'hair_color',
#         'hair_length',
#         'clothing',
        'is_black',
        'screentime_seconds',
        'variance_seconds_sq'
    ])

    def write_row(g, k, v):
        """Write one CSV row for gender label ``g``.

        ``k`` is a group key in the order of ``gender_group_keys``; ``v[0]``
        and ``v[1]`` fill the screentime_seconds and variance_seconds_sq
        columns.
        """
        (
            video_id,
            channel_id,
            cshow_id,
            in_comm,
            is_host,
            bbox_height,
            color_id,
#             clothing_id,
#             length_id,
            is_black
        ) = k
        video_path = video_id_to_path[video_id]
        date, hour = get_date_hour_from_path(video_path)
        week_day = get_isoweekday(date)
        writer.writerow([
            g,
            str(video_id),
            channel_id_to_name[channel_id],
            cshow_id_to_name[cshow_id],
            date,
            str(hour),
            str(week_day),
            str(in_comm),
            str(is_host),
            str(bbox_height),
            color_id_to_name.get(color_id, ''),
#             length_id_to_name.get(length_id, ''),
#             clothing_id_to_name.get(clothing_id, ''),
            str(is_black),
            str(v[0]),
            str(v[1])
        ])

    row_count = 0
    total_rows = len(female_st) + len(male_st)
    # Male rows first, then female rows, with a progress line every 10000 rows.
    for gender_label, screen_time in (('m', male_st), ('f', female_st)):
        for k in screen_time:
            write_row(gender_label, k, screen_time[k])
            row_count += 1
            if row_count % 10000 == 0:
                print('  {} / {}'.format(row_count, total_rows))

    print('Wrote {} rows'.format(row_count))

# Identities

In [None]:
# Identity labels per face, including the identity name.
face_idents = get_face_identities(include_name=True)
# Gender predictions restricted to the 'knn-gender' labeler.
face_genders_basic = spark.load('query_facegender')
face_genders_basic = face_genders_basic.where(
    face_genders_basic.labeler_id == Labeler.objects.get(name='knn-gender').id)
# Attach the predicted gender_id; the left outer join keeps identity rows that
# have no gender prediction. Both sides have a face_id column, so the select
# names the identity-side one explicitly.
# NOTE(review): the 'face_identities.face_id' string assumes the DataFrame
# from get_face_identities() carries the alias 'face_identities', and that the
# alias still resolves after the first join/select -- confirm.
face_idents = face_idents.join(
    face_genders_basic.select('face_id', 'gender_id'), 
    face_idents.face_id == face_genders_basic.face_id, 'left_outer'
).select(
    *[c if c != 'face_id' else 'face_identities.face_id' for c in face_idents.columns], 
    face_genders_basic.gender_id
)
# Attach is_black the same way; faces absent from black_faces become False.
face_idents = face_idents.join(
    black_faces, black_faces.face_id == face_idents.face_id, 'left_outer'
).select(
    *[c if c != 'face_id' else 'face_identities.face_id' for c in face_idents.columns], 
    black_faces.is_black
)
face_idents = face_idents.na.fill({'is_black': False})
face_idents = face_idents.withColumn('height_gte20', face_idents.height >= 0.2)

# Columns to group by. The order matters: 'name' must stay first and
# 'gender_id', 'is_black' last, because later cells read keys positionally
# (k[0], k[-2], k[-1]) and unpack the whole key in this order.
ident_group_keys = [
    'name', 
    'video_id', 
    'channel_id', 
    'canonical_show_id', 
    'in_commercial', 
    'height_gte20',
    'haircolor_id', 
#     'hairlength_id', 
#     'clothing_id', 
    'gender_id',
    'is_black'
]

# Sum `duration` per group using the identity `probability` column.
print('Summing over identities')
ident_st = sum_over_column(
    face_idents, 'duration', ident_group_keys,
    probability_column='probability')
print('Done!')

In [None]:
print('Computing true genders')
def ident_to_true_gender(ident_st):
    """Pick one gender label per identity name by majority of summed value.

    ``ident_st`` maps key tuples (name first, gender id second to last) to
    sequences whose first element is the amount to count. Amounts labelled
    'M' are added and amounts labelled 'F' subtracted per name; other labels
    are ignored. Returns ``{name: 'M' or 'F'}``, with 'M' on a tie or when no
    labelled amount exists.
    """
    male_minus_female = {}
    for key in ident_st:
        person = key[0]
        label = gender_id_to_name.get(key[-2], '')
        # Register the name even if this row ends up contributing nothing.
        male_minus_female.setdefault(person, 0.)
        if label == 'M':
            male_minus_female[person] += ident_st[key][0]
        elif label == 'F':
            male_minus_female[person] -= ident_st[key][0]

    majority = {}
    for person, balance in male_minus_female.items():
        majority[person] = 'M' if balance >= 0 else 'F'
    return majority
# One gender label per identity name, decided over all of that name's rows.
name_to_true_gender = ident_to_true_gender(ident_st)

print('Computing true is_black')
def ident_to_true_is_black(ident_st):
    """Pick one is_black flag per identity name by majority of summed value.

    ``ident_st`` maps key tuples (name first, is_black flag last) to sequences
    whose first element is the amount to count. Amounts with a truthy flag are
    added and the others subtracted per name. Returns ``{name: bool}``, True
    when the balance is non-negative (so a tie counts as True).
    """
    balance_by_name = {}
    for key in ident_st:
        person = key[0]
        amount = ident_st[key][0]
        signed_amount = amount if key[-1] else -amount
        balance_by_name[person] = balance_by_name.get(person, 0.) + signed_amount
    return {person: balance >= 0 for person, balance in balance_by_name.items()}
# One is_black flag per identity name, decided over all of that name's rows.
name_to_true_is_black = ident_to_true_is_black(ident_st)
print('Done')

In [None]:
ident_outfile = '/app/data/identity_10y.csv'

# Imported in this cell so the cell does not depend on edits to the import cell.
import csv

# csv.writer quotes any field that contains a comma, a quote or a line break
# (identity and show names are free text), which plain ','.join(...) did not.
# newline='' leaves line endings to the writer, which emits '\r\n' as before.
with open(ident_outfile, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\r\n')
    writer.writerow([
        'name',
        'video_id',
#         'video_path',
        'channel',
        'canonical_show',
        'date',
        'hour',
        'isoweekday',
        'in_commercial',
        'height_gte20',
        'hair_color',
#         'hair_length',
#         'clothing',
        'pred_gender',
        'true_gender',
        'pred_is_black',
        'true_is_black',
        'is_host',
        'screentime_seconds',
        'variance_seconds_sq'
    ])

    def write_row(k, v):
        """Write one CSV row for an identity group.

        ``k`` is a group key in the order of ``ident_group_keys``; ``v[0]``
        and ``v[1]`` fill the screentime_seconds and variance_seconds_sq
        columns. The pred_* columns come from this row's key, the true_*
        columns from the per-name majority tables.
        """
        (
            name,
            video_id,
            channel_id,
            cshow_id,
            in_comm,
            bbox_height,
            color_id,
#             length_id,
#             clothing_id,
            gender_id,
            is_black,
        ) = k
        video_path = video_id_to_path[video_id]
        date, hour = get_date_hour_from_path(video_path)
        week_day = get_isoweekday(date)
        writer.writerow([
            name,
            str(video_id),
            channel_id_to_name[channel_id],
            cshow_id_to_name[cshow_id],
            date,
            str(hour),
            str(week_day),
            str(in_comm),
            str(bbox_height),
            color_id_to_name.get(color_id, ''),
#             length_id_to_name.get(length_id, ''),
#             clothing_id_to_name.get(clothing_id, ''),
            gender_id_to_name.get(gender_id, ''),
            name_to_true_gender.get(name, ''),
            str(is_black),
            str(name_to_true_is_black.get(name, '')),
            str(get_identity_is_host(channel_id, name)),
            str(v[0]),
            str(v[1])
        ])

    row_count = 0
    for k in ident_st:
        write_row(k, ident_st[k])
        row_count += 1
        # Progress line every 10000 rows.
        if row_count % 10000 == 0:
            print('  {} / {}'.format(row_count, len(ident_st)))

    print('Wrote {} rows'.format(row_count))

# Videos

In [None]:
# video_id -> list of (min_frame, max_frame) commercial intervals, collected
# to the driver so the UDF defined below can look frames up in a plain dict.
print('Computing video id to commercials')
VIDEO_ID_TO_COMMERCIALS = defaultdict(list)
for c in get_commercials().select('video_id', 'min_frame', 'max_frame').collect():
    VIDEO_ID_TO_COMMERCIALS[c.video_id].append(
        (c.min_frame, c.max_frame)
    )
    
def _annotate_frame_in_commercial(df):
    """Return ``df`` with a boolean ``in_commercial`` column added.

    ``df`` must have ``video_id`` and ``number`` columns. A frame is in a
    commercial when its number lies inside (inclusive) any interval recorded
    for its video in ``VIDEO_ID_TO_COMMERCIALS``.
    """
    assert ('video_id' in df.columns)
    assert ('number' in df.columns)

    def in_commercial_helper(video_id, frame_no):
        # .get() reads the defaultdict without inserting an empty entry for
        # videos that have no commercials.
        intervals = VIDEO_ID_TO_COMMERCIALS.get(video_id, ())
        return any(lo <= frame_no <= hi for lo, hi in intervals)

    in_commercial_udf = func.udf(in_commercial_helper, BooleanType())
    return df.withColumn('in_commercial', in_commercial_udf('video_id', 'number'))

# Persist the distinct ids of frames that contain at least one face with
# host_probability > 0.5. The `if True:` guards are manual switches for
# skipping the recomputation; note that `faces` assigned in the first guard is
# reused by the second, so the first must run before the second.
print('Computing frames with host')
if True:
    faces = get_faces()
    frames_w_host_df = faces.where(
        faces.host_probability > 0.5
    ).select('frame_id').distinct()
    spark.save('frames_w_host', frames_w_host_df)

# Persist the number of faces per frame. Frames without any face have no row
# here (they only show up as null after a left outer join).
print('Computing frames to face count')
if True:
    frames_face_count_df = faces.groupBy(
        'frame_id'
    ).agg(
        func.count(func.lit(1)).alias('face_count')
    )
    spark.save('frames_face_count', frames_face_count_df)

print('Done')

In [None]:
# Keep only videos that are neither duplicates nor corrupted.
videos = get_videos()
videos = videos.where((videos.duplicate == False) & (videos.corrupted == False))
frames = get_frames()
# has_host is a literal True on every saved row, so after the left outer join
# below it is null for frames without a host.
frames_with_host = spark.load('frames_w_host').withColumn('has_host', func.lit(True))
frames_face_count = spark.load('frames_face_count')
# Inner join to videos (drops frames of filtered-out videos); left outer joins
# to the two per-frame tables so frames without hosts or faces are kept.
# The final filter keeps frames whose number is a multiple of floor(fps * 3)
# for threeyears_dataset videos and of ceil(fps * 3) for the others.
# NOTE(review): presumably this mirrors how each dataset was sampled (about
# one frame every 3 seconds) -- confirm.
frames = frames.join(
    videos, frames.video_id == videos.id
).join(
    frames_with_host, frames.id == frames_with_host.frame_id, 'left_outer'
).join(
    frames_face_count, frames.id == frames_face_count.frame_id, 'left_outer'
).select(
    'frames.*',
    videos.channel_id,
    videos.canonical_show_id,
    frames_with_host.has_host,
    frames_face_count.face_count
).where(
    ((videos.threeyears_dataset == True) & (frames.number % func.floor(videos.fps * 3) == 0)) | \
    ((videos.threeyears_dataset == False) & (frames.number % func.ceil(videos.fps * 3) == 0))
)
# Adds the boolean in_commercial column used as a group key below.
frames = _annotate_frame_in_commercial(frames)

In [None]:
video_outfile = '/app/data/video_10y.csv'

# Imported in this cell so the cell does not depend on edits to the import cell.
import csv

print('Summing over frames')
# csv.writer quotes any field that contains a comma, a quote or a line break
# (for example a show name with a comma), which plain ','.join(...) did not.
# newline='' leaves line endings to the writer, which emits '\r\n' as before.
with open(video_outfile, 'w', newline='') as f:
    writer = csv.writer(f, lineterminator='\r\n')
    writer.writerow([
        'video_id',
#         'video_path',
        'channel',
        'canonical_show',
        'date',
        'hour',
        'isoweekday',
        'in_commercial',
        'host_onscreen',
        'num_faces_onscreen',
        'screentime_seconds'
    ])

    # Count the sampled frames per group and bring the result to the driver.
    frame_counts = frames.groupBy(
        'video_id', 'channel_id', 'canonical_show_id', 'in_commercial', 'has_host', 'face_count'
    ).agg(
        func.count(func.lit(1)).alias('frame_count')
    ).sort(
        'channel_id', 'canonical_show_id', 'video_id', 'in_commercial', 'has_host', 'face_count'
    ).collect()

    row_count = 0
    for row in frame_counts:
        video_id = row.video_id
        channel_id = row.channel_id
        cshow_id = row.canonical_show_id
        in_comm = row.in_commercial
        # has_host is null for frames without a host; `== True` turns that
        # into a plain False.
        has_host = row.has_host == True
        # face_count is null for frames without any detected face.
        face_count = row.face_count
        if face_count is None:
            face_count = 0
        video_path = video_id_to_path[video_id]
        date, hour = get_date_hour_from_path(video_path)
        week_day = get_isoweekday(date)
        writer.writerow([
            str(video_id),
            channel_id_to_name[channel_id],
            cshow_id_to_name[cshow_id],
            date,
            str(hour),
            str(week_day),
            str(in_comm),
            str(has_host),
            str(face_count),
            # Each counted frame is credited with 3 seconds of screen time.
            str(row.frame_count * 3)
        ])
        row_count += 1
        if row_count % 10000 == 0:
            print(row_count)
    print('Wrote {} rows'.format(row_count))