In [None]:
import query.datasets.prelude
reload(query.datasets.prelude)
from query.datasets.prelude import *
from query.datasets.tvnews.shot_detect import shot_detect, shot_stitch
from query.datasets.tvnews.face_detect import face_detect
from query.datasets.tvnews.face_embed import face_embed
from query.datasets.tvnews.pose_detect import pose_detect
from query.datasets.tvnews.identity_detect import identity_detect
from query.datasets.tvnews.animatedness import shot_frame_to_detect

In [None]:
def bulk_update_copy(self, objects):
    """Replace every row of the manager's table with `objects` in one transaction.

    The table is truncated and then refilled through `bulk_create_copy`, all
    between an explicit BEGIN and END.

    Args:
        self: a model manager; only `self.model._meta.db_table` is read here, and
            the manager is forwarded to `bulk_create_copy`.
        objects: the rows to load, forwarded unchanged to `bulk_create_copy`.

    Raises:
        Whatever TRUNCATE or `bulk_create_copy` raises. The transaction is rolled
        back first, so a failed (or interrupted) load does not leave the
        connection inside an open, half-applied transaction.
    """
    table = self.model._meta.db_table
    with connection.cursor() as cursor:
        cursor.execute('BEGIN')
        try:
            cursor.execute('TRUNCATE TABLE {}'.format(table))
            bulk_create_copy(self, objects, table=table)
        except BaseException:
            # BaseException on purpose: interrupting the kernel mid-load raises
            # KeyboardInterrupt, which must also undo the TRUNCATE.
            cursor.execute('ROLLBACK')
            raise
        cursor.execute('END')
        
# Replace the contents of the Shot table with every shot in `all_shots`, sorted by 'id'.
# NOTE(review): `all_shots` is read as a dict here (`.values()`), while the shot-detection
# cell further down binds `all_shots` from `shot_detect(...)` — confirm which kernel state
# this cell expects before running it.
with Timer('updating'):
    bulk_update_copy(Shot.objects, sorted(flatten(all_shots.values()), key=itemgetter('id')))
#Shot.objects.bulk_update([Shot(**d) for d in flatten(all_shots.values())], batch_size=50000, update_fields=['in_commercial'])

In [None]:
# Emit a debug-level marker message.
log.debug('End')

In [None]:
def load_commercials(all_commercials, videos_by_path=None):
    """Bulk-insert commercial intervals as Commercial rows.

    Args:
        all_commercials: dict mapping a video name (formatted into
            'tvnews/videos/{}.mp4') to an iterable of
            ((min_frame, _), (max_frame, _)) pairs. Only the first element of
            each endpoint is stored.
        videos_by_path: optional dict mapping `Video.path` to the Video. When
            omitted, the map is built from the database, so the function no
            longer depends on a kernel-global `all_videos` (which the other
            cells bind to a plain list, not a path-keyed dict).

    Raises:
        KeyError: if a video named in `all_commercials` has no matching path.
    """
    if videos_by_path is None:
        videos_by_path = {v.path: v for v in tqdm(Video.objects.all().order_by('id'))}

    labeler, _ = Labeler.objects.get_or_create(name='haotian-commercials')
    to_save = []
    for path, commercials in tqdm(all_commercials.iteritems()):
        video = videos_by_path['tvnews/videos/{}.mp4'.format(path)]
        for (min_frame, _), (max_frame, _) in commercials:
            to_save.append({
                'min_frame': min_frame,
                'max_frame': max_frame,
                'video_id': video.id,
                'labeler_id': labeler.id
            })
    Commercial.objects.bulk_create_copy(to_save)

In [None]:
# Load the pickled commercial intervals and insert them into the database.
# The file is opened in binary mode inside a context manager so the handle is
# closed deterministically instead of being leaked to the garbage collector.
with open('/app/commercial_dict_2.pkl', 'rb') as commercials_file:
    all_commercials = pickle.load(commercials_file)
load_commercials(all_commercials)

In [None]:
from query.datasets.tvnews.models import ThingType 
def load_segments(all_topics):
    """Load topic segments, the things they mention, and the segment/thing links.

    Runs three passes over `all_topics`:
      1. collect every distinct (name, type) pair and bulk-insert them as Things;
      2. bulk-insert one Segment per (start, end) interval;
      3. re-read Segments and Things to learn their ids and bulk-insert the
         many-to-many links.

    Args:
        all_topics: dict mapping a video name (formatted into
            'tvnews/videos/{}.mp4') to a dict of {(start, end): things}.
            `start` and `end` are multiplied by the video's fps to obtain frame
            numbers. `things` maps 'sentiment' to a (polarity, subjectivity)
            pair and each of 'subject', 'phrase', 'people', 'location',
            'organization' to a list of names (None entries are skipped).

    Raises:
        KeyError: if a video path is unknown, a `things` key is not one of the
            kinds above, or an interval has no 'sentiment' entry.
    """
    all_videos = {v.path: v for v in tqdm(Video.objects.all().order_by('id'))}

    types = {
        'subject': ThingType.TOPIC,
        'phrase': ThingType.PHRASE,
        'people': ThingType.PERSON,
        'location': ThingType.LOCATION,
        'organization': ThingType.ORGANIZATION
    }

    # Pass 1: distinct (name, type) pairs, kept in first-seen order.
    seen = set()
    things_to_save = []
    for path, segments in tqdm(all_topics.iteritems()):
        for (start, end), things in segments.iteritems():
            for name, ty in _iter_named_things(things, types):
                if (name, ty) not in seen:
                    seen.add((name, ty))
                    things_to_save.append({
                        'name': name,
                        'type': ty
                    })
    Thing.objects.bulk_create_copy(things_to_save)

    # Pass 2: one Segment per interval; sentiment is attached only when both values exist.
    log.debug('Creating segments')
    segments_to_save = []
    labeler, _ = Labeler.objects.get_or_create(name='haotian-segments')
    for path, segments in tqdm(all_topics.iteritems()):
        v = all_videos['tvnews/videos/{}.mp4'.format(path)]
        for (start, end), things in segments.iteritems():
            (polarity, subjectivity) = things['sentiment']
            min_frame, max_frame = _segment_frames(v, start, end)
            s = {
                'min_frame': min_frame,
                'max_frame': max_frame,
                'video_id': v.id,
                'labeler_id': labeler.id
            }
            if polarity is not None and subjectivity is not None:
                s['polarity'] = polarity
                s['subjectivity'] = subjectivity
            segments_to_save.append(s)
    Segment.objects.bulk_create_copy(segments_to_save)

    # Pass 3: look the rows back up by their natural keys to obtain ids for the links.
    log.debug('Creating links')
    # Restricted to this labeler: another labeler's segment with the same
    # (video, min_frame, max_frame) must not capture these links.
    cur_segments = {
        (s['video_id'], s['min_frame'], s['max_frame']): s
        for s in tqdm(Segment.objects.filter(labeler_id=labeler.id).values())
    }
    cur_things = {(t['name'], t['type']): t for t in tqdm(Thing.objects.all().values())}
    links_to_save = []
    for path, segments in tqdm(all_topics.iteritems()):
        v = all_videos['tvnews/videos/{}.mp4'.format(path)]
        for (start, end), things in segments.iteritems():
            min_frame, max_frame = _segment_frames(v, start, end)
            s = cur_segments[(v.id, min_frame, max_frame)]
            for name, ty in _iter_named_things(things, types):
                links_to_save.append({
                    'tvnews_segment_id': s['id'],
                    'tvnews_thing_id': cur_things[(name, ty)]['id']
                })

    bulk_create_copy(Segment.things.through.objects, links_to_save)


def _iter_named_things(things, types):
    """Yield (normalized_name, thing_type) for every non-sentiment name in `things`."""
    for kind, names in things.iteritems():
        if kind == 'sentiment':
            continue
        ty = types[kind]
        for name in names:
            if name is None:
                continue
            if ty == ThingType.PERSON:
                # Reorder the ', '-separated parts, e.g. "Last, First" -> "First Last".
                name = ' '.join(name.split(', ')[::-1]).strip()
            yield name.lower(), ty


def _segment_frames(video, start, end):
    """Convert a (start, end) interval to integer frame numbers using the video's fps."""
    return int(start * video.fps), int(end * video.fps)

In [None]:
# Load the pickled topic segments and insert them into the database.
# The file is opened in binary mode inside a context manager so the handle is
# closed deterministically instead of being leaked to the garbage collector.
with open('/app/topic_dict_res_2.pkl', 'rb') as topics_file:
    all_topics = pickle.load(topics_file)
load_segments(all_topics)

In [None]:
# Render a montage video from the first 100 videos (ordered by id) to /app/montage.mkv.
# NOTE(review): the meaning of the positional 0 and 1000 arguments is not visible here —
# presumably a start/end frame range; confirm against make_montage_video.
all_videos = list(tqdm(Video.objects.all().order_by('id')))
vids = all_videos[:100]
make_montage_video(vids, 0, 1000, '/app/montage.mkv', num_cols=10, width=1600, target_height=120)

In [None]:
# Run shot detection over every video, then pick one frame per shot for face detection.
with Timer('Detecting shots'):
    # Re-import the module so edits to shot_detect are picked up without a kernel restart.
    import query.datasets.tvnews.shot_detect
    reload(query.datasets.tvnews.shot_detect)
    from query.datasets.tvnews.shot_detect import shot_detect
    log.debug('Loading videos')
    all_videos = list(tqdm(Video.objects.all().order_by('id')))
    # NOTE(review): shot_indices presumably selects the videos that all_shots covers,
    # so shot_videos lines up with all_shots — confirm against shot_detect.
    shot_indices, all_shots, all_blackframes = shot_detect(all_videos)
    shot_videos = gather(all_videos, shot_indices)
    log.debug('Computing face frames to detect')
    # One list per video, holding shot_frame_to_detect(shot) for each of its shots.
    face_frame_per_shot = [[shot_frame_to_detect(shot) for shot in vid_shots]
                           for vid_shots in tqdm(all_shots)]

In [None]:
# Detect faces on the one-per-shot frames chosen by the shot-detection cell.
with Timer('Detecting sparse face'):
    # Re-import the module so edits to face_detect are picked up without a kernel restart.
    import query.datasets.tvnews.face_detect
    reload(query.datasets.tvnews.face_detect)
    from query.datasets.tvnews.face_detect import face_detect
    
    all_faces, indices = face_detect(shot_videos, face_frame_per_shot)
    # Narrow the per-video lists with the indices face_detect returned.
    # NOTE(review): presumably this keeps them aligned with all_faces — confirm.
    face_videos = gather(shot_videos, indices)
    face_shots = gather(all_shots, indices)
    face_frames = gather(face_frame_per_shot, indices)
    print(len(all_faces))

In [None]:
# Persist the sampled frames, then the faces detected on them.
with Timer('Saving faces'):
    # One dict per (video, frame number), nested per video so it can be zipped with all_faces.
    frames_to_save = [
        [{
            'video_id': video.id,
            'number': f
        } for f in frames]
        for (video, frames) in zip(face_videos, face_frames)
    ]
    Frame.objects.bulk_create_copy(flatten(frames_to_save))
    
    labeler, _ = Labeler.objects.get_or_create(name='mtcnn')
    # NOTE(review): frame['id'] is read below but never set in this cell, so this assumes
    # bulk_create_copy writes the new primary key back into each dict — confirm.
    # The face dicts inside all_faces are mutated in place.
    for (frames, vid_faces) in zip(frames_to_save, all_faces):
        for (frame, faces) in zip(frames, vid_faces):
            for face in faces:
                face['frame_id'] = frame['id']
                face['labeler_id'] = labeler.id
    Face.objects.bulk_create_copy(flatten(all_faces))

In [None]:
# Write a montage of the first video's sampled frames, passing its detected faces as bboxes.
make_montage(face_videos[0], face_frames[0], '/app/montage.jpg', bboxes=all_faces[0])

In [None]:
def output_name(video, frames):
    """Return the faces output name for `video`: its path, '_faces_', and the hash of the frame tuple."""
    frame_key = hash(tuple(frames))
    # join() concatenates exactly like `+`, so the result keeps the path's string type.
    return ''.join([video.path, '_faces_', str(frame_key)])

# One output name per (video, sampled frames) pair, in the same order as face_videos.
face_tables = []
for video, frames in tqdm(zip(face_videos, face_frames)):
    face_tables.append(output_name(video, frames))

In [None]:
# Run gender detection over the face videos/frames, passing the precomputed face_tables names.
with Timer('Gender faces'):
    # Re-import the module so edits to gender_detect are picked up without a kernel restart.
    import query.datasets.tvnews.gender_detect
    reload(query.datasets.tvnews.gender_detect)
    from query.datasets.tvnews.gender_detect import gender_detect
    
    gender_detect(face_videos, face_frames, face_tables)

In [None]:
# Compute face embeddings for the face videos/frames, passing the precomputed face_tables names.
with Timer("Embedding faces"):
    # Re-import the module so edits to face_embed are picked up without a kernel restart.
    import query.datasets.tvnews.face_embed
    reload(query.datasets.tvnews.face_embed)
    from query.datasets.tvnews.face_embed import face_embed

    face_embed(face_videos, face_frames, face_tables)

In [None]:
def output_name(video, frames):
    """Return the embeddings table name for `video`: its path, '_embeddings_', and the hash of the frame tuple."""
    frame_key = hash(tuple(frames))
    # join() concatenates exactly like `+`, so the result keeps the path's string type.
    return ''.join([video.path, '_embeddings_', str(frame_key)])

# Load the database metadata so table existence/commit status can be queried below.
with make_scanner_db() as db:
    db._load_db_metadata()

# Keep (index, table) for every video whose embeddings table exists and is committed.
# The table name (which hashes the full frame list) and the table handle are computed
# once per video instead of three and two times respectively.
committed_tables = []
for i, (video, frames) in tqdm(enumerate(zip(face_videos, face_frames))):
    table_name = output_name(video, frames)
    if not db.has_table(table_name):
        continue
    table = db.table(table_name)
    if table.committed():
        committed_tables.append((i, table))
indices, embed_tables = unzip(committed_tables)
print(len(indices))

# Narrow every per-video list to the videos that have committed embeddings.
embed_videos, embed_frames, embed_faces, embed_shots = (
    gather(face_videos, indices),
    gather(face_frames, indices),
    gather(all_faces, indices),
    gather(face_shots, indices),
)


In [None]:
# Create the SparkWrapper instance used by the later facefeatures cell.
spark = SparkWrapper()

In [None]:
with Timer("Embedding faces"):
    def load_embs():
        """Load the 'embeddings' column of every table in `embed_tables`.

        Returns the result of `par_for` over the tables; each table yields one
        entry per row, which is a list of EMBEDDING_SIZE-long float32 arrays
        (or [] for a row with no data).
        """
        log.debug('Loading embs')
        EMBEDDING_SIZE = 128

        def load(t):
            embs = list(t.column('embeddings').load())
            # A row's buffer holds zero or more embeddings back to back.
            arrays = [np.frombuffer(emb, dtype=np.float32) if emb is not None else [] for _, emb in embs]
            # Floor division keeps the section count an int regardless of division semantics.
            return [np.split(a, len(a) // EMBEDDING_SIZE) if len(a) > 0 else [] for a in arrays]
        return par_for(load, embed_tables, workers=32)

    # pcache.get is given the loader and a 'pickle' method under the key 'all_embs'.
    all_embs = pcache.get('all_embs', load_embs, method='pickle')


In [None]:
def load_flat_embs():
    """Return parallel sequences of face ids and embeddings, flattened over all videos.

    Steps:
      1. load the per-video/per-frame embeddings (via pcache, key 'all_embs');
      2. fetch the saved faces and group them by video id, then frame number;
      3. give each in-memory face the id of the saved face on the same
         video/frame whose `bbox_x1` is within EPSILON;
      4. zip faces with embeddings and unzip into (ids, embeddings).

    Raises:
        KeyError: if a video/frame has no saved faces, or (at step 4) a face
            has no 'id' because nothing matched and none was set beforehand.
    """
    EMBEDDING_SIZE = 128
    EPSILON = 0.0001

    with Timer("Embedding faces"):
        def load_embs():
            log.debug('Loading embs')

            def load(t):
                embs = list(t.column('embeddings').load())
                # A row's buffer holds zero or more embeddings back to back.
                arrays = [np.frombuffer(emb, dtype=np.float32) if emb is not None else [] for _, emb in embs]
                # Floor division keeps the section count an int regardless of division semantics.
                return [np.split(a, len(a) // EMBEDDING_SIZE) if len(a) > 0 else [] for a in arrays]
            return par_for(load, embed_tables, workers=32)

        all_embs = pcache.get('all_embs', load_embs, method='pickle')

    with Timer('Pinging db'):
        db_faces = list(Face.objects.all().values('id', 'person__frame__video__id', 'person__frame__number', 'bbox_x1'))
    # NOTE(review): `collect` presumably groups a list by the given key function,
    # giving {video id: {frame number: [face rows]}} — confirm against its definition.
    d1 = {k: collect(f, itemgetter('person__frame__number')) for k, f in collect(db_faces, itemgetter('person__frame__video__id')).iteritems()}

    for (video, vid_faces, vid_frames) in tqdm(zip(embed_videos, embed_faces, embed_frames)):
        for (frame_faces, frame) in zip(vid_faces, vid_frames):
            for face in frame_faces:
                # First saved face on this frame with (nearly) the same left edge wins.
                for face2 in d1[video.id][frame]:
                    if abs(face['bbox_x1'] - face2['bbox_x1']) < EPSILON:
                        face['id'] = face2['id']
                        break

    return unzip([
        (face['id'], emb)
        for (vid_faces, vid_embs) in tqdm(zip(embed_faces, all_embs))
        for (frame_faces, frame_embs) in zip(vid_faces, vid_embs)
        for (face, emb) in zip(frame_faces, frame_embs)
    ])
    
# Fetch the flattened face ids and embeddings through pcache under two keys, one method each
# ('pickle' for face_ids, 'numpy' for all_embs_flat); load_flat_embs is passed as the loader.
# NOTE(review): dtype/length are presumably options for the 'numpy' method — confirm.
face_ids, all_embs_flat = pcache.get(
    ('face_ids', 'all_embs_flat'), load_flat_embs, method=('pickle', 'numpy'), dtype=np.float32, length=128)

In [None]:
# with open('/app/anchor_test_txt.txt') as f:
#     paths = ['tvnews/videos/{}.mp4'.format(s.strip()) for s in f.readlines()]
    
vid_map = dict((v.path, i) for i, v in enumerate(embed_videos))
#indices = [vid_map[p] for p in paths]
indices = list(range(len(embed_videos)))
from itertools import izip

# Per video: its path plus one record per shot whose sampled frame has 1 to 3 faces.
# Each record pairs every face on that frame with its embedding.
for_haotian = []
for (video, shots, frames, faces, embs) in tqdm(izip(gather(embed_videos, indices),
                                                     gather(embed_shots, indices),
                                                     gather(embed_frames, indices),
                                                     gather(embed_faces, indices),
                                                     gather(all_embs, indices))):
    shot_records = []
    for s, frame, fr_faces, fr_embs in zip(shots, frames, faces, embs):
        if 1 <= len(fr_faces) <= 3:
            shot_records.append({
                'min_frame': s['min_frame'],
                'max_frame': s['max_frame'],
                'face_frame': frame,
                'faces': zip(fr_faces, fr_embs)
            })
    for_haotian.append({
        'video': video.path,
        'shots': shot_records
    })

In [None]:
# Write the export inside a context manager so the file is flushed and closed
# as soon as the dump finishes, rather than whenever the handle is collected.
with open('/app/for_haotian.pkl', 'wb') as out_file:
    pickle.dump(for_haotian, out_file)

In [None]:
def load_facefeatures():
    """Build a DataFrame (via spark.dicts_to_df) with one {'face_id', 'embedding'} row per face.

    Faces and embeddings are paired positionally: video by video, frame by
    frame, then face by face.
    """
    rows = []
    for vid_faces, vid_embs in tqdm(zip(embed_faces, all_embs)):
        for frame_faces, frame_embs in zip(vid_faces, vid_embs):
            for face, emb in zip(frame_faces, frame_embs):
                rows.append({'face_id': face['id'], 'embedding': emb})
    return spark.dicts_to_df(rows)

# Obtain the face-features DataFrame through spark.load, with load_facefeatures as the loader.
# The result was originally bound to the misspelled name `featues_df`, while the
# following cell reads `features_df`; bind the correct name and keep the old one as an alias.
features_df = spark.load('facefeatures', load_facefeatures)
featues_df = features_df

In [None]:
# FIXME: this expression is incomplete — distance.euclidean is given only one argument and
# the parentheses are unbalanced, so the cell cannot be parsed as written.
features_df.rdd.map(lambda d: distance.euclidean(d['embedding'], )

In [None]:
from itertools import izip

# Per video (first 1000 only, since izip stops at the shortest input): its path plus one
# record per shot, pairing every face on the shot's sampled frame with its embedding.
for_sahaj = []
for (video, shots, frames, faces, embs) in tqdm(
        izip(embed_videos[:1000], embed_shots, embed_frames, embed_faces, all_embs)):
    shot_records = []
    for s, frame, fr_faces, fr_embs in zip(shots, frames, faces, embs):
        shot_records.append({
            'min_frame': s['min_frame'],
            'max_frame': s['max_frame'],
            'face_frame': frame,
            'faces': zip(fr_faces, fr_embs)
        })
    for_sahaj.append({
        'video': video.path,
        'shots': shot_records
    })

In [None]:
# Write the export inside a context manager so the file is flushed and closed
# as soon as the dump finishes, rather than whenever the handle is collected.
with open('sahaj-data.pkl', 'wb') as out_file:
    pickle.dump(for_sahaj, out_file)

In [None]:
# Stitch shots using the per-video shots, frames, faces and embeddings; fetched through
# pcache under the key 'stitched_shots' with load_stitches as the loader.
with Timer("Stitching shots"):    
    def load_stitches():
        log.debug('Computing stitches')
        # Re-import the module so edits to shot_stitch are picked up without a kernel restart.
        import query.datasets.tvnews.shot_detect
        reload(query.datasets.tvnews.shot_detect)
        return query.datasets.tvnews.shot_detect.shot_stitch(embed_videos, embed_shots, embed_frames, embed_faces, all_embs)
    (stitched_shots, stitched_indices) = pcache.get('stitched_shots', load_stitches)

In [None]:
# NOTE(review): this rebinds all_embs to embed_faces, so stitched_embs below is gathered
# from the face dicts rather than from the embeddings loaded earlier, and all_embs stays
# overwritten for any cell run afterwards. Looks like a leftover workaround — confirm intent.
all_embs = embed_faces
stitched_videos = embed_videos
# For each of the three per-video lists, gather every video's entries with that
# video's element of stitched_indices.
stitched_frames, stitched_faces, stitched_embs = map(
    lambda t: [gather(l, idx) for l, idx in zip(t, stitched_indices)],
    (embed_frames, embed_faces, all_embs))

In [None]:
# `show` holds the id of the show row, not the Show object itself.
show = Show.objects.get(name='The Rachel Maddow Show').id
# Positions (within stitched_videos) of the videos belonging to that show.
indices = [i for i, video in enumerate(stitched_videos) if video.show_id == show]

with Timer('Detecting identities'):
    def load_identities():
        log.debug('Computing identities')
        # Re-import the module so edits to identity_detect are picked up without a kernel restart.
        import query.datasets.tvnews.identity_detect
        reload(query.datasets.tvnews.identity_detect)

        # Only the selected show's videos and their embeddings are passed, together
        # with the image path "/app/rachel-maddow.jpg".
        return query.datasets.tvnews.identity_detect.identity_detect(
           gather(stitched_videos, indices), "/app/rachel-maddow.jpg", gather(stitched_embs, indices))
    
    matching_indices = pcache.get('matching_indices', load_identities)
    
# Number of per-video entries, and the total number of matches across them.
print(len(matching_indices), sum([len(l) for l in matching_indices]))

In [None]:
import random
# Flatten the per-video (j, k) matches into (video position, j, k) triples.
flat_indices = [(i, j, k) for i, idx in enumerate(matching_indices) for (j, k) in idx]
# NOTE(review): no random seed is set, so a different sample of 16 is drawn on every run;
# random.sample raises ValueError if there are fewer than 16 matches.
random_indices = random.sample(flat_indices, 16)
identity_videos = gather(stitched_videos, indices)
identity_frames = gather(stitched_frames, indices)
# One tile per sampled match: the matched video and its j-th frame.
make_montage(
    [identity_videos[i] for i, _1, _2 in random_indices],
    [identity_frames[i][j] for i, j, _ in random_indices],
    'montage.jpg',
    num_cols=4,
    target_height=240)

In [None]:
# Keep only the first component (j) of each (j, k) match, per video.
matching_indices_onelevel = [[j for j, k in l] for l in matching_indices]
# NOTE(review): filter3_videos / filter3_frames / filter3_faces are not defined in any
# visible cell; they presumably correspond to the stitched_* lists — confirm before running.
matching_videos, matching_shots, matching_frames, matching_faces = \
    (gather(filter3_videos, indices),      
    [gather(shots, idxs) for (shots, idxs) in zip(gather(stitched_shots, indices), matching_indices_onelevel)],
    [gather(frames, idxs) for (frames, idxs) in zip(gather(filter3_frames, indices), matching_indices_onelevel)],
    [gather2(faces, idxs) for (faces, idxs) in zip(gather(filter3_faces, indices), matching_indices)])
    
#pose_blacklist = ['tvnews/videos/MSNBCW_20170708_010000_The_Rachel_Maddow_Show.mp4']    
#pose_blacklist = ['tvnews/videos/MSNBCW_20170506_040000_The_Rachel_Maddow_Show.mp4']
pose_blacklist = []
    
# Drop videos with no matching shots and any video whose path is blacklisted.
matching2_videos, matching2_shots, matching2_frames, matching2_faces = unzip([
    (video, shots, frames, faces)
    for video, shots, frames, faces in zip(matching_videos, matching_shots, matching_frames, matching_faces)
    if len(shots) > 0 and video.path not in pose_blacklist
])
    
# Sample each matching shot at roughly TARGET_FPS frames per second.
TARGET_FPS = 10
pose_frames = []
for (video, shots) in zip(matching2_videos, matching2_shots):
    # Explicit float division so an integer fps is not truncated before rounding;
    # clamp to 1 because range() rejects a zero step (fps below TARGET_FPS / 2).
    frame_step = max(1, int(round(video.fps / float(TARGET_FPS))))
    vid_pose_frames = []
    for s in shots:
        # extend() keeps this linear; sum(lists, []) re-copies the accumulated list per shot.
        vid_pose_frames.extend(range(s['min_frame'], s['max_frame'], frame_step))
    pose_frames.append(vid_pose_frames)

In [None]:
import query.datasets.tvnews.pose_detect
reload(query.datasets.tvnews.pose_detect)
from query.datasets.tvnews.pose_detect import pose_detect

# Run pose detection on the sampled frames of the matching videos.
# NOTE(review): force=True presumably recomputes even when results already exist — confirm.
all_poses = pose_detect(matching2_videos, pose_frames, force=True)

In [None]:
import query.datasets.tvnews.pose_detect
reload(query.datasets.tvnews.pose_detect)
from query.datasets.tvnews.pose_detect import pose_track


#pose_track(matching2_videos, matching2_shots, matching2_frames, matching2_faces, all_poses)

In [None]:
def bbox_montage(indexed_item):
    """Write a montage of one video's matched frames, one tile (with one bbox) per match.

    Takes a single argument so it can be mapped over `enumerate(zip(...))` items.
    The tuple is unpacked in the body instead of in the signature, because
    tuple parameters are Python-2-only syntax (removed by PEP 3113).

    Args:
        indexed_item: (i, (video, frames, faces, matches)) where `matches` is a
            list of (frame position, face position) pairs and `i` is used in
            the output file name '/tmp/montage{i}.jpg'.

    Returns:
        None. When `matches` is empty, the video path is printed and nothing is written.
    """
    i, (video, frames, faces, matches) = indexed_item
    if len(matches) == 0:
        print(video.path)
        return
    make_montage(
        video,
        [frames[j] for j, _ in matches],
        '/tmp/montage{}.jpg'.format(i),
        bboxes=[[faces[j][k]] for j, k in matches],
        progress=False)

In [None]:
import query.datasets.prelude
reload(query.datasets.prelude)
from query.datasets.prelude import *

# make_montage(filter3_videos[indices[i]], filter3_frames[indices[i]],
#              '/app/montage.jpg', filter3_faces[indices[i]], workers=96, progress=True)

def bbox_montage(indexed_item):
    """Write a montage of ALL of a video's frames, drawing bboxes only on matched frames.

    Takes a single argument so it can be mapped over `enumerate(zip(...))` items.
    The tuple is unpacked in the body instead of in the signature, because
    tuple parameters are Python-2-only syntax (removed by PEP 3113).

    Args:
        indexed_item: (i, (video, frames, faces, matches)) where `matches` is a
            list of (frame position, face position) pairs and `i` is used in
            the output file name '/tmp/montage{i}.jpg'.

    Returns:
        None. When `matches` is empty, the video path is printed and nothing is
        written. Any exception is printed with its traceback, followed by the
        video path, and then swallowed, so one bad video does not stop the batch.
    """
    i, (video, frames, faces, matches) = indexed_item
    try:
        if len(matches) == 0:
            print(video.path)
            return
        # Collect every matched face per frame; a frame can carry several matches,
        # and keying a dict literal on the frame would keep only the last one.
        bbox_map = defaultdict(list)
        for j, k in matches:
            bbox_map[j].append(faces[j][k])
        make_montage(
            video,
            #[frames[j] for j, _ in matches],
            frames,
            '/tmp/montage{}.jpg'.format(i),
            # Distinct loop name: reusing `i` here would rebind the montage index.
            bboxes=[bbox_map[frame_idx] for frame_idx in range(len(frames))],
            #bboxes=[[faces[j][k]] for j, k in matches],
            progress=False)
    except Exception:
        traceback.print_exc()
        print(video.path)
    
# Render one bbox montage per video across 8 worker processes; zip() stops at the
# 100-video slice, so at most 100 montages are written.
# NOTE(review): filter3_videos / filter3_frames / filter3_faces are not defined in any
# visible cell; they presumably correspond to the stitched_* lists — confirm before running.
_ = par_for(bbox_montage,
        list(enumerate(zip(gather(filter3_videos, indices)[:100], gather(filter3_frames, indices), 
            gather(filter3_faces, indices), matching_indices))),
        process=True,
        workers=8)

In [None]:
import requests
import cv2

# Map make_montage over (index, (video, first frame of each stitched shot)) items.
# NOTE(review): every other call in this notebook invokes make_montage as
# make_montage(video, frames, path, ...); here each work item is a single
# (i, (video, frames)) tuple with no output path — confirm make_montage accepts that shape.
# NOTE(review): filter2_videos is not defined in any visible cell.
par_for(make_montage,
        list(enumerate(zip(filter2_videos, [[s['min_frame'] for s in l] for l in stitched_shots][:100]))),
        process=True,
        workers=8)

In [None]:
# Bulk-insert every stitched shot as a Shot row.
# The original cell closed a list (`]`) it never opened, so it could not be parsed;
# the rows are now built as a proper list, as in the other bulk_create_copy calls.
Shot.objects.bulk_create_copy([{
    'min_frame': shot['min_frame'], 
    'max_frame': shot['max_frame'], 
    'labeler_id': shot['labeler'],
    'video_id': shot['video__id']
} for shot_list in tqdm(stitched_shots) for shot in shot_list])