In [278]:
import os
import time
import itertools

import numpy as np
import pandas as pd

import torch
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance

from ipywidgets import interact
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
from IPython.display import Markdown, display
from tqdm.notebook import tqdm

In [216]:
LIST_BULLET = '  ◘ '
TAB = '&nbsp;&nbsp;&nbsp;&nbsp;'
VIDEO_ID_LEN = 11
FRAME_RATE = 15

In [294]:
DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data'
INTERVALS_FILE = '20210420_111641/dataframes/df_intervals_with_frame_status.csv'
INTERVALS_PATH = os.path.join(DATA_ROOT, INTERVALS_FILE)

PATS_DATA_ROOT = '/Users/staveshemesh/Projects/PATS_DATA'
SPEAKER_NAME = 'oliver'

TIMESTR = time.strftime("%Y%m%d_%H%M%S")

In [218]:
def get_interval_row(interval_id):
    row = df_interval = df_intervals[df_intervals['interval_id'] == interval_id].iloc[0]
    return row

def get_video_id(interval_id):
    row = get_interval_row(interval_id)
    return row['video_id']

def resolve_interval_video_path(interval_id):
    video_id = get_video_id(interval_id)
    video_dir = os.path.join(DATA_ROOT, 'Youtube', SPEAKER_NAME, video_id)    
    interval_path = os.path.join(video_dir, interval_id, f'{interval_id}.mp4')
    return interval_path

def resolve_interval_frames_dir(interval_id, create=False):
    interval_video_path = resolve_interval_video_path(interval_id)
    interval_video_dir = os.path.dirname(interval_video_path)
    inetrval_frames_dir = os.path.join(interval_video_dir, 'frames')
    if create and not os.path.exists(inetrval_frames_dir):
        os.makedirs(inetrval_frames_dir)
    return inetrval_frames_dir


### FECNet Resolver

def resolve_interval_face_annot_dir(interval_id, create=True):
    interval_video_path = resolve_interval_video_path(interval_id)
    interval_video_dir = os.path.dirname(interval_video_path)
    inetrval_face_annot_dir = os.path.join(interval_video_dir, 'face_annot')
    if create and not os.path.exists(inetrval_face_annot_dir):
        os.makedirs(inetrval_face_annot_dir)
    return inetrval_face_annot_dir

def resolve_frame_face_annot_dir(interval_id, frame_id, create=True):
    face_annot_dir = resolve_interval_face_annot_224_dir(interval_id)
    single_frame_face_annot_dir = os.path.join(face_annot_dir, f"{frame_id:05d}")
    if create and not os.path.exists(single_frame_face_annot_dir):
        os.makedirs(single_frame_face_annot_dir)
    return single_frame_face_annot_dir

def resolve_detected_face_path(interval_id, frame_id, face_id, create=True):
    single_frame_face_annot_dir = resolve_frame_face_annot_dir(interval_id, frame_id)
    detected_face_frame_path = os.path.join(single_frame_face_annot_dir, 'detected_face_{}.png').format(face_id)
    return detected_face_frame_path

def resolve_annot_faces_path(interval_id, frame_id, ):
    single_frame_face_annot_dir = resolve_frame_face_annot_dir(interval_id, frame_id)
    return os.path.join(single_frame_face_annot_dir, 'annotated_faces.png')

def resolve_interval_face_annot_224_dir(interval_id, create=True):
    interval_video_path = resolve_interval_video_path(interval_id)
    interval_video_dir = os.path.dirname(interval_video_path)
    inetrval_face_annot_dir = os.path.join(interval_video_dir, 'vokens', 'face_annot_224')
    if create and not os.path.exists(inetrval_face_annot_dir):
        os.makedirs(inetrval_face_annot_dir)
    return inetrval_face_annot_dir

def resolve_224_voken_path(interval_id, frame_id):
    single_frame_face_annot_dir = resolve_interval_face_annot_224_dir(interval_id)
    detected_face_frame_path = os.path.join(single_frame_face_annot_dir, f'{frame_id:05d}.png').format(frame_id)
    return detected_face_frame_path

def extract_frame_id(frame_filename):
    return int(os.path.basename(frame_filename).split(".")[0])

In [245]:
display_every = 187

def save_faces(interval_id, frame_filename, debug=False):
    frame_id, frame_path = resolve_paths(frame_filename, interval_id)
    
    image = Image.open(frame_path)
    boxes, probs, points = mtcnn.detect(image, landmarks=True)
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    
    for i, (box, point) in enumerate(zip(boxes, points)):
        draw.rectangle(box.tolist(), width=5)
        for p in point:
            draw.rectangle((p - 10).tolist() + (p + 10).tolist(), width=10)
        detected_face_path = resolve_detected_face_path(interval_id, frame_id, i, create=True)
        extract_face(image, box, image_size=224, margin=70, save_path=detected_face_path)
    
    annotated_faces_path = resolve_annot_faces_path(interval_id, frame_id)
    img_draw.save(annotated_faces_path)
    
    debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs)
    

def debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs):
    should_display = debug or ('000.png' in frame_filename)
    if should_display:
        print(f'Frame id: {frame_id}. Ouput dir: {annotated_faces_path}..')
        print(f'probs: {probs}')
        display(IPython.display.Image(annotated_faces_path, height=500, width=500))        

def get_video_id(interval_id):
    df_interval = df_intervals[df_intervals['interval_id'] == interval_id]
    return df_interval.iloc[0]['video_id']
    
def resolve_interval_video_path(interval_id):
    video_id = get_video_id(interval_id)
    video_dir = os.path.join(PATS_DATA_ROOT, 'Youtube', SPEAKER_NAME, video_id)    
    interval_path = os.path.join(video_dir, interval_id, f'{interval_id}.mp4')
    return interval_path

def resolve_interval_frames_dir(interval_id, create=True):
    interval_video_path = resolve_interval_video_path(interval_id)
    interval_video_dir = os.path.dirname(interval_video_path)
    inetrval_frames_dir = os.path.join(interval_video_dir, 'frames')
    if create and not os.path.exists(inetrval_frames_dir):
        os.makedirs(inetrval_frames_dir)
    return inetrval_frames_dir

def resolve_paths(frame_filename, interval_id):
    frame_id = int(frame_filename.split(".")[0])
    face_annot_dir = resolve_interval_face_annot_224_dir(interval_id)
    
    frame_path = os.path.join(face_annot_dir, frame_filename)
    return frame_id, frame_path

## Display Utils

In [246]:
def printmd(string):
    display(Markdown(string))

CAPTION_STYLE = {
    'selector': 'caption',
    'props': [
        ('color', 'navy'),
        ('font-size', '16px')
    ]
}

def display_df_with_caption(df, title):
    return df.style.set_caption(title).set_table_styles([CAPTION_STYLE])

def display_value_counts(series, title):
    df_value_counts = series.value_counts().to_frame().head()
    df_style = display_df_with_caption(df_value_counts, title)
    display(df_style)

## Load Intervals

In [247]:
df_intervals = pd.read_csv(INTERVALS_PATH, dtype={'interval_id': object})

In [248]:
df_intervals.columns

Index(['speaker', 'video_id', 'interval_id', 'valid', 'duration',
       'start_time_string', 'end_time_string', 'video_link', 'video_fn',
       'start_time', 'end_time', 'org_start_time', 'org_end_time',
       'max_frames_token', 'valid_duration', 'valid_single_token_per_frame',
       'video_downloded', 'valid_hd5', 'valid_max_token_duration',
       'valid_frames_count', 'video_downloaded', 'interval_video_path',
       'interval_video_downloaded', 'interval_frames_dir', 'full_video_path',
       'frames_dir_exists', 'frames_dir_content_size', 'frames_count',
       'supposed_frames_count', 'missing_frames_count', 'has_completed_frames',
       'pats_path', 'word_count'],
      dtype='object')

In [249]:
df_intervals.shape

(2118, 33)

In [264]:
DELETED_VIDEO = 'hWQiXv0sn9Y'

### Fetch Batch

In [268]:
df_inervals_batch1 = df_intervals[(df_intervals['has_completed_frames']) & (df_intervals['video_id'] != DELETED_VIDEO)]

In [269]:
df_inervals_batch1.shape

(431, 33)

In [270]:
df_inervals_batch1.sample(n=4)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,interval_frames_dir,full_video_path,frames_dir_exists,frames_dir_content_size,frames_count,supposed_frames_count,missing_frames_count,has_completed_frames,pats_path,word_count
140,oliver,r-ERajkMXw0,101444,True,20.153487,00:05:40.80,00:06:00.96,https://www.youtube.com/watch?v=r-ERajkMXw0,Right_To_Be_Forgotten_-_Last_Week_Tonight_with...,0 days 00:05:40.800000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,9824,305,302,3,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,51
266,oliver,3bxcc3SM_KA,101991,True,4.604604,00:08:32.81,00:08:37.41,http://www.youtube.com/watch?v=3bxcc3SM_KA,Patents_-_Last_Week_Tonight_with_John_Oliver_H...,0 days 00:08:32.810000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,2336,71,69,2,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,11
50,oliver,FVFdsl29s_Q,101142,True,9.666666,00:03:42.56,00:03:52.23,http://www.youtube.com/watch?v=FVFdsl29s_Q,Stupid_Watergate_-_Last_Week_Tonight_with_John...,0 days 00:03:42.560000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,4768,147,144,3,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,27
3,oliver,P8pjd1QEA0c,100945,True,16.349683,00:09:46.95,00:10:03.30,http://www.youtube.com/watch?v=P8pjd1QEA0c,Student_Debt_-_Last_Week_Tonight_with_John_Oli...,0 days 00:09:46.950000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,8000,248,245,3,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,30


In [271]:
batch_interval_ids = df_inervals_batch1[df_inervals_batch1['has_completed_frames']]['interval_id'].tolist()

In [283]:
batch_interval_filename = f'{TIMESTR}_df_intervals_batch1_{len(batch_interval_ids)}.csv'
batch_interval_path = os.path.join(DATA_ROOT, batch_interval_filename)
df_inervals_batch1.to_csv(batch_interval_path, index=False, header=True)
print(f'Saved to {batch_interval_path}.')

Saved to /Users/staveshemesh/Projects/shstav2/token_voken/data/20210506_135358_df_intervals_batch1_431.csv.


## MTCNN Model

In [430]:
from PIL import Image, ImageDraw, UnidentifiedImageError
from facenet_pytorch import MTCNN, extract_face

In [284]:
mtcnn = MTCNN(keep_all=False)

In [285]:
resolve_interval_face_annot_224_dir('100913')

'/Users/staveshemesh/Projects/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224'

In [None]:
do_sample = False
if do_sample:
    # Just a sample test
    for idx, row in tqdm(df_inervals_batch1[['interval_id', 'interval_frames_dir']][1:3].iterrows()):
        interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in tqdm(frames):
            if frame_filename.endswith(".png"):
                save_faces(interval_id, frame_filename)

In [None]:
# Just a sample test
for idx, row in tqdm(df_inervals_batch1[['interval_id', 'interval_frames_dir']][:4].iterrows()):
    interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
    print(frames_dir)
    frames = sorted(os.listdir(frames_dir))
    for frame_filename in tqdm(frames[:3]):
        if frame_filename.endswith(".png"):
            save_faces(interval_id, frame_filename)

In [359]:
def create_face_images(df_batch):
    for idx, row in tqdm(df_batch[['interval_id', 'interval_frames_dir']].iterrows()):
        interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in frames:
            if frame_filename.endswith(".png"):
                save_faces(interval_id, frame_filename)

In [360]:
df_inervals_batch1.shape

(431, 35)

In [291]:
create_face_images(df_inervals_batch1)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,interval_frames_dir,full_video_path,frames_dir_exists,frames_dir_content_size,frames_count,supposed_frames_count,missing_frames_count,has_completed_frames,pats_path,word_count
100,oliver,MepXBJjsNxs,101321,True,6.606607,00:06:46.10,00:06:52.71,http://www.youtube.com/watch?v=MepXBJjsNxs,Sugar_-_Last_Week_Tonight_with_John_Oliver_HBO...,0 days 00:06:46.100000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,3296,101,99,2,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,10
66,oliver,r-ERajkMXw0,101188,True,5.805806,00:03:28.40,00:03:34.21,https://www.youtube.com/watch?v=r-ERajkMXw0,Right_To_Be_Forgotten_-_Last_Week_Tonight_with...,0 days 00:03:28.400000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,2944,90,87,3,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,18


In [361]:
df_inervals_batch1.columns

Index(['speaker', 'video_id', 'interval_id', 'valid', 'duration',
       'start_time_string', 'end_time_string', 'video_link', 'video_fn',
       'start_time', 'end_time', 'org_start_time', 'org_end_time',
       'max_frames_token', 'valid_duration', 'valid_single_token_per_frame',
       'video_downloded', 'valid_hd5', 'valid_max_token_duration',
       'valid_frames_count', 'video_downloaded', 'interval_video_path',
       'interval_video_downloaded', 'interval_frames_dir', 'full_video_path',
       'frames_dir_exists', 'frames_dir_content_size', 'frames_count',
       'supposed_frames_count', 'missing_frames_count', 'has_completed_frames',
       'pats_path', 'word_count', 'has_annot_224', 'face_annot_224_dir'],
      dtype='object')

In [None]:
df_inervals_batch1

In [389]:
def fcount(path):
    count1 = 0
    for root, dirs, files in os.walk(path):
        count1 += len(dirs)

    return count1

In [401]:
df_inervals_batch1['has_annot_224'] = df_inervals_batch1['interval_id'].apply(
    lambda interval_id: 
        os.path.exists(resolve_interval_face_annot_224_dir(interval_id)) and
        10 < fcount(resolve_interval_face_annot_224_dir(interval_id))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [402]:
display_value_counts(df_inervals_batch1['has_annot_224'], 'Has Annotations')

Unnamed: 0,has_annot_224
True,354
False,77


In [367]:
display_value_counts(df_inervals_batch1['has_annot_224'], 'Has Annotations')

Unnamed: 0,has_annot_224
True,228
False,203


In [393]:
display_value_counts(df_inervals_batch1['has_annot_224'], 'Has Annotations')

Unnamed: 0,has_annot_224
True,230
False,201


In [369]:
# for d in (df_inervals_batch1[df_inervals_batch1['has_annot_224']]['interval_frames_dir'] + '/*png').tolist():
#     !rm {d}

In [432]:
def create_face_images(df_batch):
    for idx, row in tqdm(df_batch[['interval_id', 'interval_frames_dir']].iterrows()):
        interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
        if interval_id in ['102025', '102079', '102300', '104716', '104770', '104775', '104788', '104818', '104853', '104885', '104918', '104921']:
            continue
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in frames:
            if frame_filename.endswith(".png"):
                save_faces(interval_id, frame_filename)

In [435]:
skip_intervals=[]

In [436]:
def save_faces(interval_id, frame_filename, debug=False):
    frame_id, frame_path = resolve_paths(frame_filename, interval_id)
    
    try:
        image = Image.open(frame_path)
    except UnidentifiedImageError:
        skip_intervals.append(interval_id)
        return
    boxes, probs, points = mtcnn.detect(image, landmarks=True)
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    
    for i, (box, point) in enumerate(zip(boxes, points)):
        draw.rectangle(box.tolist(), width=5)
        for p in point:
            draw.rectangle((p - 10).tolist() + (p + 10).tolist(), width=10)
        detected_face_path = resolve_detected_face_path(interval_id, frame_id, i, create=True)
        extract_face(image, box, image_size=224, margin=70, save_path=detected_face_path)
    
    annotated_faces_path = resolve_annot_faces_path(interval_id, frame_id)
    img_draw.save(annotated_faces_path)
    
    debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs)
    

In [None]:
create_face_images(df_inervals_batch1[~df_inervals_batch1['has_annot_224']])

In [None]:
df_inervals_batch1[df_inervals_batch1['interval_id'] == '102025']

In [379]:
df_inervals_batch1.iloc[273].has_annot_224 = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
