In [1]:
import os
import sys
import time
import itertools

import numpy as np
import pandas as pd

import torch

from ipywidgets import interact
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
from IPython.display import Markdown, display
from tqdm.notebook import tqdm

In [2]:
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken/src')
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken')
from src.common.setup import syspath_append_projects
syspath_append_projects()
from src.common.path_resolvers import *
from src.common.constants import *
from src.common.display_utils import *
from src.common.commands import *
from src.common.status import *

In [4]:
DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'

# Batch 1
BATCH1_INTERVALS_FILE = '20210506_135358_df_intervals_batch1_431.csv'
BATCH1_INTERVALS_PATH = os.path.join(DATA_ROOT, BATCH1_INTERVALS_FILE)
BATCH1_DATASET_FILE = '20210510_190411_df_dataset_batch1_15659.csv'
BATCH1_DATASET_PATH = os.path.join(DATA_ROOT, BATCH1_DATASET_FILE)
# Batch 2
BATCH2_INTERVALS_FILE = 'df_intervals_batch2_500_20210515_195921.csv'
BATCH2_INTERVALS_PATH = os.path.join(DATA_ROOT, BATCH2_INTERVALS_FILE)

# Valid intervals
INPUT_ITERATION = '20210419_220655'
VALID_INTERVALS_PATH = os.path.join(DATA_ROOT, INPUT_ITERATION, 'dataframes/df_intervals_valid.csv')

TIMESTR = time.strftime("%Y%m%d_%H%M%S")

# Read Data

In [12]:
df_intervals_batch1 = pd.read_csv(BATCH1_INTERVALS_PATH, dtype={'interval_id': object})
df_intervals_batch1.shape

(431, 33)

In [14]:
df_dataset_batch1 = pd.read_csv(BATCH1_DATASET_PATH, dtype={'interval_id': object, 'frame_selected': object})
df_dataset_batch1.shape

(15659, 19)

In [7]:
df_intervals_valid = pd.read_csv(VALID_INTERVALS_PATH, dtype={'interval_id': object})
df_intervals_valid.shape

(2118, 25)

In [35]:
df_dataset_batch1.frame_face_path.iloc[2000]

'/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/nG2pEffLEJo/101168/vokens/face_annot_224/00037/detected_face_0.png'

In [18]:

source = df_dataset_batch1.iloc[10].frame_face_path
target = source.replace('vokens/', '')
print(f'{source} -> {target}')

/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00069/detected_face_0.png -> /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/face_annot_224/00069/detected_face_0.png


In [None]:
from distutils.dir_util import copy_tree
copy_tree(source, "/x/y/z")

In [19]:
resolve_interval_face_annot_224_dir('100913')

TypeError: resolve_interval_face_annot_224_dir() missing 1 required positional argument: 'interval_id'

# Face Detection

In [21]:
df_dataset_batch1['has_path'] = df_dataset_batch1.frame_face_path.apply(os.path.exists)
df_with_caption(df_dataset_batch1['has_path'].value_counts().to_frame(), 'Existing Frames')

Unnamed: 0,has_path
True,14701
False,958


## MTCNN Model

In [8]:
from PIL import Image, ImageDraw, UnidentifiedImageError, ImageFont
from facenet_pytorch import MTCNN, extract_face

In [9]:
mtcnn = MTCNN(select_largest=False, thresholds=[0.9, 0.9, 0.9], device='cpu')

In [10]:
display_every = 187
FRAME_EXTENSION = 'jpg'

def create_face_images(interval_ids):
    for interval_id in tqdm(interval_ids):
        frames_dir = resolve_interval_frames_dir(df_intervals_valid, interval_id)
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in frames:
            if frame_filename.endswith(f".{FRAME_EXTENSION}"):
                frame_fullpath = os.path.join(frames_dir, frame_filename)
                frame_id = int(frame_filename.split(".")[0])
                save_faces(interval_id, frame_id, frame_fullpath)

def save_faces(interval_id, frame_id, frame_path, debug=False):
    image = Image.open(frame_path)
    boxes, probs, points = mtcnn.detect(image, landmarks=True)
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    
    for i, (box, point, prob) in enumerate(zip(boxes, points, probs)):
        draw.rectangle(box.tolist(), width=10)
        for p in point:
            draw.rectangle((p - 10).tolist() + (p + 10).tolist(), width=5)
        detected_face_path = resolve_detected_face_path(df_intervals_valid, interval_id, frame_id, i, create=True)
        extract_face(image, box, image_size=224, margin=70, save_path=detected_face_path)
    
    annotated_faces_path = resolve_annot_faces_path(df_intervals_valid, interval_id, frame_id)
    img_draw.save(annotated_faces_path)
    
    debug_print(debug, frame_path, frame_id, annotated_faces_path, probs)
    

def debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs):
    should_display = debug or (f'000.{FRAME_EXTENSION}' in frame_filename)
    if should_display:
        print(f'Frame id: {frame_id}. Ouput dir: {annotated_faces_path}..')
        print(f'probs: {probs}')
        display(IPython.display.Image(annotated_faces_path, height=500, width=500))

In [36]:
pending_interval_ids = df_dataset_batch1[~df_dataset_batch1['has_path']]['interval_id'].tolist()

In [51]:
df_dataset_batch1['frame_face_path'].tolist()[:10]

['/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00002/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00006/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00011/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00017/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00033/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00033/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00048/detected_face_0.png',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224/00053/detected_face_0.png',


In [37]:
len(pending_interval_ids)

958

In [41]:
all_interval_ids = df_dataset_batch1['interval_id'].tolist()

In [42]:
all_interval_ids[:3]

['100913', '100913', '100913']

In [46]:
pending_interval_ids[:4]

['101027', '101037', '101084', '101084']

In [54]:
resolve_interval_frames_dir(df_intervals, '101926')

'/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/MepXBJjsNxs/101926/frames'

In [55]:
resolve_interval_frames_dir(df_intervals, '102025')

'/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/nG2pEffLEJo/102025/frames'

In [52]:
df_dataset_batch1[(~df_dataset_batch1['has_path']) & (df_dataset_batch1['interval_id'] == '101027')]

Unnamed: 0,word_original,bert_token,token_id,frame_selected,voken_id,interval_id,offset_start,offset_end,word_time,word_len_plus_1,word_start,word_end,frame_start,frame_end,frame_count,frame_path,frame_face_path,video_id,interval_time,has_path
789,online,online,3784,575,725,101027,603,609,38.3,7,603,609,570,580,10,575_101027_PuNIwYsz7PI,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,PuNIwYsz7PI,38.705372,False


In [53]:
df_dataset_batch1[(~df_dataset_batch1['has_path'])]['interval_id'].value_counts()

101926    75
102025    62
101951    61
104432    56
101741    50
101300    46
101928    44
104948    41
101969    40
101954    33
104420    33
101678    31
101302    30
104937    28
104788    27
101979    26
104775    25
104770    22
104885    22
101687    22
104853    19
101962    18
104921    15
101956    14
102079    14
104957    14
101980    14
104818    13
104928    11
104716     9
101686     8
104918     6
102300     6
101665     3
104470     2
101856     2
104973     2
101084     2
101676     1
105040     1
104613     1
101037     1
101361     1
104285     1
101791     1
101832     1
101695     1
104345     1
101205     1
101027     1
Name: interval_id, dtype: int64

In [39]:
for interval in 
resolve_interval_frames_dir(df_intervals, pending_interval_ids[10])

'/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/101300/frames'

In [28]:
create_face_images(pending_interval_ids[:100])

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/PuNIwYsz7PI/101027/frames



FileNotFoundError: [Errno 2] No such file or directory: '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/PuNIwYsz7PI/101027/frames'