In [14]:
import os
import sys
import time
import itertools

import numpy as np
import pandas as pd

import torch

from ipywidgets import interact
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
from IPython.display import Markdown, display
from tqdm.notebook import tqdm

In [10]:
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken/src')
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken')
from src.common.setup import syspath_append_projects
syspath_append_projects()
from src.common.path_resolvers import *
from src.common.constants import *
from src.common.display_utils import *
from src.common.commands import *
from src.common.status import *

In [12]:
DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data'
INTERVALS_FILE = 'df_intervals_batch2_500_20210515_195921.csv'
INTERVALS_PATH = os.path.join(DATA_ROOT, INTERVALS_FILE)

TIMESTR = time.strftime("%Y%m%d_%H%M%S")

# Read Data

In [15]:
df_intervals = pd.read_csv(INTERVALS_PATH, dtype={'interval_id': object})

In [16]:
df_intervals.shape

(500, 35)

# Face Detection

## MTCNN Model

In [19]:
from PIL import Image, ImageDraw, UnidentifiedImageError
from facenet_pytorch import MTCNN, extract_face

In [22]:
mtcnn = MTCNN(keep_all=False, select_largest=False, device='cuda')

AssertionError: Torch not compiled with CUDA enabled

In [17]:
display_every = 187

def save_faces(interval_id, frame_filename, debug=False):
    frame_id, frame_path = resolve_paths(frame_filename, interval_id)
    
    image = Image.open(frame_path)
    boxes, probs, points = mtcnn.detect(image, landmarks=True)
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    
    for i, (box, point) in enumerate(zip(boxes, points)):
        draw.rectangle(box.tolist(), width=5)
        for p in point:
            draw.rectangle((p - 10).tolist() + (p + 10).tolist(), width=10)
        detected_face_path = resolve_detected_face_path(interval_id, frame_id, i, create=True)
        extract_face(image, box, image_size=224, margin=70, save_path=detected_face_path)
    
    annotated_faces_path = resolve_annot_faces_path(interval_id, frame_id)
    img_draw.save(annotated_faces_path)
    
    debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs)
    

def debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs):
    should_display = debug or ('000.png' in frame_filename)
    if should_display:
        print(f'Frame id: {frame_id}. Ouput dir: {annotated_faces_path}..')
        print(f'probs: {probs}')
        display(IPython.display.Image(annotated_faces_path, height=500, width=500))

In [285]:
resolve_interval_face_annot_224_dir('100913')

'/Users/staveshemesh/Projects/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224'

In [None]:
do_sample = False
if do_sample:
    # Just a sample test
    for idx, row in tqdm(df_inervals_batch1[['interval_id', 'interval_frames_dir']][1:3].iterrows()):
        interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in tqdm(frames):
            if frame_filename.endswith(".png"):
                save_faces(interval_id, frame_filename)

In [None]:
# Just a sample test
for idx, row in tqdm(df_inervals_batch1[['interval_id', 'interval_frames_dir']][:4].iterrows()):
    interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
    print(frames_dir)
    frames = sorted(os.listdir(frames_dir))
    for frame_filename in tqdm(frames[:3]):
        if frame_filename.endswith(".png"):
            save_faces(interval_id, frame_filename)

In [359]:
def create_face_images(df_batch):
    for idx, row in tqdm(df_batch[['interval_id', 'interval_frames_dir']].iterrows()):
        interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in frames:
            if frame_filename.endswith(".png"):
                save_faces(interval_id, frame_filename)

In [360]:
df_inervals_batch1.shape

(431, 35)

In [291]:
create_face_images(df_inervals_batch1)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,interval_frames_dir,full_video_path,frames_dir_exists,frames_dir_content_size,frames_count,supposed_frames_count,missing_frames_count,has_completed_frames,pats_path,word_count
100,oliver,MepXBJjsNxs,101321,True,6.606607,00:06:46.10,00:06:52.71,http://www.youtube.com/watch?v=MepXBJjsNxs,Sugar_-_Last_Week_Tonight_with_John_Oliver_HBO...,0 days 00:06:46.100000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,3296,101,99,2,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,10
66,oliver,r-ERajkMXw0,101188,True,5.805806,00:03:28.40,00:03:34.21,https://www.youtube.com/watch?v=r-ERajkMXw0,Right_To_Be_Forgotten_-_Last_Week_Tonight_with...,0 days 00:03:28.400000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,/Users/staveshemesh/Projects/PATS_DATA/Youtube...,True,2944,90,87,3,True,/Users/staveshemesh/Projects/PATS_DATA/Process...,18


In [361]:
df_inervals_batch1.columns

Index(['speaker', 'video_id', 'interval_id', 'valid', 'duration',
       'start_time_string', 'end_time_string', 'video_link', 'video_fn',
       'start_time', 'end_time', 'org_start_time', 'org_end_time',
       'max_frames_token', 'valid_duration', 'valid_single_token_per_frame',
       'video_downloded', 'valid_hd5', 'valid_max_token_duration',
       'valid_frames_count', 'video_downloaded', 'interval_video_path',
       'interval_video_downloaded', 'interval_frames_dir', 'full_video_path',
       'frames_dir_exists', 'frames_dir_content_size', 'frames_count',
       'supposed_frames_count', 'missing_frames_count', 'has_completed_frames',
       'pats_path', 'word_count', 'has_annot_224', 'face_annot_224_dir'],
      dtype='object')

In [None]:
df_inervals_batch1

In [389]:
def fcount(path):
    count1 = 0
    for root, dirs, files in os.walk(path):
        count1 += len(dirs)

    return count1

In [401]:
df_inervals_batch1['has_annot_224'] = df_inervals_batch1['interval_id'].apply(
    lambda interval_id: 
        os.path.exists(resolve_interval_face_annot_224_dir(interval_id)) and
        10 < fcount(resolve_interval_face_annot_224_dir(interval_id))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [402]:
display_value_counts(df_inervals_batch1['has_annot_224'], 'Has Annotations')

Unnamed: 0,has_annot_224
True,354
False,77


In [367]:
display_value_counts(df_inervals_batch1['has_annot_224'], 'Has Annotations')

Unnamed: 0,has_annot_224
True,228
False,203


In [393]:
display_value_counts(df_inervals_batch1['has_annot_224'], 'Has Annotations')

Unnamed: 0,has_annot_224
True,230
False,201


In [369]:
# for d in (df_inervals_batch1[df_inervals_batch1['has_annot_224']]['interval_frames_dir'] + '/*png').tolist():
#     !rm {d}

In [432]:
def create_face_images(df_batch):
    for idx, row in tqdm(df_batch[['interval_id', 'interval_frames_dir']].iterrows()):
        interval_id, frames_dir = row['interval_id'], row['interval_frames_dir']
        if interval_id in ['102025', '102079', '102300', '104716', '104770', '104775', '104788', '104818', '104853', '104885', '104918', '104921']:
            continue
        print(frames_dir)
        frames = sorted(os.listdir(frames_dir))
        for frame_filename in frames:
            if frame_filename.endswith(".png"):
                save_faces(interval_id, frame_filename)

In [435]:
skip_intervals=[]

In [436]:
def save_faces(interval_id, frame_filename, debug=False):
    frame_id, frame_path = resolve_paths(frame_filename, interval_id)
    
    try:
        image = Image.open(frame_path)
    except UnidentifiedImageError:
        skip_intervals.append(interval_id)
        return
    boxes, probs, points = mtcnn.detect(image, landmarks=True)
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)
    
    for i, (box, point) in enumerate(zip(boxes, points)):
        draw.rectangle(box.tolist(), width=5)
        for p in point:
            draw.rectangle((p - 10).tolist() + (p + 10).tolist(), width=10)
        detected_face_path = resolve_detected_face_path(interval_id, frame_id, i, create=True)
        extract_face(image, box, image_size=224, margin=70, save_path=detected_face_path)
    
    annotated_faces_path = resolve_annot_faces_path(interval_id, frame_id)
    img_draw.save(annotated_faces_path)
    
    debug_print(debug, frame_filename, frame_id, annotated_faces_path, probs)
    

In [None]:
create_face_images(df_inervals_batch1[~df_inervals_batch1['has_annot_224']])

In [None]:
df_inervals_batch1[df_inervals_batch1['interval_id'] == '102025']

In [379]:
df_inervals_batch1.iloc[273].has_annot_224 = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value
