# Imports

In [1]:
import os
import sys
import time

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact
from IPython.display import YouTubeVideo, Markdown, display, Video, Image

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
pd.set_option('display.max_colwidth', 200)
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken/src')
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken')
from src.common.setup import syspath_append_projects
syspath_append_projects()
from src.common.path_resolvers import *
from src.common.constants import *
from src.common.display_utils import *
from src.common.commands import *
from src.common.status import *

In [4]:
INPUT_DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
INPUT_ITERATION = '20210419_220655'
VALID_INTERVALS_PATH = os.path.join(INPUT_DATA_ROOT, INPUT_ITERATION, 'dataframes/df_intervals_valid.csv')

In [5]:
TIMESTR = time.strftime("%Y%m%d_%H%M%S")

OUTPUT_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'

# Read Data

## df_intervals

In [6]:
df_intervals = pd.read_csv(VALID_INTERVALS_PATH, dtype={'interval_id': object})

In [7]:
df_intervals.head(n=2)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count,video_downloaded,interval_video_path,interval_video_downloaded,interval_frames_dir,full_video_path
0,oliver,Tt-mpuR_QHQ,100912,True,20.75,00:10:26.55,00:10:47.31,https://www.youtube.com/watch?v=Tt-mpuR_QHQ,Puerto_Rico_-_Last_Week_Tonight_with_John_Oliver_HBO-Tt-mpuR_QHQ.webm,0 days 00:10:26.550000,...,True,False,True,True,True,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/100912.mp4,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/vokens/face_annot_224,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/Tt-mpuR_QHQ.mp4
1,oliver,Tt-mpuR_QHQ,100913,True,7.74,00:10:47.44,00:10:55.18,https://www.youtube.com/watch?v=Tt-mpuR_QHQ,Puerto_Rico_-_Last_Week_Tonight_with_John_Oliver_HBO-Tt-mpuR_QHQ.webm,0 days 00:10:47.440000,...,True,False,True,True,True,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/100913.mp4,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/Tt-mpuR_QHQ.mp4


In [8]:
printmd(f'**Valid intervals, Speaker {SPEAKER_NAME}**:')
display_df_info(df_intervals)

**Valid intervals, Speaker oliver**:

  ◘ Videos: #123
  ◘ Intervals: #2,118
  ◘ Total Duration: 08 hours, 13 minutues, 25 seconds (29,605 seconds)
  ◘ All are Youtube videos: True


## Interval Data Status

In [9]:
status_frames(df_intervals)

In [10]:
display_value_counts(df_intervals['interval_video_downloaded'], '[1] Downloaded Interval Video')
display_value_counts(df_intervals['frames_dir_exists'],         '[2] Frame Dir Exsits')
display_value_counts(df_intervals['frames_dir_content_size'],   '[3] Frames Dir Has Content')
display_value_counts(df_intervals['has_completed_frames'],      '[4] Has Completed Frames (Video -> Frames)')
display_value_counts(df_intervals['has_detected_faces'],        '[5] Detected Face')
display_value_counts(df_intervals['need_to_extract_frames'],    '[6] Need To Extract Frames')

Unnamed: 0,interval_video_downloaded
True,1992
False,126


Unnamed: 0,frames_dir_exists
False,1388
True,730


Unnamed: 0,frames_dir_content_size
False,2075
True,43


Unnamed: 0,has_completed_frames
False,1525
True,593


Unnamed: 0,has_detected_faces
False,2107
True,11


Unnamed: 0,need_to_extract_frames
True,1514
False,604


In [11]:
df_batch2 = df_intervals[(df_intervals['need_to_extract_frames'])&((df_intervals['status_interval_video_downloaded']))].iloc[:500]
df_batch2.shape

(500, 34)

In [19]:
df_intervals[df_intervals['has_detected_faces']].sample(n=4)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,full_video_path,status_interval_video_downloaded,frames_dir_exists,frames_count,supposed_frames_count,missing_frames_count,has_completed_frames,frames_dir_content_size,has_detected_faces,need_to_extract_frames
249,oliver,6UsHHOCH4q8,101891,True,10.28,00:12:03.32,00:12:13.60,http://www.youtube.com/watch?v=6UsHHOCH4q8,Tobacco_-_Last_Week_Tonight_with_John_Oliver_HBO-6UsHHOCH4q8.mkv,0 days 00:12:03.320000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/6UsHHOCH4q8/6UsHHOCH4q8.mp4,False,False,-1,154,155,False,False,True,False
247,oliver,6UsHHOCH4q8,101889,True,9.28,00:11:46.30,00:11:55.58,http://www.youtube.com/watch?v=6UsHHOCH4q8,Tobacco_-_Last_Week_Tonight_with_John_Oliver_HBO-6UsHHOCH4q8.mkv,0 days 00:11:46.300000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/6UsHHOCH4q8/6UsHHOCH4q8.mp4,False,False,-1,139,140,False,False,True,False
195,oliver,P8pjd1QEA0c,101678,True,10.74,00:12:29.51,00:12:40.26,http://www.youtube.com/watch?v=P8pjd1QEA0c,Student_Debt_-_Last_Week_Tonight_with_John_Oliver_HBO-P8pjd1QEA0c.mkv,0 days 00:12:29.510000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/P8pjd1QEA0c.mp4,True,False,-1,161,162,False,True,True,False
199,oliver,6UsHHOCH4q8,101683,True,6.41,00:01:55.94,00:02:02.35,http://www.youtube.com/watch?v=6UsHHOCH4q8,Tobacco_-_Last_Week_Tonight_with_John_Oliver_HBO-6UsHHOCH4q8.mkv,0 days 00:01:55.940000,...,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/6UsHHOCH4q8/6UsHHOCH4q8.mp4,False,False,-1,96,97,False,False,True,False


In [17]:
interval_video_paths = df_batch3['interval_video_path'].tolist()
interval_frames_dirs = df_batch3['interval_frames_dir'].tolist()

NameError: name 'df_batch3' is not defined

In [None]:
interval_video_paths[10:12]

In [None]:
interval_frames_dirs[10:12]

In [25]:
df_batch2 = df_intervals[(df_intervals['need_to_extract_frames'])&((df_intervals['status_interval_video_downloaded']))].iloc[:500]
df_batch2.shape

(500, 34)

In [28]:
len(df_batch2)

500

In [29]:
batchfile = f'df_intervals_batch2_{len(df_batch2)}_{TIMESTR}'
batch_path = os.path.join(OUTPUT_ROOT, batchfile)

In [31]:
df_batch2.to_csv(batch_path, header=True)

In [6]:
from src.pipeline._4_video_to_frames import video_to_frames_and_delete

In [11]:
interval_video_path = '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/101302/101302.mp4'
interval_frames_dir = '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/101302/vokens/face_annot_224'

In [12]:
video_to_frames_and_delete(interval_video_path, interval_frames_dir, override=True)

2021-05-16 23:48:08,502 | INFO : ✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/101302/101302.mp4" -start_number 0 -r 15 -q:v 2 -qmin 2 -qmax 2 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/101302/vokens/face_annot_224/$filename%05d.jpg"


In [13]:
# P8pjd1QEA0c_101678
interval_video_path = '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/101678/101678.mp4'
interval_frames_dir = '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/101678/vokens/face_annot_224'
video_to_frames_and_delete(interval_video_path, interval_frames_dir, override=True)

2021-05-17 12:24:34,799 | INFO : ✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/101678/101678.mp4" -start_number 0 -r 15 -q:v 2 -qmin 2 -qmax 2 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/101678/vokens/face_annot_224/$filename%05d.jpg"


In [None]:
for interval_video_path, interval_frames_dir in tqdm(zip(interval_video_paths, interval_frames_dirs)):
    video_to_frames_and_delete(interval_video_path, interval_frames_dir, override=True)