# Imports

In [1]:
import os
import sys
import time

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import interact
from IPython.display import YouTubeVideo, Markdown, display, Video, Image

from tqdm.notebook import tqdm
# Register tqdm's pandas integration (`progress_apply` and friends) on pandas objects.
tqdm.pandas()

In [2]:
# Notebook-wide pandas display settings: allow up to 200 characters per cell
# and render floats with thousands separators and two decimals.
pd.options.display.max_colwidth = 200
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Put the project's `src` directory and the project root on the import path
# (appended in this order) so the project helper below can be imported.
sys.path.extend([
    '/Users/staveshemesh/Projects/shstav2/token_voken/src',
    '/Users/staveshemesh/Projects/shstav2/token_voken',
])
from src.common.setup import syspath_append_projects
syspath_append_projects()

In [4]:
from src.common.path_resolvers import *
from src.common.constants import *
from src.common.display_utils import *
from src.common.commands import *
from src.common.status import *
from src.pipeline.video_to_frames import video_to_frames_and_delete

In [5]:
# Location of the input data: <data root>/<iteration>/dataframes/df_intervals_valid.csv
INPUT_DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
INPUT_ITERATION = '20210419_220655'
VALID_INTERVALS_PATH = os.path.join(
    os.path.join(INPUT_DATA_ROOT, INPUT_ITERATION),
    'dataframes/df_intervals_valid.csv',
)

In [27]:
# Where this notebook writes its outputs.
OUTPUT_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'

# Timestamp (local time, YYYYmmdd_HHMMSS) taken when this cell runs; used to tag output files.
TIMESTR = time.strftime("%Y%m%d_%H%M%S", time.localtime())

# Read Data

## df_intervals

In [7]:
# Load the valid-intervals table; `interval_id` is read as `object` instead of an inferred dtype.
# NOTE(review): presumably this keeps the IDs as strings exactly as stored in the file — confirm.
df_intervals = pd.read_csv(VALID_INTERVALS_PATH, dtype={'interval_id': object})

In [8]:
# Peek at the first two rows to sanity-check the loaded columns.
df_intervals.head(n=2)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count,video_downloaded,interval_video_path,interval_video_downloaded,interval_frames_dir,full_video_path
0,oliver,Tt-mpuR_QHQ,100912,True,20.75,00:10:26.55,00:10:47.31,https://www.youtube.com/watch?v=Tt-mpuR_QHQ,Puerto_Rico_-_Last_Week_Tonight_with_John_Oliver_HBO-Tt-mpuR_QHQ.webm,0 days 00:10:26.550000,...,True,False,True,True,True,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/100912.mp4,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/vokens/face_annot_224,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/Tt-mpuR_QHQ.mp4
1,oliver,Tt-mpuR_QHQ,100913,True,7.74,00:10:47.44,00:10:55.18,https://www.youtube.com/watch?v=Tt-mpuR_QHQ,Puerto_Rico_-_Last_Week_Tonight_with_John_Oliver_HBO-Tt-mpuR_QHQ.webm,0 days 00:10:47.440000,...,True,False,True,True,True,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/100913.mp4,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/Tt-mpuR_QHQ.mp4


In [9]:
# Show a bold heading naming the speaker, followed by the summary of the intervals table.
_heading_md = f'**Valid intervals, Speaker {SPEAKER_NAME}**:'
printmd(_heading_md)
display_df_info(df_intervals)

**Valid intervals, Speaker oliver**:

  ◘ Videos: #123
  ◘ Intervals: #2,118
  ◘ Total Duration: 08 hours, 13 minutues, 25 seconds (29,605 seconds)
  ◘ All are Youtube videos: True


## Interval Data Status

In [10]:
# Compute the frame-extraction status for the intervals table.
# NOTE(review): later cells read columns such as `frames_dir_exists` and
# `need_to_extract_frames` from `df_intervals`; presumably this call adds them
# to the frame in place — confirm against the helper's implementation.
status_frames(df_intervals)

2021-05-15 17:23:48,230 | INFO : Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-05-15 17:23:48,231 | INFO : NumExpr defaulting to 8 threads.


In [11]:
# Show the value counts of each pipeline-status column, in pipeline order.
# Each entry is (column in df_intervals, title handed to the display helper).
for _status_column, _status_title in [
    ('interval_video_downloaded', '[1] Downloaded Interval Video'),
    ('frames_dir_exists',         '[2] Frame Dir Exsits'),
    ('frames_dir_content_size',   '[3] Frames Dir Has Content'),
    ('has_completed_frames',      '[4] Has Completed Frames (Video -> Frames)'),
    ('has_detected_faces',        '[5] Detected Face'),
    ('need_to_extract_frames',    '[6] Need To Extract Frames'),
]:
    display_value_counts(df_intervals[_status_column], _status_title)

Unnamed: 0,interval_video_downloaded
True,1992
False,126


Unnamed: 0,frames_dir_exists
False,1487
True,631


Unnamed: 0,frames_dir_content_size
False,2078
True,40


Unnamed: 0,has_completed_frames
False,1697
True,421


Unnamed: 0,has_detected_faces
False,1764
True,354


Unnamed: 0,need_to_extract_frames
True,1696
False,422


In [12]:
# Batch = first 100 intervals that still need frame extraction and whose
# interval video is flagged as downloaded.
df_batch2 = df_intervals.loc[
    df_intervals['need_to_extract_frames']
    & df_intervals['status_interval_video_downloaded']
].head(100)
df_batch2.shape

(100, 34)

In [20]:
# Next batch: positions 100..499 of the intervals that still need frame
# extraction and whose interval video is flagged as downloaded.
df_batch3 = df_intervals.loc[
    df_intervals['need_to_extract_frames']
    & df_intervals['status_interval_video_downloaded']
].iloc[100:500]
df_batch3.shape

(400, 34)

In [21]:
# Parallel lists (same row order) of each interval's video file and its frames directory.
interval_video_paths, interval_frames_dirs = (
    df_batch3[column].tolist()
    for column in ('interval_video_path', 'interval_frames_dir')
)

In [22]:
# Spot-check two of the batch's interval video paths.
interval_video_paths[10:12]

['/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/_S2G8jhhUHg/102918/102918.mp4',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/QplQL5eAxlY/102919/102919.mp4']

In [23]:
# Spot-check the frames directories at the same two positions.
interval_frames_dirs[10:12]

['/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/_S2G8jhhUHg/102918/vokens/face_annot_224',
 '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/QplQL5eAxlY/102919/vokens/face_annot_224']

In [25]:
# Rebind `df_batch2` to the first 500 intervals that still need frame
# extraction and whose interval video is flagged as downloaded.
df_batch2 = df_intervals.loc[
    df_intervals['need_to_extract_frames']
    & df_intervals['status_interval_video_downloaded']
].head(500)
df_batch2.shape

(500, 34)

In [28]:
# Number of rows in the batch; the same value is embedded in the batch file name.
len(df_batch2)

500

In [29]:
# Batch file name encodes the batch size and the notebook's TIMESTR tag.
# NOTE(review): the name has no file extension although the file is written
# with `to_csv` — confirm whether downstream readers expect it that way.
batchfile = f'df_intervals_batch2_{len(df_batch2)}_{TIMESTR}'
batch_path = os.path.join(OUTPUT_ROOT, batchfile)

In [30]:
# Show the resolved output path before writing to it.
batch_path

'/Users/staveshemesh/Projects/shstav2/token_voken/data/df_intervals_batch2_500_20210515_195921'

In [31]:
# Persist the selected batch as CSV with a header row; `index` is left at its
# pandas default, so the DataFrame index is written as well.
df_batch2.to_csv(batch_path, header=True)

In [None]:
# Convert every interval video in the batch lists into frames.
# NOTE(review): `interval_video_paths` / `interval_frames_dirs` are built from
# `df_batch3`, while the batch CSV written by this notebook is `df_batch2` —
# confirm this is the intended set of intervals.
# NOTE(review): the helper's name suggests the source video is deleted once
# frames are written, and `override=True` presumably redoes existing output —
# confirm before re-running.
# `zip` has no length, so the total is passed explicitly; without it tqdm can
# only show a running count, not a percentage or ETA.
for interval_video_path, interval_frames_dir in tqdm(
    zip(interval_video_paths, interval_frames_dirs),
    total=len(interval_video_paths),
):
    video_to_frames_and_delete(interval_video_path, interval_frames_dir, override=True)