# Imports

In [30]:
import os
import h5py
import time
from subprocess import call
import youtube_dl

import numpy as np
import pandas as pd

import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
from ipywidgets import interact
from IPython.display import YouTubeVideo, Markdown, display, Video, Image

from tqdm.notebook import tqdm
tqdm.pandas()

In [31]:
pd.set_option('display.max_colwidth', 200)
pd.options.display.float_format = '{:,.2f}'.format

def printmd(string):
    display(Markdown(string))

# Constants

In [32]:
PATS_DATA_ROOT = '/Users/staveshemesh/Projects/PATS_DATA/'

INPUT_DATA_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
INPUT_ITERATION = '20210419_220655'
VALID_INTERVALS_PATH = os.path.join(INPUT_DATA_ROOT, INPUT_ITERATION, 'dataframes/df_intervals_valid.csv')

SPEAKER_NAME = 'oliver'
# /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver
PATS_SPEAKER_VIZ_DIR = os.path.join(PATS_DATA_ROOT, f'Youtube/{SPEAKER_NAME}')

In [33]:
TIMESTR = time.strftime("%Y%m%d_%H%M%S")

OUTPUT_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, TIMESTR, 'dataframes')

In [34]:
LIST_BULLET = '  ◘ '
TAB = '&nbsp;&nbsp;&nbsp;&nbsp;'
VIDEO_ID_LEN = 11
FRAME_RATE = 15

In [35]:
COLS_VIEW = [
    'speaker', 'interval_id',
    'duration', 'start_time_string', 'end_time_string',
    'video_link'
]

# Helpers

## Resolvers

## Display Utils

In [36]:
def printmd(string):
    display(Markdown(string))

CAPTION_STYLE = {
    'selector': 'caption',
    'props': [
        ('color', 'navy'),
        ('font-size', '16px')
    ]
}

def display_df_with_caption(df, title):
    return df.style.set_caption(title).set_table_styles([CAPTION_STYLE])

def display_value_counts(series, title):
    df_value_counts = series.value_counts().to_frame().head()
    df_style = display_df_with_caption(df_value_counts, title)
    display(df_style)
    
def display_df_info(df):
    print(f'{LIST_BULLET}Videos: #{df["video_link"].nunique():,}')
    print(f'{LIST_BULLET}Intervals: #{df["interval_id"].nunique():,}')
    total_duration = df["duration"].sum()
    total_duration_string = time.strftime('%H hours, %M minutues, %S seconds', time.gmtime(total_duration))
    print(f'{LIST_BULLET}Total Duration: {total_duration_string} ({int(total_duration):,} seconds)')
    all_youtube = df_intervals['video_link'].str.contains('youtube').all()
    print(f'{LIST_BULLET}All are Youtube videos: {all_youtube}')

## Commands

In [37]:
SUCCESS_RETURN_CODE = 0

def run_command(command):
    result = call(command, shell=True)
    success = result == SUCCESS_RETURN_CODE
    status_symbol = '✅ ' if success else '❌'
    print(f'{status_symbol} {command}')

# Read Data

## df_intervals

In [38]:
df_intervals = pd.read_csv(VALID_INTERVALS_PATH, dtype={'interval_id': object})

In [39]:
df_intervals.head(n=2)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,...,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count,video_downloaded,interval_video_path,interval_video_downloaded,interval_frames_dir,full_video_path
0,oliver,Tt-mpuR_QHQ,100912,True,20.75,00:10:26.55,00:10:47.31,https://www.youtube.com/watch?v=Tt-mpuR_QHQ,Puerto_Rico_-_Last_Week_Tonight_with_John_Oliver_HBO-Tt-mpuR_QHQ.webm,0 days 00:10:26.550000,...,True,False,True,True,True,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/100912.mp4,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/vokens/face_annot_224,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/Tt-mpuR_QHQ.mp4
1,oliver,Tt-mpuR_QHQ,100913,True,7.74,00:10:47.44,00:10:55.18,https://www.youtube.com/watch?v=Tt-mpuR_QHQ,Puerto_Rico_-_Last_Week_Tonight_with_John_Oliver_HBO-Tt-mpuR_QHQ.webm,0 days 00:10:47.440000,...,True,False,True,True,True,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/100913.mp4,True,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/vokens/face_annot_224,/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/Tt-mpuR_QHQ.mp4


In [40]:
printmd(f'**Valid intervals, Speaker {SPEAKER_NAME}**:')
display_df_info(df_intervals)

**Valid intervals, Speaker oliver**:

  ◘ Videos: #123
  ◘ Intervals: #2,118
  ◘ Total Duration: 08 hours, 13 minutues, 25 seconds (29,605 seconds)
  ◘ All are Youtube videos: True


# Video → Frames

In [41]:
START_FRAME = 0

In [46]:
display_value_counts(df_intervals['interval_video_downloaded'], 'Downloaded Interval Video')

Unnamed: 0,interval_video_downloaded
True,1992
False,126


In [61]:
def video_to_frames_and_delete(interval_video_path, interval_frames_dir):
    video_has_frames = os.path.exists(interval_frames_dir) and os.path.getsize(interval_frames_dir) > 0
    if not video_has_frames:
        video_to_frames(interval_video_path, interval_frames_dir)
    delete_video(interval_video_path)

def video_to_frames(interval_video_path, interval_frames_dir):
    os.makedirs(interval_frames_dir)
    command = f'ffmpeg -i "{interval_video_path}" -start_number {START_FRAME} -r {FRAME_RATE} "{interval_frames_dir}/$filename%05d.png"'
    run_command(command)
    
def delete_video(interval_video_path):
    if os.path.exists(interval_video_path):
        os.remove(interval_video_path)

In [62]:
df_batch = df_intervals[:800]
interval_video_paths = df_batch['interval_video_path'].tolist()
interval_frames_dir = df_batch['interval_frames_dir'].tolist()

In [64]:
for interval_video_path, interval_frames_dir in tqdm(zip(interval_video_paths, interval_frames_dir)):
    video_to_frames_and_delete(interval_video_path, interval_frames_dir)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/100913.mp4" -start_number 0 -r 15 "U/$filename%05d.png"
❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100937/100937.mp4" -start_number 0 -r 15 "s/$filename%05d.png"
❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/100945/100945.mp4" -start_number 0 -r 15 "e/$filename%05d.png"
❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/100958/100958.mp4" -start_number 0 -r 15 "r/$filename%05d.png"
❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100976/100976.mp4" -start_number 0 -r 15 "t/$filename%05d.png"
❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100978/100978.mp4" -start_number 0 -r 15 "a/$filename%05d.png"
❌ ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/hWQiXv0sn9Y/100983/100983.mp4" -start_number 0 -r 15 "v/$filename%05d.png"
❌ ffmp

In [68]:
df_batch = df_intervals[800:1600]
interval_video_paths = df_batch['interval_video_path'].tolist()
interval_frames_dir = df_batch['interval_frames_dir'].tolist()

In [69]:
for interval_video_path, interval_frames_dir in tqdm(zip(interval_video_paths, interval_frames_dir)):
    video_to_frames_and_delete(interval_video_path, interval_frames_dir)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/PDylgzybWAw/104958/104958.mp4" -start_number 0 -r 15 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/PDylgzybWAw/104958/vokens/face_annot_224/$filename%05d.png"
✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/nG2pEffLEJo/104959/104959.mp4" -start_number 0 -r 15 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/nG2pEffLEJo/104959/vokens/face_annot_224/$filename%05d.png"
✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/4Z4j2CrJRn4/104973/104973.mp4" -start_number 0 -r 15 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/4Z4j2CrJRn4/104973/vokens/face_annot_224/$filename%05d.png"
✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/bq2_wSsDwkQ/104975/104975.mp4" -start_number 0 -r 15 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/bq2_wSsDwkQ/104975/vokens/face_annot_224/$filename%05d.png"
✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/You

OSError: [Errno 28] No space left on device: '/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/LfgSEwjAeno/105222/vokens'