# Imports

In [1]:
import os
import h5py
import time
from subprocess import call
import youtube_dl

import numpy as np
import pandas as pd

import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
from ipywidgets import interact
from IPython.display import YouTubeVideo, Markdown, display, Video, Image

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
pd.set_option('display.max_colwidth', 200)
pd.options.display.float_format = '{:,.2f}'.format

def printmd(string):
    display(Markdown(string))

# Constants

In [205]:
SPEAKER_NAME = 'oliver'
LIST_BULLET = '  ◘ '
TAB = '&nbsp;&nbsp;&nbsp;&nbsp;'
VIDEO_ID_LEN = 11
FRAME_RATE = 15

In [206]:
PATS_DATA_ROOT = '/Users/staveshemesh/Projects/PATS_DATA/'
PATS_INTERVALS_DIR = os.path.join(PATS_DATA_ROOT, f'Processed/{SPEAKER_NAME}/data/processed/{SPEAKER_NAME}/')

INTERVALS_PATH = '/Users/staveshemesh/Projects/shstav2/token_voken/data/20210418_005318/dataframes/df_intervals_oliver.csv'

# /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver
PATS_SPEAKER_VIZ_DIR = os.path.join(PATS_DATA_ROOT, f'Youtube/{SPEAKER_NAME}')

In [158]:
TIMESTR = time.strftime("%Y%m%d_%H%M%S")

OUTPUT_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
OUTPUT_DIR = os.path.join(OUTPUT_ROOT, TIMESTR, 'dataframes')

In [54]:
COLS_VIEW = [
    'speaker', 'interval_id',
    'duration', 'start_time_string', 'end_time_string',
    'video_link'
]

# Helpers

## Resolvers

## Display Utils

In [5]:
def printmd(string):
    display(Markdown(string))

CAPTION_STYLE = {
    'selector': 'caption',
    'props': [
        ('color', 'navy'),
        ('font-size', '16px')
    ]
}

def display_df_with_caption(df, title):
    return df.style.set_caption(title).set_table_styles([CAPTION_STYLE])

def display_value_counts(series, title):
    df_value_counts = series.value_counts().to_frame().head()
    df_style = display_df_with_caption(df_value_counts, title)
    display(df_style)
    
def display_df_info(df):
    print(f'{LIST_BULLET}Videos: #{df["video_link"].nunique():,}')
    print(f'{LIST_BULLET}Intervals: #{df["interval_id"].nunique():,}')
    total_duration = df["duration"].sum()
    total_duration_string = time.strftime('%H hours, %M minutues, %S seconds', time.gmtime(total_duration))
    print(f'{LIST_BULLET}Total Duration: {total_duration_string} ({int(total_duration):,} seconds)')
    all_youtube = df_intervals['video_link'].str.contains('youtube').all()
    print(f'{LIST_BULLET}All are Youtube videos: {all_youtube}')

# Read Data

## df_intervals

In [6]:
df_intervals = pd.read_csv(INTERVALS_PATH)

In [7]:
df_intervals.head(n=2)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,end_time,org_start_time,org_end_time,max_frames_token,valid_duration,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count
0,oliver,DRauXXz6t0Y,214428,False,12.21,00:00:58.62,00:01:10.83,http://www.youtube.com/watch?v=DRauXXz6t0Y,Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm,0 days 00:00:58.620000,0 days 00:01:10.830000,0 days 00:00:58.620000,0 days 00:01:10.830000,88,True,True,False,True,False,True
1,oliver,DRauXXz6t0Y,214429,True,6.14,00:01:10.97,00:01:17.11,http://www.youtube.com/watch?v=DRauXXz6t0Y,Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm,0 days 00:01:10.970000,0 days 00:01:17.110000,0 days 00:01:10.970000,0 days 00:01:17.110000,31,True,True,False,True,True,True


In [11]:
printmd(f'**All intervals, Speaker {SPEAKER_NAME}**:')
display_df_info(df_intervals)

**All intervals, Speaker oliver**:

  ◘ Videos: #124
  ◘ Intervals: #4,629
  ◘ Total Duration: 18 hours, 15 minutues, 23 seconds (65,723 seconds)
  ◘ All are Youtube videos: True


## df_intervals_valid

In [40]:
df_intervals_valid = df_intervals[df_intervals['valid']].copy()

In [41]:
printmd(f'**VALID intervals, Speaker {SPEAKER_NAME}**:')
display_df_info(df_intervals_valid)

**VALID intervals, Speaker oliver**:

  ◘ Videos: #123
  ◘ Intervals: #2,118
  ◘ Total Duration: 08 hours, 13 minutues, 25 seconds (29,605 seconds)
  ◘ All are Youtube videos: True


# Download Youtube Videos

## Downaloder

In [42]:
SUCCESS_RETURN_CODE = 0

def run_command(command):
    result = call(command, shell=True)
    success = result == SUCCESS_RETURN_CODE
    status_symbol = '✅ ' if success else '❌'
    print(f'{status_symbol} {command}')

def youtube_downloader(video_id):
    link = f'https://www.youtube.com/watch?v={video_id}'
    output_dir = os.path.join(PATS_SPEAKER_VIZ_DIR, video_id)
    output_path = os.path.join(output_dir, video_id)
    if not(os.path.exists(os.path.dirname(output_dir))):
        os.makedirs(os.path.dirname(output_dir))
    command = 'youtube-dl -f bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio -o {output_path} {link}'.format(link=link, output_path=output_path)
    run_command(command)

## Pending Videos

In [44]:
def get_download_status():
    # want to downloaded
    to_download_video_links = df_intervals_valid['video_link'].unique()
    to_download_video_ids = [lnk[-VIDEO_ID_LEN:] for lnk in to_download_video_links]
    # already downloaded
    already_downloaded_video_ids = !ls {PATS_SPEAKER_VIZ_DIR}
    # assert unique ids
    assert len(to_download_video_links) == len(set(to_download_video_links))
    assert len(already_downloaded_video_ids) == len(set(already_downloaded_video_ids))
    # needs to download
    pending_download = set(already_downloaded_video_ids) - set(to_download_video_ids)
    print(f'Download Status: {len(already_downloaded_video_ids)} / {len(to_download_video_ids)} downloaded, {len(pending_download)} pending.')
    return pending_download, already_downloaded_video_ids

In [45]:
pending_download, already_downloaded_video_ids = get_download_status()

Download Status: 124 / 123 downloaded, 1 pending.


In [46]:
for video_id in pending_download:
    youtube_downloader(video_id)

✅  youtube-dl -f bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio -o /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/-Z668Qc0P4Q/-Z668Qc0P4Q https://www.youtube.com/watch?v=-Z668Qc0P4Q


In [47]:
pending_download, already_downloaded_video_ids = get_download_status()

Download Status: 124 / 123 downloaded, 1 pending.


# Crop Videos

In [49]:
df_intervals_valid['video_downloaded'] = df_intervals_valid['video_id'].isin(already_downloaded_video_ids)

In [55]:
display_value_counts(df_intervals_valid['video_downloaded'], 'Downloaded out of Valids')

Unnamed: 0,video_downloaded
True,2118


In [56]:
df_intervals_valid.sort_values(['video_id', 'start_time'], inplace=True)

In [59]:
df_intervals_valid[COLS_VIEW][40:45]

Unnamed: 0,speaker,interval_id,duration,start_time_string,end_time_string,video_link
4344,oliver,101087,16.53,00:15:06.43,00:15:22.96,http://www.youtube.com/watch?v=7VG_s2PCH_c
4347,oliver,101092,11.07,00:15:29.10,00:15:40.16,http://www.youtube.com/watch?v=7VG_s2PCH_c
4348,oliver,101093,5.67,00:15:40.30,00:15:45.96,http://www.youtube.com/watch?v=7VG_s2PCH_c
4349,oliver,101097,6.47,00:15:47.36,00:15:53.83,http://www.youtube.com/watch?v=7VG_s2PCH_c
3628,oliver,101103,14.0,00:19:48.50,00:20:02.50,http://www.youtube.com/watch?v=FVFdsl29s_Q


## Cropper

In [96]:
def get_video_id(interval_id):
    df_interval = df_intervals_valid[df_intervals_valid['interval_id'] == interval_id]
    return df_interval.iloc[0]['video_id']
    
def resolve_interval_video_path(interval_id):
    video_id = get_video_id(interval_id)
    video_dir = os.path.join(PATS_DATA_ROOT, 'Youtube', SPEAKER_NAME, video_id)    
    interval_path = os.path.join(video_dir, interval_id, f'{interval_id}.mp4')
    return interval_path

def resolve_interval_frames_dir(interval_id, create=True):
    interval_video_path = resolve_interval_video_path(interval_id)
    interval_video_dir = os.path.dirname(interval_video_path)
    inetrval_frames_dir = os.path.join(interval_video_dir, 'frames')
    if create and not os.path.exists(inetrval_frames_dir):
        os.makedirs(inetrval_frames_dir)
    return inetrval_frames_dir

def save_interval(input_fn, start, end, output_fn):
    """
        -strict: Specify how strictly to follow the standards
        -y: Overwrite output files without asking
    """
    command = 'ffmpeg -i "%s" -ss %s -to %s -strict -2 "%s" -y' % (input_fn, start, end, output_fn)
    run_command(command)

def crop_tool(interval_row):
    speaker, video_id, interval_id, start, end = interval_row[['speaker', 'video_id', 'interval_id', 'start_time_string', 'end_time_string']]
    
    video_dir = os.path.join(PATS_SPEAKER_VIZ_DIR, video_id)
    video_path = os.path.join(video_dir, f'{video_id}.mp4')
    
    #TODO: use get_interval_video_path
    interval_dir = os.path.join(video_dir, str(interval_id))
    interval_path = os.path.join(interval_dir, f'{interval_id}.mp4')
    
    if not(os.path.exists(os.path.dirname(interval_path))):
        os.makedirs(os.path.dirname(interval_path))
        save_interval(video_path, start, end, interval_path)

In [None]:
df_intervals_valid[1500:].progress_apply(crop_tool, axis=1);

In [107]:
df_intervals_valid['interval_id'] = df_intervals_valid['interval_id'].astype(str)

In [126]:
df_intervals_valid['interval_video_path'] = \
    PATS_SPEAKER_VIZ_DIR + '/' + \
    df_intervals_valid['video_id'] + '/' + \
    df_intervals_valid['interval_id'] + '/' + \
    df_intervals_valid['interval_id'] + '.mp4'

In [172]:
df_intervals_valid['full_video_path'] = \
    PATS_SPEAKER_VIZ_DIR + '/' + \
    df_intervals_valid['video_id'] + '/' + \
    df_intervals_valid['video_id'] + '.mp4'

In [174]:
df_intervals_valid['full_video_path'].apply(os.path.exists).value_counts()

True     1851
False     267
Name: full_video_path, dtype: int64

In [184]:
df_intervals_valid['full_video_path'].apply(os.path.exists).value_counts()

False    2118
Name: full_video_path, dtype: int64

In [183]:
for video_path in df_intervals_valid['full_video_path'].unique():
    if os.path.exists(video_path):
        print('delete: ', video_path)
        os.remove(video_path)

In [176]:
df_intervals_valid['interval_video_downloaded'].value_counts()

True     1992
False     126
Name: interval_video_downloaded, dtype: int64

In [144]:
df_intervals_valid['interval_frames_dir'] = \
    PATS_SPEAKER_VIZ_DIR + '/' + \
    df_intervals_valid['video_id'] + '/' + \
    df_intervals_valid['interval_id'] + \
    '/vokens/face_annot_224'

In [145]:
df_intervals_valid['interval_video_downloaded'] = df_intervals_valid['interval_video_path'].progress_apply(os.path.exists)

HBox(children=(FloatProgress(value=0.0, max=2118.0), HTML(value='')))




In [146]:
df_intervals_valid['interval_video_path'].head()

3033    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/100912.mp4
3034    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/100913.mp4
3036    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100937/100937.mp4
3565    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/100945/100945.mp4
3567    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/100958/100958.mp4
Name: interval_video_path, dtype: object

In [159]:
os.makedirs(OUTPUT_DIR)

In [160]:
OUTPUT_DIR

'/Users/staveshemesh/Projects/shstav2/token_voken/data/20210419_220655/dataframes'

In [186]:
output_path = os.path.join(OUTPUT_DIR, 'df_intervals_valid.csv')
df_intervals_valid.to_csv(output_path, index=False, header=True)

In [196]:
df_intervals[df_intervals['interval_id'] == 216104]

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,end_time,org_start_time,org_end_time,max_frames_token,valid_duration,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count
28,oliver,DRauXXz6t0Y,216104,False,69.34,00:04:32.47,00:05:41.80,http://www.youtube.com/watch?v=DRauXXz6t0Y,Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm,0 days 00:04:32.470000,0 days 00:05:41.800000,0 days 00:04:32.470000,0 days 00:05:41.800000,78,True,True,False,True,False,True


In [202]:
invalid_interval_ids = df_intervals[~df_intervals['valid']]['interval_id'].tolist()
print(f'Invalid Intervals: {len(invalid_interval_ids):,}')

Invalid Intervals: 2,511


In [207]:
PATS_INTERVALS_DIR

'/Users/staveshemesh/Projects/PATS_DATA/Processed/oliver/data/processed/oliver/'

In [209]:
for invalid_interval_id in invalid_interval_ids:
    invalid_interval_path = os.path.join(PATS_INTERVALS_DIR, str(invalid_interval_id) + '.h5')
    os.remove(invalid_interval_path)