# Imports

In [1]:
import os
import sys
import h5py
import time
from subprocess import call
import youtube_dl

import numpy as np
import pandas as pd

import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
from ipywidgets import interact
from IPython.display import YouTubeVideo, Markdown, display, Video, Image

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
pd.set_option('display.max_colwidth', 200)
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken/src')
sys.path.append('/Users/staveshemesh/Projects/shstav2/token_voken')
from src.common.setup import syspath_append_projects
syspath_append_projects()
from src.common.path_resolvers import *
from src.common.constants import *
from src.common.display_utils import *
from src.common.commands import *
from src.common.status import *

# Constants

In [4]:
INTERVALS_PATH = '/Users/staveshemesh/Projects/shstav2/token_voken/data/20210418_005318/dataframes/df_intervals_oliver.csv'

In [5]:
# TIMESTR = time.strftime("%Y%m%d_%H%M%S")

# OUTPUT_ROOT = '/Users/staveshemesh/Projects/shstav2/token_voken/data/'
# OUTPUT_DIR = os.path.join(OUTPUT_ROOT, TIMESTR, 'dataframes')

In [6]:
COLS_VIEW = [
    'speaker', 'interval_id',
    'duration', 'start_time_string', 'end_time_string',
    'video_link'
]

# Read Data

## df_intervals

In [7]:
df_intervals = pd.read_csv(INTERVALS_PATH, dtype={'interval_id': object})

In [8]:
df_intervals.head(n=2)

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,end_time,org_start_time,org_end_time,max_frames_token,valid_duration,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count
0,oliver,DRauXXz6t0Y,214428,False,12.21,00:00:58.62,00:01:10.83,http://www.youtube.com/watch?v=DRauXXz6t0Y,Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm,0 days 00:00:58.620000,0 days 00:01:10.830000,0 days 00:00:58.620000,0 days 00:01:10.830000,88,True,True,False,True,False,True
1,oliver,DRauXXz6t0Y,214429,True,6.14,00:01:10.97,00:01:17.11,http://www.youtube.com/watch?v=DRauXXz6t0Y,Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm,0 days 00:01:10.970000,0 days 00:01:17.110000,0 days 00:01:10.970000,0 days 00:01:17.110000,31,True,True,False,True,True,True


In [9]:
printmd(f'**All intervals, Speaker {SPEAKER_NAME}**:')
display_df_info(df_intervals)

**All intervals, Speaker oliver**:

  ◘ Videos: #124
  ◘ Intervals: #4,629
  ◘ Total Duration: 18 hours, 15 minutues, 23 seconds (65,723 seconds)
  ◘ All are Youtube videos: True


## df_intervals_valid

In [10]:
df_intervals_valid = df_intervals[df_intervals['valid']].copy()

In [11]:
printmd(f'**VALID intervals, Speaker {SPEAKER_NAME}**:')
display_df_info(df_intervals_valid)

**VALID intervals, Speaker oliver**:

  ◘ Videos: #123
  ◘ Intervals: #2,118
  ◘ Total Duration: 08 hours, 13 minutues, 25 seconds (29,605 seconds)
  ◘ All are Youtube videos: True


# Download Youtube Videos

In [12]:
from src.pipeline._2_video_downloader import youtube_downloader

## Downaloder

In [13]:
video_id = '8-hahRWhFvg' # disambiguation
youtube_downloader(video_id)

2021-05-23 21:58:14,996 | INFO : ✅  youtube-dl -f bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio -o /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/8-hahRWhFvg https://www.youtube.com/watch?v=8-hahRWhFvg


In [14]:
video_id = 'P8pjd1QEA0c' # disambiguation
youtube_downloader(video_id)

2021-05-23 21:58:16,557 | INFO : ✅  youtube-dl -f bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio -o /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/P8pjd1QEA0c https://www.youtube.com/watch?v=P8pjd1QEA0c


## Pending Videos

In [15]:
def get_download_status():
    # want to downloaded
    to_download_video_links = df_intervals_valid['video_link'].unique()
    to_download_video_ids = [lnk[-VIDEO_ID_LEN:] for lnk in to_download_video_links]
    # already downloaded
    already_downloaded_video_ids = !ls {PATS_SPEAKER_VIZ_DIR}
    # assert unique ids
    assert len(to_download_video_links) == len(set(to_download_video_links))
    assert len(already_downloaded_video_ids) == len(set(already_downloaded_video_ids))
    # needs to download
    pending_download = set(already_downloaded_video_ids) - set(to_download_video_ids)
    print(f'Download Status: {len(already_downloaded_video_ids)} / {len(to_download_video_ids)} downloaded, {len(pending_download)} pending.')
    return pending_download, already_downloaded_video_ids

In [16]:
pending_download, already_downloaded_video_ids = get_download_status()

Download Status: 125 / 123 downloaded, 2 pending.


In [17]:
!ls {PATS_SPEAKER_VIZ_DIR}

[1m[36m-Z668Qc0P4Q[m[m [1m[36m8-hahRWhFvg[m[m [1m[36mIfi9M7DRazI[m[m [1m[36mWA0wKeokWUU[m[m [1m[36mfpbOEoRrHyU[m[m [1m[36mpf1t7cs9dkc[m[m
[1m[36m0Rnq1NpHdmw[m[m [1m[36m8YQ_HGvrHEU[m[m [1m[36mJ6lyURyVz7k[m[m [1m[36mWHCQndalv94[m[m [1m[36mfyVz5vgqBhE[m[m [1m[36mpoL7l-Uk3I8[m[m
[1m[36m1Y1ya-yF35g[m[m [1m[36m92vuuZt7wak[m[m [1m[36mK4NRJoCNHIs[m[m [1m[36mWe1IvUe6KLo[m[m [1m[36mgvZSpET11ZY[m[m [1m[36mqr6ar3xJL_Q[m[m
[1m[36m1ZNZY-gd3K0[m[m [1m[36m9PK-netuhHA[m[m [1m[36mKUdHIatS36A[m[m [1m[36mWpzvaqypav8[m[m [1m[36mh1Lfd1aB9YI[m[m [1m[36mr-ERajkMXw0[m[m
[1m[36m2sWRXr2Yu9g[m[m [1m[36mA-4dIImaodQ[m[m [1m[36mKye2oX-b39E[m[m [1m[36mWyGq6cjcc3Q[m[m [1m[36mhWQiXv0sn9Y[m[m [1m[36mrHFOwlMCdto[m[m
[1m[36m32n4h0kn-88[m[m [1m[36mAJKfs4ZnbNE[m[m [1m[36mLfgSEwjAeno[m[m [1m[36mXEVlyP4_11M[m[m [1m[36mhkYzuHMcP64[m[m [1m[36mrrawNvcF64g[m[m
[1m[36m3FCioWz7aps[m[m 

In [46]:
for video_id in pending_download:
    youtube_downloader(video_id)

✅  youtube-dl -f bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio -o /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/-Z668Qc0P4Q/-Z668Qc0P4Q https://www.youtube.com/watch?v=-Z668Qc0P4Q


In [47]:
pending_download, already_downloaded_video_ids = get_download_status()

Download Status: 124 / 123 downloaded, 1 pending.


# Crop Videos

In [49]:
df_intervals_valid['video_downloaded'] = df_intervals_valid['video_id'].isin(already_downloaded_video_ids)

In [55]:
display_value_counts(df_intervals_valid['video_downloaded'], 'Downloaded out of Valids')

Unnamed: 0,video_downloaded
True,2118


In [56]:
df_intervals_valid.sort_values(['video_id', 'start_time'], inplace=True)

In [59]:
df_intervals_valid[COLS_VIEW][40:45]

Unnamed: 0,speaker,interval_id,duration,start_time_string,end_time_string,video_link
4344,oliver,101087,16.53,00:15:06.43,00:15:22.96,http://www.youtube.com/watch?v=7VG_s2PCH_c
4347,oliver,101092,11.07,00:15:29.10,00:15:40.16,http://www.youtube.com/watch?v=7VG_s2PCH_c
4348,oliver,101093,5.67,00:15:40.30,00:15:45.96,http://www.youtube.com/watch?v=7VG_s2PCH_c
4349,oliver,101097,6.47,00:15:47.36,00:15:53.83,http://www.youtube.com/watch?v=7VG_s2PCH_c
3628,oliver,101103,14.0,00:19:48.50,00:20:02.50,http://www.youtube.com/watch?v=FVFdsl29s_Q


## Cropper

In [18]:
from src.pipeline._3_video_crop import crop_tool

In [15]:
df_intervals[df_intervals['interval_id'] == '101302']

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,end_time,org_start_time,org_end_time,max_frames_token,valid_duration,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count
2195,oliver,8-hahRWhFvg,101302,True,16.28,00:05:38.10,00:05:54.38,https://www.youtube.com/watch?v=8-hahRWhFvg,Mexican_Elections_-_Last_Week_Tonight_with_John_Oliver_HBO-8-hahRWhFvg.mp4,0 days 00:05:38.100000,0 days 00:05:54.380000,0 days 00:05:38.100000,0 days 00:05:54.380000,54,True,True,False,True,True,True


In [19]:
crop_tool(df_intervals[df_intervals['interval_id'] == '101302'].iloc[0])

2021-05-16 23:28:08,230 | INFO : ✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/8-hahRWhFvg.mp4" -ss 00:05:38.10 -to 00:05:54.38 -strict -2 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/8-hahRWhFvg/101302/101302.mp4" -y


In [21]:
crop_tool(df_intervals[df_intervals['interval_id'] == '101678'].iloc[0])

2021-05-17 11:35:00,090 | INFO : ✅  ffmpeg -i "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/P8pjd1QEA0c.mp4" -ss 00:12:29.51 -to 00:12:40.26 -strict -2 "/Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/101678/101678.mp4" -y


In [14]:
interval_row = df_intervals[df_intervals['interval_id'] == '101252'].iloc[0]
interval_row[['speaker', 'video_id', 'interval_id', 'start_time_string', 'end_time_string']]

speaker                   oliver
video_id             BcR_Wg42dv8
interval_id               101252
start_time_string    00:07:28.56
end_time_string      00:07:38.36
Name: 2086, dtype: object

In [11]:
df_intervals[df_intervals['interval_id'] == '101252']['video_id']

2086    BcR_Wg42dv8
Name: video_id, dtype: object

In [None]:
df_intervals_valid[1500:].progress_apply(crop_tool, axis=1);

In [107]:
df_intervals_valid['interval_id'] = df_intervals_valid['interval_id'].astype(str)

In [126]:
df_intervals_valid['interval_video_path'] = \
    PATS_SPEAKER_VIZ_DIR + '/' + \
    df_intervals_valid['video_id'] + '/' + \
    df_intervals_valid['interval_id'] + '/' + \
    df_intervals_valid['interval_id'] + '.mp4'

In [172]:
df_intervals_valid['full_video_path'] = \
    PATS_SPEAKER_VIZ_DIR + '/' + \
    df_intervals_valid['video_id'] + '/' + \
    df_intervals_valid['video_id'] + '.mp4'

In [174]:
df_intervals_valid['full_video_path'].apply(os.path.exists).value_counts()

True     1851
False     267
Name: full_video_path, dtype: int64

In [184]:
df_intervals_valid['full_video_path'].apply(os.path.exists).value_counts()

False    2118
Name: full_video_path, dtype: int64

In [183]:
for video_path in df_intervals_valid['full_video_path'].unique():
    if os.path.exists(video_path):
        print('delete: ', video_path)
        os.remove(video_path)

In [176]:
df_intervals_valid['interval_video_downloaded'].value_counts()

True     1992
False     126
Name: interval_video_downloaded, dtype: int64

In [144]:
df_intervals_valid['interval_frames_dir'] = \
    PATS_SPEAKER_VIZ_DIR + '/' + \
    df_intervals_valid['video_id'] + '/' + \
    df_intervals_valid['interval_id'] + \
    '/vokens/face_annot_224'

In [145]:
df_intervals_valid['interval_video_downloaded'] = df_intervals_valid['interval_video_path'].progress_apply(os.path.exists)

HBox(children=(FloatProgress(value=0.0, max=2118.0), HTML(value='')))




In [146]:
df_intervals_valid['interval_video_path'].head()

3033    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100912/100912.mp4
3034    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100913/100913.mp4
3036    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/Tt-mpuR_QHQ/100937/100937.mp4
3565    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/100945/100945.mp4
3567    /Users/staveshemesh/Projects/PATS_DATA/Youtube/oliver/P8pjd1QEA0c/100958/100958.mp4
Name: interval_video_path, dtype: object

In [159]:
os.makedirs(OUTPUT_DIR)

In [160]:
OUTPUT_DIR

'/Users/staveshemesh/Projects/shstav2/token_voken/data/20210419_220655/dataframes'

In [186]:
output_path = os.path.join(OUTPUT_DIR, 'df_intervals_valid.csv')
df_intervals_valid.to_csv(output_path, index=False, header=True)

In [196]:
df_intervals[df_intervals['interval_id'] == 216104]

Unnamed: 0,speaker,video_id,interval_id,valid,duration,start_time_string,end_time_string,video_link,video_fn,start_time,end_time,org_start_time,org_end_time,max_frames_token,valid_duration,valid_single_token_per_frame,video_downloded,valid_hd5,valid_max_token_duration,valid_frames_count
28,oliver,DRauXXz6t0Y,216104,False,69.34,00:04:32.47,00:05:41.80,http://www.youtube.com/watch?v=DRauXXz6t0Y,Abortion_Laws_-_Last_Week_Tonight_with_John_Oliver_HBO-DRauXXz6t0Y.webm,0 days 00:04:32.470000,0 days 00:05:41.800000,0 days 00:04:32.470000,0 days 00:05:41.800000,78,True,True,False,True,False,True


In [202]:
invalid_interval_ids = df_intervals[~df_intervals['valid']]['interval_id'].tolist()
print(f'Invalid Intervals: {len(invalid_interval_ids):,}')

Invalid Intervals: 2,511


In [207]:
PATS_INTERVALS_DIR

'/Users/staveshemesh/Projects/PATS_DATA/Processed/oliver/data/processed/oliver/'

In [209]:
for invalid_interval_id in invalid_interval_ids:
    invalid_interval_path = os.path.join(PATS_INTERVALS_DIR, str(invalid_interval_id) + '.h5')
    os.remove(invalid_interval_path)