# Investigating Discrepancies in Amount of Frames Extracted

In [121]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import pandas as pd
import numpy as np

from data_preprocessing import get_video_frames

In [3]:
#load video clip info dataframe
workspace_path = '/mount/data'
downloads_df = pd.read_csv(workspace_path + '/downloaded_videos.csv')
downloads_df.head()

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end,duration,frames_collected,frames_reported,frames_reported_decord
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,10,25,266.0,361.0,473.0,473.0
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,44,59,269.0,360.0,441.0,441.0
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,5,20,102.0,361.0,398.0,398.0
3,jfPzlFLEuKk,https://youtu.be/jfPzlFLEuKk,"Humpback Whale || Description, Characteristics...",creativeCommon,True,video_0003.mp4,160,175,238.0,450.0,456.0,456.0
4,xGasapMoy4I,https://youtu.be/xGasapMoy4I,Your Earth Is Blue: Humpback Whales,creativeCommon,True,video_0004.mp4,0,15,83.0,451.0,451.0,451.0


# Clean up frame directory (testing)

In [141]:
#remove frames from test frame directory
import os
import glob

# files = glob.glob('/workspace/youtube-humpback-whale-classifier/classification/frames/clip_0000_frame*.jpg')

#DELETING FROM WORKSPACE
files = glob.glob(workspace_path + '/frames/clip*.jpg')

for f in files:
    os.remove(f)

# Save frames as .jpg files in workspace with frame count limit 

`max_frames = 461`

More about Decord: 

- https://towardsdatascience.com/lightning-fast-video-reading-in-python-c1438771c4e6 

- https://github.com/dmlc/decord

- https://medium.com/@haydenfaulkner/extracting-frames-fast-from-a-video-using-opencv-and-python-73b9b7dc9661

In [120]:
downloads_df.head(3)

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end,duration,frames_collected_opencv,frames_reported_opencv,frames_reported_decord
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,10,25,266.0,361.0,473.0,473.0
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,44,59,269.0,360.0,441.0,441.0
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,5,20,102.0,361.0,398.0,398.0


In [None]:
downloads_df['frames_collected_decord'] = [np.NaN for i in range(len(downloads_df))]
downloads_df['frames_reported_decord'] = [np.NaN for i in range(len(downloads_df))]

In [4]:
from decord import cpu, gpu

for i, row in downloads_df.iterrows(): 
    clip = row['renamed_title'].replace('_', '_clip_')
    
    #extract and save frames as .jpg files in frame folder 
    actual, reported = get_video_frames(clip, max_frames=461, context=cpu(0), resize=(224, 224))
    
    #store frame count metrics for bookkeeping
    downloads_df.at[i, ('frames_collected_decord')] = actual
    downloads_df.at[i, ('frames_reported_decord')] = reported

print('Done saving frames.')

363
Done checking frames
461 461


In [133]:
actual, reported = get_video_frames('video_clip_0002.mp4', max_frames=461, context=cpu(0), resize=(224, 224))

# Multinode Processing for Downloading Frames

Source: https://medium.com/@haydenfaulkner/extracting-frames-fast-from-a-video-using-opencv-and-python-73b9b7dc9661

In [134]:
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing

In [137]:
import tensorflow as tf
print("Num GPUs available: ", len(tf.config.list_physical_devices('GPU')))
print("Num CPUs available: ", len(tf.config.list_physical_devices('CPU')))

Num GPUs available:  2
Num CPUs available:  1


In [139]:
multiprocessing.cpu_count()

80

In [140]:
# execute across multiple cpu cores to speed up processing, get the count automatically
max_frames = 461
context = cpu(0)
resize=(224,224)

with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:

    # submit the processes: extract_frames(...)
    futures = [executor.submit(get_video_frames, video.replace('_', '_clip_'), max_frames, context, resize)
               for video in downloads_df.renamed_title] 

    for i, f in enumerate(as_completed(futures)):  # as each process completes
        print(i , f)  # print it's progress

0 <Future at 0x7f708d534c70 state=finished raised PicklingError>
1 <Future at 0x7f708d550f40 state=finished raised PicklingError>
2 <Future at 0x7f708d5553d0 state=finished raised PicklingError>
3 <Future at 0x7f708d5554f0 state=finished raised PicklingError>
4 <Future at 0x7f708d5555e0 state=finished raised PicklingError>
5 <Future at 0x7f708d5556d0 state=finished raised PicklingError>
6 <Future at 0x7f708d5557c0 state=finished raised PicklingError>
7 <Future at 0x7f708d5558b0 state=finished raised PicklingError>
8 <Future at 0x7f708d5559a0 state=finished raised PicklingError>
9 <Future at 0x7f708d555a90 state=finished raised PicklingError>
10 <Future at 0x7f708d555b80 state=finished raised PicklingError>
11 <Future at 0x7f708d555c70 state=finished raised PicklingError>
12 <Future at 0x7f708d555d60 state=finished raised PicklingError>
13 <Future at 0x7f708d555e50 state=finished raised PicklingError>
14 <Future at 0x7f708d555f40 state=finished raised PicklingError>
15 <Future at 0x7f72

Process ForkProcess-75:
Process ForkProcess-33:
Process ForkProcess-28:
Process ForkProcess-78:
Process ForkProcess-4:
Traceback (most recent call last):
Process ForkProcess-44:
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Process ForkProcess-22:
Process ForkProcess-67:
Process ForkProcess-2:
  File "/usr/lib/python3.8/concurrent/futures/process.py", line 233, in _process_worker
    call_item = call_queue.get(block=True)
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 96, in get
    with self._rlock:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstra

KeyboardInterrupt: 

# Decord Key Indices

In [131]:
#read video + get key indices + frames
# from decord import VideoReader
# vr = VideoReader(workspace_path + '/video_clips/video_clip_0002.mp4', ctx=cpu(0), width=224, height=224)
# key_indices = vr.get_key_indices()
# key_frames =  vr.get_batch(key_indices)
# key_indices

[0, 45, 165, 173, 221, 301, 374]

# Save Changes to Downloads DF

In [118]:
# update csv with frame counts (actual + reported)
downloads_df.to_csv(workspace_path + '/downloaded_videos.csv', index=False)
downloads_df.to_csv('/workspace/youtube-humpback-whale-classifier/data/downloaded_videos.csv', index=False)