# Investigating Discrepancies in Amount of Frames Extracted

Playground for figuring out why OpenCV was not extracting all of the frames reported in a video's frame count. Ended up using the decord library for frame extraction in `download_frames_parallel.py`.

This notebook has no other use besides trying methods out and exploring the problem mentioned above.

In [121]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
import pandas as pd
import numpy as np

from data_preprocessing import get_video_frames

In [3]:
#read the per-video clip metadata from the workspace into a dataframe
workspace_path = '/mount/data'
downloads_csv_path = f'{workspace_path}/downloaded_videos.csv'
downloads_df = pd.read_csv(downloads_csv_path)

#show the first rows as the cell output
downloads_df.head()

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end,duration,frames_collected,frames_reported,frames_reported_decord
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,10,25,266.0,361.0,473.0,473.0
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,44,59,269.0,360.0,441.0,441.0
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,5,20,102.0,361.0,398.0,398.0
3,jfPzlFLEuKk,https://youtu.be/jfPzlFLEuKk,"Humpback Whale || Description, Characteristics...",creativeCommon,True,video_0003.mp4,160,175,238.0,450.0,456.0,456.0
4,xGasapMoy4I,https://youtu.be/xGasapMoy4I,Your Earth Is Blue: Humpback Whales,creativeCommon,True,video_0004.mp4,0,15,83.0,451.0,451.0,451.0


# Clean up frame directory (testing)

In [163]:
#wipe previously extracted clip frames out of the test frame directory
import os
import glob

# files = glob.glob('/workspace/youtube-humpback-whale-classifier/classification/frames/clip_0000_frame*.jpg')

#DELETING FROM WORKSPACE
frame_pattern = workspace_path + '/frames/clip*.jpg'
files = glob.glob(frame_pattern)

#remove every matching .jpg one by one
for stale_frame in files:
    os.remove(stale_frame)

# Save frames as .jpg files in workspace with frame count limit 

`max_frames = 461`

More about Decord: 

- https://towardsdatascience.com/lightning-fast-video-reading-in-python-c1438771c4e6 

- https://github.com/dmlc/decord

- https://medium.com/@haydenfaulkner/extracting-frames-fast-from-a-video-using-opencv-and-python-73b9b7dc9661

In [160]:
#preview the first three rows of the clip info dataframe
downloads_df.head(3)

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end,duration,frames_collected_opencv,frames_reported_opencv,frames_reported_decord
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,10,25,266.0,361.0,473.0,473.0
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,44,59,269.0,360.0,441.0,441.0
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,5,20,102.0,361.0,398.0,398.0


In [133]:
#trying out getting video frames for one video
#NOTE(review): `cpu` is not imported in any visible cell -- presumably decord's cpu; confirm it is in scope before running
actual, reported = get_video_frames('video_clip_0002.mp4', max_frames=461, context=cpu(0), resize=(224, 224))

In [4]:
#run frame extraction for every video listed in the dataframe
for row_idx, video_name in downloads_df['renamed_title'].items():
    #clip files carry a '_clip_' infix, e.g. video_0000.mp4 -> video_clip_0000.mp4
    clip_name = video_name.replace('_', '_clip_')

    #write the clip's frames to the frame folder as .jpg files
    actual, reported = get_video_frames(
        clip_name,
        max_frames=461,
        context=cpu(0),
        resize=(224, 224),
    )

    #record both counts on the row for bookkeeping
    downloads_df.at[row_idx, 'frames_collected_decord'] = actual
    downloads_df.at[row_idx, 'frames_reported_decord'] = reported

print('Done saving frames.')

363
Done checking frames
461 461


# Testing Out Multiprocessing for Downloading Frames

Source: https://medium.com/@haydenfaulkner/extracting-frames-fast-from-a-video-using-opencv-and-python-73b9b7dc9661

In [159]:
import sys, os
from multiprocessing import Pool, cpu_count
from functools import partial
import cv2
import matplotlib.pyplot as plt

def get_video_frames(video_title, max_frames, context=cpu(0), resize=(224,224)):
    ''' Save the frames of one video clip as .jpg images, up to a frame limit.

    Parameters
    ----------
    video_title : str
        File name of the clip inside `<workspace_path>/video_clips/`. The clip
        number used in the frame file names is the third underscore-separated
        field without its extension, e.g. 'video_clip_0002.mp4' -> '0002'.
    max_frames : int or None
        Upper limit on the number of frames saved. Only the first `max_frames`
        frames are written; None saves every frame of the clip.
    context :
        Forwarded to VideoReader as `ctx`.
    resize : tuple
        (width, height) requested from VideoReader and applied to each frame.

    Returns
    -------
    tuple
        (number of frames saved, total number of frames reported by the reader)

    NOTE(review): `VideoReader` and `cpu` are not imported in any visible cell
    (presumably `from decord import VideoReader, cpu`) -- confirm they are in
    scope before this definition runs, since the `cpu(0)` default is evaluated
    when the function is defined.
    '''

    #get clip number for frame naming
    clip_number = video_title.split('_')[2].split('.')[0]

    #read video
    vr = VideoReader(workspace_path + '/video_clips/' + video_title, ctx=context, width=resize[0], height=resize[1])
    num_total_frames = len(vr)

    #honour the frame limit: never collect more than max_frames frames
    if max_frames is None:
        num_frames_collected = num_total_frames
    else:
        num_frames_collected = max(0, min(num_total_frames, max_frames))

    #save frames as jpg images
    for i in range(num_frames_collected):
        frame = vr[i].asnumpy()
        #presumably a no-op because the reader was already asked for this size; kept as a safeguard
        frame = cv2.resize(frame, resize)

        #reorder color channels (will leave out for now)
        #frame = frame[:, :, [2, 1, 0]]

        #save frame image in directory (join avoids the doubled '/' in the path)
        frame_path = os.path.join(workspace_path, 'frames', 'clip_%s_frame_%d.jpg' % (clip_number, i))
        plt.imsave(frame_path, frame)

    #return frame numbers to double check functionality
    return num_frames_collected, num_total_frames

#having frame extraction run in parallel
video_titles = list(downloads_df.renamed_title)[0:5]
clip_titles = [video.replace('_', '_clip_') for video in video_titles]

print("There are {} CPUs on this machine ".format(cpu_count()))

#one worker per clip is enough; never start more processes than there are CPUs
num_workers = max(1, min(cpu_count(), len(clip_titles)))
pool = Pool(num_workers)

#fix the shared arguments so pool.map only has to supply the clip title
download_frames_func = partial(get_video_frames, max_frames = 461, resize=(224,224))

try:
    #results holds one (frames collected, frames reported) tuple per clip, in clip_titles order
    results = pool.map(download_frames_func, clip_titles)
finally:
    #always release the workers, even when extraction of a clip fails
    pool.close()

    # wait for the worker processes to terminate.
    pool.join()

There are 80 CPUs on this machine 


# Decord Key Indices

In [131]:
#read video + get key indices + frames
# from decord import VideoReader
# vr = VideoReader(workspace_path + '/video_clips/video_clip_0002.mp4', ctx=cpu(0), width=224, height=224)
# key_indices = vr.get_key_indices()
# key_frames =  vr.get_batch(key_indices)
# key_indices

[0, 45, 165, 173, 221, 301, 374]

# Save Changes to Downloads DF

In [118]:
# write the dataframe with its frame counts (actual + reported) to both csv copies
csv_destinations = [
    workspace_path + '/downloaded_videos.csv',
    '/workspace/youtube-humpback-whale-classifier/data/downloaded_videos.csv',
]

for csv_destination in csv_destinations:
    downloads_df.to_csv(csv_destination, index=False)