# Mark Subclip Intervals

This notebook is used to (manually) identify and document a 15-second interval in relevant videos where a humpback whale encounter is evident. 

We repeat a similar process for irrelevant videos, replacing manual interval annotation with a randomly generated 15-second interval. 

The 15-second intervals annotated here will serve as input for training our model. 

In [45]:
import pandas as pd
import numpy as np
import random
import datetime
import cv2

In [41]:
#load df of successful video downloads
# workspace_path is the data root; get_video_duration also reads video files from workspace_path + '/videos/'
workspace_path = '/mount/data'
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an index written by an earlier to_csv call — confirm before relying on it
downloads_df = pd.read_csv(workspace_path + '/downloaded_videos.csv')
# preview the first rows to confirm the load worked
downloads_df.head()

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,0:10,0:25
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,0:44,0:59
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,0:05,0:20
3,jfPzlFLEuKk,https://youtu.be/jfPzlFLEuKk,"Humpback Whale || Description, Characteristics...",creativeCommon,True,video_0003.mp4,2:40,2:55
4,xGasapMoy4I,https://youtu.be/xGasapMoy4I,Your Earth Is Blue: Humpback Whales,creativeCommon,True,video_0004.mp4,0:00,0:15


# Add a Column to Store Clip Interval 
(only run this once)

In [33]:
# downloads_df['clip_start'] = [''for i in range(0, len(downloads_df))]
# downloads_df['clip_end'] = ['' for i in range(0, len(downloads_df))]

# downloads_df.head()

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,,
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,,
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,,
3,jfPzlFLEuKk,https://youtu.be/jfPzlFLEuKk,"Humpback Whale || Description, Characteristics...",creativeCommon,True,video_0003.mp4,,
4,xGasapMoy4I,https://youtu.be/xGasapMoy4I,Your Earth Is Blue: Humpback Whales,creativeCommon,True,video_0004.mp4,,


In [81]:
# #add a duration column
# downloads_df['duration'] = [np.NaN for i in range(0, len(downloads_df))]

# for i, row in downloads_df.iterrows():
#     downloads_df.at[i, ('duration')] = get_video_duration(row['renamed_title'])
    

# Relevant Subclip Annotation

In [3]:
#functions to double check relevance of each new potential video
from IPython.display import YouTubeVideo, Image, display, Video, HTML

def display_video(index, video_title, url):
    '''prints an "<index>: <title>" header and the url, then embeds the YouTube player

    Inputs
        index: row label printed in front of the title
        video_title (string): title printed after the index
        url (string): video link; the 4th '/'-separated piece is used as the
                      YouTube id (ex. "https://youtu.be/<id>")

    '''
    header = "{}: {}".format(index, video_title)
    print(header, url, sep="\n")

    # "https://youtu.be/<id>" splits into ['https:', '', 'youtu.be', '<id>']
    url_pieces = url.split('/')
    display(YouTubeVideo(url_pieces[3]))

def get_occurence_intervals():
    '''prompts the user for the start and end time of the occurrence interval

    Output
        start (string): start of video clip, exactly as typed (ex. "00:10")
        end   (string): end of video clip, exactly as typed (ex. "00:25")

    '''
    # ask for the start first, then the end; answers are returned unmodified
    prompts = ('Clip Start Time: ', 'Clip End Time:   ')
    answers = tuple(input(prompt) for prompt in prompts)

    return answers

def _timestamp_to_seconds(timestamp):
    '''converts a single "ss", "m:ss" or "h:mm:ss" timestamp (string or int) to whole seconds'''
    total_seconds = 0
    # every ':' moves one unit up (seconds -> minutes -> hours), i.e. a factor of 60
    for field in str(timestamp).strip().split(':'):
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds

def get_split_in_seconds(start, end):
    '''converts clip start time and end time to seconds

    Accepts "m:ss" (ex. "1:55"), "h:mm:ss" (ex. "1:02:03") and plain seconds
    (ex. "117" or 117), so both the manually typed intervals and the
    randomly generated ones can be converted.

    Inputs
        start (string or int): start of video clip (ex. "1:55")
        end   (string or int): end of video clip (ex. "2:10")

    Output
        start_seconds (int): start of video clip in seconds (ex. 115)
        end_seconds   (int): end of video clip in seconds (ex. 130)

    Raises
        ValueError: if any ':'-separated field is not an integer

    '''
    return _timestamp_to_seconds(start), _timestamp_to_seconds(end)

In [38]:
#loop through relevant videos and annotate whale encounter intervals
print('How many videos do you want to annotate?:', end = " ")
num_videos = int(input())

#pickup on the next relevant video w/o a clip interval
video_indices_to_annotate = list(downloads_df[(downloads_df.clip_start.isna()) & (downloads_df.relevant == True)].index)

#only the videos handled in this run; clamping at 0 keeps a negative answer from slicing from the end of the list
annotated_indices = video_indices_to_annotate[:max(num_videos, 0)]

#annotate clip intervals and record in dataset
for i in annotated_indices:
    row = downloads_df.loc[i]
    display_video(i, row['renamed_title'], row['url'])
    clip_start, clip_end = get_occurence_intervals()
    downloads_df.at[i, ('clip_start')] = clip_start
    downloads_df.at[i, ('clip_end')] = clip_end

#report what was actually annotated; indexing the full pending list raised IndexError
#when nothing was pending or when fewer videos were left than requested
if annotated_indices:
    print(f'Done annotating videos {annotated_indices[0]} to {annotated_indices[-1]} \n')
else:
    print('No videos were annotated.')

In [35]:
#checking updates: last 4 relevant videos that already have a clip interval
annotated_relevant = downloads_df.clip_start.notna() & (downloads_df.relevant == True)
downloads_df.loc[annotated_relevant].tail(4)

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end
328,oTuPSptpvEw,https://youtu.be/oTuPSptpvEw,Un baleineau vient a notre rencontre,creativeCommon,True,video_0384.mp4,0:00,0:15
333,Z4YaphCbNcw,https://youtu.be/Z4YaphCbNcw,Honorary doctorate 2021: Prof. Peter L. Tyack ...,creativeCommon,True,video_0391.mp4,0:40,0:55
342,q2m2rUhrfm8,https://youtu.be/q2m2rUhrfm8,Channel Islands National Marine Sanctuary Over...,creativeCommon,True,video_0401.mp4,3:00,3:15
344,NCLpa6hQx-s,https://youtu.be/NCLpa6hQx-s,"Whales.#Киты -великаны,их величие.",creativeCommon,True,video_0403.mp4,0:05,0:20


In [36]:
# relevant videos that still have no clip interval
remaining_relevant = downloads_df[downloads_df.clip_start.isna() & (downloads_df.relevant == True)]
print(len(remaining_relevant), 'more relevant videos to annotate')

0 more relevant videos to annotate


In [None]:
# TODO: add code to convert the clip start and end timestamps to seconds

# Irrelevant Video Clip Annotation

Select a random 15-second interval to extract from each irrelevant video.

In [76]:
def get_video_duration(video_title, silent = True):
    '''gets the video's duration in seconds

    Inputs
        video_title (string): file name of the video inside workspace_path + '/videos/'
        silent (bool): currently unused; kept so existing calls keep working

    Output
        seconds (int): frame count divided by frames per second, truncated to whole seconds

    Raises
        ValueError: if OpenCV reports no usable frame rate for the file
    '''
    video_path = workspace_path + '/videos/' + video_title
    data = cv2.VideoCapture(video_path)

    try:
        frames = data.get(cv2.CAP_PROP_FRAME_COUNT)
        # keep the fractional frame rate: truncating e.g. 29.97 to 29 inflates the duration
        fps = data.get(cv2.CAP_PROP_FPS)
    finally:
        # free the capture even if a property lookup fails
        data.release()

    # "not fps > 0" also rejects NaN, which compares False against everything
    if not fps > 0:
        raise ValueError(f'could not read a frame rate for {video_path}')

    seconds = int(frames / fps)

    return seconds

def get_rand_interval(duration_secs, clip_secs = 15):
    '''gets a random clip_secs-long interval from video (in seconds).
       We get a random start time that is at least clip_secs seconds
       from the end of the video.

    Inputs
        duration_secs (int): length of the video in seconds
        clip_secs     (int): length of the interval to pick (default 15)

    Output
        start_seconds (int): start of the interval in seconds
        end_seconds   (int): end of the interval in seconds; when the video is
                             not longer than clip_secs the whole video
                             (0, duration_secs) is returned
    '''
    latest_start = duration_secs - clip_secs

    if latest_start > 0:
        # randint includes both ends, so the clip that finishes exactly at the
        # end of the video can be drawn too (randrange(0, latest_start) never picked it)
        start_seconds = random.randint(0, latest_start)
        end_seconds = start_seconds + clip_secs

    else:
        # video too short for a full clip: use all of it
        start_seconds = 0
        end_seconds = start_seconds + duration_secs

    return start_seconds, end_seconds

In [85]:
# annotate random 15 second interval for irrelevant video clips
# (only irrelevant videos that do not have a clip interval yet)
needs_interval = downloads_df.clip_start.isna() & (downloads_df.relevant == False)

for i, video_file in downloads_df.loc[needs_interval, 'renamed_title'].items():
    duration = get_video_duration(video_file)
    start_secs, end_secs = get_rand_interval(duration)

    # intervals are stored as plain seconds for these rows
    downloads_df.at[i, 'clip_start'] = start_secs
    downloads_df.at[i, 'clip_end'] = end_secs

print('Done generating clip intervals for irrelevant videos.')

[av1 @ 0x33eb800] Your platform doesn't suppport hardware accelerated AV1 decoding.
[av1 @ 0x33eb800] Failed to get pixel format.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] Missing Sequence Header.
[av1 @ 0x33eb800] video_get_buffer: image parameters invalid
[av1 @ 0x33eb800] get_buffer() failed
[av1 @ 0x33eb800] thread_get_buffer() fa

Done generating clip intervals for irrelevant videos.


In [88]:
# show the annotated table; the rendered output elides the middle rows with '...'
downloads_df

Unnamed: 0,video_id,url,title,license,relevant,renamed_title,clip_start,clip_end,duration
0,67OIlq2oMt0,https://youtu.be/67OIlq2oMt0,Hawaiian Islands Humpback Whale National Marin...,creativeCommon,True,video_0000.mp4,0:10,0:25,266.0
1,EUcMuUBMYJc,https://youtu.be/EUcMuUBMYJc,Tagging expedition: revealing the delicate nur...,creativeCommon,True,video_0001.mp4,0:44,0:59,269.0
2,dpSuygz7ZmA,https://youtu.be/dpSuygz7ZmA,Your Earth Is Blue: Disentangling a Humpback W...,creativeCommon,True,video_0002.mp4,0:05,0:20,102.0
3,jfPzlFLEuKk,https://youtu.be/jfPzlFLEuKk,"Humpback Whale || Description, Characteristics...",creativeCommon,True,video_0003.mp4,2:40,2:55,238.0
4,xGasapMoy4I,https://youtu.be/xGasapMoy4I,Your Earth Is Blue: Humpback Whales,creativeCommon,True,video_0004.mp4,0:00,0:15,83.0
...,...,...,...,...,...,...,...,...,...
363,K1RzTZI9p5U,https://youtu.be/K1RzTZI9p5U,El Secreto del Sueño: Un mito aborigen austral...,creativeCommon,False,video_0425.mp4,117,132,620.0
364,XIhVygewzsM,https://youtu.be/XIhVygewzsM,The World&#39;s Strangest Capital Cities,creativeCommon,False,video_0427.mp4,47,62,475.0
365,nYGZBz5iqe8,https://youtu.be/nYGZBz5iqe8,#HealyHustle | The Elephant Story - What do El...,creativeCommon,False,video_0428.mp4,139,154,189.0
366,uL8pTh8exNI,https://youtu.be/uL8pTh8exNI,하와이 캠핑 빅아일랜드 스펜서 비치파크 (Feat. 마할로의 시크릿 비치도 공개합니다),creativeCommon,False,video_0429.mp4,283,298,664.0


# Save Changes

In [89]:
# update csv with annotated intervals (same table written to both locations, without the index)
for csv_destination in (
    workspace_path + '/downloaded_videos.csv',
    '/workspace/youtube-humpback-whale-classifier/data/downloaded_videos.csv',
):
    downloads_df.to_csv(csv_destination, index=False)