In [1]:
import os
import math
import cv2
import pytube as pyt
import time
from datetime import timedelta
import numpy as np
import pandas as pd
#Needs ffmpeg for audio, I think you can add that to AWS directly, if not just zip executable I think

In [2]:
class YoutubePlaylist:
    """class with all the code, initialize with the playlist link, a name,and a folderpath,  as str"""
    def __init__(self,link,name,MainFolder):
        self.Playlist=pyt.Playlist(link)
        self.name=name
        self.streams=[]
        self.MainFolder=MainFolder
        self.mp4folder=None
        self.mp4names=[]
        self.fps=[]
        self.videonames=[]
        self.FrameFolder=None
        self.AudioFolder=None
        if not os.path.exists(self.MainFolder):
            os.mkdir(self.MainFolder)
    #run playlist_download before anything else

    def playlist_download(self,mp4folder='/mp4',start=0,stop=False):
        """Downloads all videos from the linked playlist to a folder Defined by mp4folder. name is for the file names"""
        if not os.path.exists(self.MainFolder+mp4folder):
            os.mkdir(self.MainFolder+mp4folder)
        self.mp4folder=mp4folder
        if not stop:
            rnge=self.Playlist.videos[start:]
        else:
            rnge=self.Playlist.videos[start:stop]
        for number, video in enumerate(rnge):
            try:
                a=video.streams.filter(file_extension='mp4').first()
            except (VideoUnavailable, HTTPError, ConnectionError,RemoteDisconnected, ConnectTimeout, InvalidURL, ProtocolError, AgeRestrictedError, VideoRegionBlocked, LiveStreamError, RecordingUnavailable, MembersOnly, VideoPrivate, Exception):
                try:
                    a=video.streams.filter(file_extension='mp4').first()
                except (VideoUnavailable, HTTPError, ConnectionError,RemoteDisconnected, ConnectTimeout, InvalidURL, ProtocolError, AgeRestrictedError, VideoRegionBlocked, LiveStreamError, RecordingUnavailable, MembersOnly, VideoPrivate, Exception):
                    try:
                        a=video.streams.filter(file_extension='mp4').first()  
                    except:
                        print('Bruh')
            self.streams.append(a)
            self.mp4names.append(f'{self.name}_video{number}.mp4') 
            self.streams[number].download(self.MainFolder+mp4folder,filename=f'{self.name}_video{number}.mp4') #downloads mp4 to folder 
            self.fps.append(self.streams[number].fps)
            self.videonames.append(video.title)
        

    def frame(self,videonumber,time,FrameFolder='/Frames'):
        """downloads single frame from a video. Inputs video number from the playlist, and time in the video in seconds.
        This is mainly for convenience/troubleshooting, should probably just get from whole playlist at once"""
        self.FrameFolder=FrameFolder #be consistent with FrameFolder between functions, didn't think that through
        if not os.path.exists(self.MainFolder+self.FrameFolder):
            os.mkdir(self.MainFolder+self.FrameFolder)
        cam = cv2.VideoCapture(self.MainFolder+self.mp4folder+'/'+self.mp4names[videonumber])
        maxframes=cam.get(7)
        frame_viewed=math.floor(time*self.fps[videonumber])
        if frame_viewed>maxframes:
            print ("max frames exceeded")
        cam.set(1, frame_viewed)
        ret,frame = cam.read()
        filename=f'/{self.name:.3}_video{videonumber:02}_frame{frame_viewed:08}.jpg'
        cv2.imwrite(self.MainFolder+FrameFolder+filename,frame) #might be better format than jpg
        
    def bunch_frame(self,videonumber,times,FrameFolder='/Frames'):
        """same as frame_download_video but takes in list for times (in secodns) instead. Could easily combine them but too lazy now"""
        self.FrameFolder=FrameFolder
        for time in times:
            self.frame(videonumber,time,FrameFolder)
            
    def playlist_frame(self,timestep=5,FrameFolder='/Frames'):
        """downloads frames from each video in playlist at every time step, input timestep and FrameFolder"""
        for videonumber in range(np.size(self.streams)):
            cam = cv2.VideoCapture(self.MainFolder+self.mp4folder+'/'+self.mp4names[videonumber])
            maxtime=cam.get(7)/self.fps[videonumber]
            times=np.arange(0,maxtime,timestep)
            self.bunch_frame(videonumber,times,FrameFolder)
            
    def sound(self,videonumber,timestart,timestop,AudioFolder='/Audio'):
        """Takes videonumber, timestart and stop in seconds, AudioFolder, downloads that as wav"""
        self.AudioFolder=AudioFolder
        source=self.MainFolder+self.mp4folder+'/'+self.mp4names[videonumber]
        cam = cv2.VideoCapture(source)
        maxtime=cam.get(7)/self.fps[videonumber]
        if not os.path.exists(self.MainFolder+AudioFolder):
            os.mkdir(self.MainFolder+AudioFolder)
        start=str(timedelta(seconds=int(timestart)))
        stop= str(timedelta(seconds=int(timestop)))
        if timestop>maxtime:
            print("max time exceeded")
        else:
            audioname=self.MainFolder+AudioFolder+f'/{self.name:.3}_video{videonumber:02}_Audio{timestart*self.fps[videonumber]:08}.wav'
            command=' ffmpeg -y -ss {} -t {} -i {} -ac 2 -f wav {}'.format(start,stop,source,audioname)
            os.system(command)
    def bunch_sound(self,videonumber,timegap=600,timelength=5,starttrim=0,endtrim=0,AudioFolder='/Audio'):
        """Downloads wav of length timelength every timegap from videonumber,can trim off start and end, use integers to be safe idk"""
        source=self.MainFolder+self.mp4folder+'/'+self.mp4names[videonumber]
        cam = cv2.VideoCapture(source)
        maxtime=cam.get(7)/self.fps[videonumber]
        for timestart in range(int(starttrim),int(maxtime-endtrim-timelength-1),int(timegap)):
            self.sound(videonumber,timestart,timestart+timelength,AudioFolder=AudioFolder)
    def playlist_sound(self,timegap=600,timelength=5,starttrim=0,endtrim=0,AudioFolder='/Audio'):
        """"Does what bunch_sound does except interated through every video"""
        parameters=(timegap,timelength,starttrim,endtrim,AudioFolder)
        for videonumber in range(np.size(self.streams)):
            self.bunch_sound(videonumber,*parameters)
        
        

## Set test to True when trying this out. Setting it to False will download all the playlists and take more frequent sampling of time, which will take a while and a lot of space

In [39]:
Test=True
Animals=['dog', 'bird', 'cow', 'cat', 'sheep', 'lion', 'snake' ,'elephant', 'pig','monkey']
Links=['https://www.youtube.com/playlist?list=PLEw5hl9-ByjK0GegNAyxTmyJnaChDLUBn',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjISKxRNWsqIZHP6GJrQdDb0',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjJYtcLEmEJdYkhaybovwseu',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjIYk9eSScf04vh8rHhK_vvv',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjLraRvQLhMArkm9z3vaWwjj',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjL3s1r80NaLm1A5qT4U-acx,',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjKNoS90ftpT9YwX1sg6PFc_',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjKZILh4-T082Z15Oar32AJE',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjL1RGnNiO5X_EHiw-EV0lo-',
       'https://www.youtube.com/playlist?list=PLEw5hl9-ByjJpLDDrR52_lcGRTTthX-oU']
timegap=10
if Test:
    Animals=[Animals[0],Animals[3]]
    Links=[Links[0],Links[3]]
    timegap=300
things=[]
for i,link in enumerate(Links):
    things.append(YoutubePlaylist(link,Animals[i],f'./{Animals[i]}'))
    things[i].playlist_download(start=0,stop=1)
    things[i].playlist_frame(timestep=600)
    things[i].playlist_sound(timegap=timegap)

In [41]:
#WIP, need to change this format slightly
fm=[]
ad=[]
name=[]
for animal in Animals:
    filepath='./'+animal
    frame_files=os.listdir(filepath+'/Frames')
    audio_files=os.listdir(filepath+'/Audio')
    for i in frame_files:
        for j in audio_files:
            fm.append(i)
            ad.append(j)
            name.append(animal)
d = {'Name':name, 'Audio': ad,'Image':fm}
df=pd.DataFrame(data=d)
df.to_csv('./Data.csv',sep='|')

FileNotFoundError: [Errno 2] No such file or directory: './dog/Frames'