# Description

Code to
- download youtube short videos and create zip archive
- extract audio files and create zip archive

In [1]:
### Installations ###
#####################

# scrape video ids from youtube channel
!pip install scrapetube

# download videos from youtube
!pip install git+https://github.com/pytube/pytube

Collecting scrapetube
  Downloading scrapetube-2.5.0-py3-none-any.whl (5.6 kB)
Installing collected packages: scrapetube
Successfully installed scrapetube-2.5.0
Collecting git+https://github.com/pytube/pytube
  Cloning https://github.com/pytube/pytube to /tmp/pip-req-build-wny3ft7b
  Running command git clone --filter=blob:none --quiet https://github.com/pytube/pytube /tmp/pip-req-build-wny3ft7b
  Resolved https://github.com/pytube/pytube to commit a32fff39058a6f7e5e59ecd06a7467b71197ce35
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytube
  Building wheel for pytube (setup.py) ... [?25l[?25hdone
  Created wheel for pytube: filename=pytube-15.0.0-py3-none-any.whl size=57580 sha256=0de5d5e24d59c4f26972908b955ae14123faaadb3233fa7fb4fd75d4df370ea0
  Stored in directory: /tmp/pip-ephem-wheel-cache-k8mytj3t/wheels/b0/a9/7d/d3579227a695fdd15288c35657b3332ef0d71430ca7f685769
Successfully built pytube
Installing collected packages: pytube
Succe

In [11]:
### Imports ###
###############

# general
import numpy as np
import pandas as pd
import shutil
import os
from tqdm import tqdm

# scrape video ids from youtube channel
import scrapetube

# download from youtube
from pytube import YouTube

# extract audio files
import moviepy.editor as mp

# check video and audio files
from IPython.display import Video
import IPython.display as ipd

In [3]:
### Set data directory
##################

# connect to drive
from google.colab import drive
drive.mount('/content/drive')

# set data directory
data_dir = '/content/drive/MyDrive/0_Masterarbeit/2_Pipelines/Data'

Mounted at /content/drive


In [7]:
### Select current brand channel to be scraped ###
##################################################

brand_name = 'porsche'

### Scrape shorts ids from youtube channel

In [8]:
### Dict of channel ids ###
###########################

# need to be extracted from scource html code in browser -> search for channelUrl

channel_id = {'gymshark': 'UCma7hhYJ3bfEhZgw3xl77ww',
              'gopro': 'UCqhnX4jA0A5paNd1v-zEysw',
              'redbull': 'UCblfuW_4rakIf2h6aqANefA',
              'monsterenergy': 'UCg1uYO329KcAEN-PQdoQMKQ',
              'hyundai': 'UC5f97D60yHa7UE9rFfbej8g',
              'porsche': 'UC_BaxRhNREI_V0DVXjXDALA',
              'audi': 'UCO5ujNeWRIwP4DbCZqZWcLw',
              'bmw': 'UCYwrS5QvBY_JbSdbINLey6Q',
              'vw': 'UC0US_GEXVmwMH04OMcNuhpQ',
              'mercedes': 'UClj0L8WZrVydk5xKOscI6-A',
              'honda': 'UC22zQ9nBEk6KOjUWqR5XXZg'}

In [9]:
### Scrape ids ###
##################

# initiate scraping function
shorts_ids_scraper = scrapetube.get_channel(channel_id[brand_name], content_type = 'shorts')

# extract ids, create urls and save in list
shorts_urls = ['https://www.youtube.com/shorts/' + v['videoId'] for v in shorts_ids_scraper]

# check number of extracted urls
len(shorts_urls)

48

### Download youtube short videos and create zip archive

In [13]:
### Download video files ###
############################

missing_url = []
os.makedirs('./Video')

for url in tqdm(shorts_urls):
  try:
    YouTube(url).streams.filter(progressive = True, file_extension='mp4').order_by('resolution').desc().first().download(filename = os.path.join('./Video', url[-11:]) + '.mp4')
  except:
    missing_url.append(url)

100%|██████████| 48/48 [01:51<00:00,  2.33s/it]


In [14]:
# check how many videos couldn't be downloaded
missing_url
len(missing_url)

44

In [None]:
# download the missing ones manually
mu = missing_url[11]
print(mu)
YouTube(mu).streams.filter(progressive = True, file_extension='mp4').order_by('resolution').desc().first().download(filename = os.path.join('./Video', mu[-11:]) + '.mp4')

https://www.youtube.com/shorts/1QAlkGRy-7o


'/content/./Video/1QAlkGRy-7o.mp4'

In [17]:
# check number of videos
vfs = os.listdir('./Video')
len(vfs)

4

In [None]:
# check if all videos are downloaded
(pd.Series([u[-11:] + '.mp4' for u in shorts_urls]).isin(vfs)).sum()

90

In [15]:
# display some videos
#Video(os.path.join('./Video', os.listdir('./Video')[0]), embed=True)

In [None]:
### Create zip archive and save ###
###################################

# zip the data folder
shutil.make_archive(f'./Video_{brand_name}', 'zip', './Video')

# save to gdrive
shutil.copy(f'./Video_{brand_name}.zip', data_dir)

'/content/drive/MyDrive/0_Masterarbeit/2_Pipelines/Data/Video_vw.zip'

### Save dataframe of shorts urls that could be downloaded

In [18]:
# create
downloaded_urls = [os.path.join('https://www.youtube.com/shorts', v[:-4]) for v in vfs]
shorts_ids_df = pd.DataFrame({'video_url': downloaded_urls})
shorts_ids_df.head()

Unnamed: 0,video_url
0,https://www.youtube.com/shorts/QPly0w436ZU
1,https://www.youtube.com/shorts/jV4tD8LHSLE
2,https://www.youtube.com/shorts/wHJrCm3BqjI
3,https://www.youtube.com/shorts/xEHze-I7KfI


In [19]:
# save as csv
shorts_ids_df.to_csv(f'./videos_{brand_name}.csv', index = False)

# save to gdrive
shutil.copy(f'./videos_{brand_name}.csv', data_dir)

### Extract Audio Files from video and create zip archive

In [None]:
### Extract audio from video ###
################################

os.makedirs('./Audio')
for video_file in os.listdir('./Video'):
  audio_file = os.path.join('./Audio', video_file[:-4]) + '.wav'

  with mp.VideoFileClip(os.path.join('./Video', video_file)) as video:
    video.audio.write_audiofile(audio_file, verbose = False)

MoviePy - Writing audio in ./Audio/wQ3e47sxAxc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/3SVo_-UBqMQ.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/YuWXYfF11pQ.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/18W7-R4YO6Y.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/-3hZEU7JEyA.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/kfBjC3RPkj8.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/w6zGjj4RKBE.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/dck00Zkl_iE.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/6gelvcyL72Q.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/8OVaXsG2z6U.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/7-G6MrxEnyE.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/r3tfxnfwTwU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/WJaocUu-75w.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/jpZJSwji1NY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/VnyjINuOnjI.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/MpkTik1_1g4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/y6rvoBLaThg.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/hpRQ_lxL_14.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/3-NYTP3CHew.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/VqYiY_PgDhk.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/S_cm7QdZ8_4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/8QWRcnRuhQo.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Vla2_uWkBHY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/thZ8Qj9ymJw.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/YmMiqBvPp_Q.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/3E-fdgNUXAc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/7UsBcQ2QRKo.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/kOry2KlLjOc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/W741uQ4kQ9A.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/N0M8B96vGuI.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/FWKvuroLRQw.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/1EVCMYWE2es.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/ZwYU_gCnC2Y.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/vKFa9x-ce_I.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/whSrwR7RXq8.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/apK9sr-mnT0.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/Qb5lhWvDL3U.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/RgBsmty5WXI.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/8b6PvEfN8yE.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/_nbOr8pkRps.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/eQJdmyvwqVs.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/lYybRBKyjas.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/jbGcX7E72N0.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/HoscYfFAqm8.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/ko70Hr3m2mY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/IS-4KkjBB9Q.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/r1ErAvUXJgA.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/inG5tqp0Yrw.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/XmRrJbtnkso.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/CoBvbCCtpKY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/BrUN0OKXs2s.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/sUHRCJXcvuY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/GTWc8yz-awg.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/9-CZXS2_RwM.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Z8I_SftqXow.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/oOMjg-ykKP0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/6LSmZBQL8s0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/QQzjdSVHV-4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/bt5R_qxLfPA.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/wx5bRaXf0QY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Uot3LZ_A8k8.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/vW-K7Srnerk.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/lWmFR6GFaAE.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/qf_sy_Rc2Uw.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/3mKgBDhodEU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Uhv0jlfp5sE.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/GW72TM5Ow5Y.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/-VMweFwAOX0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/6BMceg4_3ck.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/OwTA_J9okmU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/H8XAoHS9uWE.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Hrm_w9XUge0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Q_vKIANm8BQ.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/2fGVdFSuNVU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/byQGmzpHNmU.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/xpu9jY1GciM.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/PM_e4v10hj4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/6gcrqsc569s.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/S1ifIzxF0aU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/9kekQIc1Aew.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/6eyqnR-6OeU.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/USTv89smNBc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/--hx4ZXA0Xg.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/CiZQ_mSh0ak.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/FoaNJmn637E.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/VI5Oc84JcPc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/d-0q2g0Gzmk.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/DT_O8YLYQ7M.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Fiu00wqfTq0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/YWcAxBLjL5Q.wav


                                                                    

MoviePy - Done.




In [None]:
# check if number of audio files correct
afs = os.listdir('./Audio')
len(afs)

90

In [20]:
# listen to audio example
#ipd.Audio(os.path.join('./Audio', afs[21]))

In [None]:
### Create zip archive and save ###
###################################

# zip the data folder
shutil.make_archive(f'./Audio_{brand_name}', 'zip', './Audio')

# save to gdrive
shutil.copy(f'./Audio_{brand_name}.zip', data_dir)

'/content/drive/MyDrive/0_Masterarbeit/2_Pipelines/Data/Audio_vw.zip'