<a href="https://colab.research.google.com/github/sebi061/VideoAdEngagement/blob/main/0_Downloaded_Data/1_Video_and_audio_data_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description

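This notebook
- scrapes the URLs of the YouTube Shorts of a brand channel, downloads the videos and creates a zip archive
- saves the URLs of the downloaded videos as a csv file
- extracts the audio files from the videos and creates a zip archive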

In [None]:
### Installations ###
#####################

# scrape video ids from youtube channel
!pip install scrapetube

# download videos from youtube
!pip install git+https://github.com/pytube/pytube

Collecting scrapetube
  Downloading scrapetube-2.5.1-py3-none-any.whl (5.6 kB)
Installing collected packages: scrapetube
Successfully installed scrapetube-2.5.1
Collecting git+https://github.com/pytube/pytube
  Cloning https://github.com/pytube/pytube to /tmp/pip-req-build-quva3183
  Running command git clone --filter=blob:none --quiet https://github.com/pytube/pytube /tmp/pip-req-build-quva3183
  Resolved https://github.com/pytube/pytube to commit a32fff39058a6f7e5e59ecd06a7467b71197ce35
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pytube
  Building wheel for pytube (setup.py) ... [?25l[?25hdone
  Created wheel for pytube: filename=pytube-15.0.0-py3-none-any.whl size=57580 sha256=5ccfa6e8a2521d1d2fe208d1be5817d2ddbeea1fa66b5a0d18ccab2ec633dc6d
  Stored in directory: /tmp/pip-ephem-wheel-cache-m8b85p1n/wheels/b0/a9/7d/d3579227a695fdd15288c35657b3332ef0d71430ca7f685769
Successfully built pytube
Installing collected packages: pytube
Succe

In [None]:
### Imports ###
###############

# general
import numpy as np
import pandas as pd
import shutil
import os
from tqdm import tqdm

# scrape video ids from youtube channel
import scrapetube

# download from youtube
from pytube import YouTube

# extract audio files
import moviepy.editor as mp

# check video and audio files
from IPython.display import Video
import IPython.display as ipd

In [None]:
### Set data directory
##################

# connect to drive
# (Colab-only import; the drive is mounted under /content/drive)
from google.colab import drive
drive.mount('/content/drive')

# set data directory
# root folder on the mounted drive; later cells copy the zip archives and the
# csv file into subfolders of "0_Downloaded_Data" below this folder
save_dir = '/content/drive/MyDrive/VideoAdEngagement'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
### Select current brand channel to be scraped ###
##################################################

# must be one of the keys of the channel_id dict (it is used to look up the channel id);
# the name is also part of the file names of the saved zip archives and csv file
brand_name = 'porsche'

### Scrape shorts ids from youtube channel

In [None]:
### Dict of channel ids ###
###########################

# maps brand name -> youtube channel id
# the ids need to be extracted from the source html code of the channel page in the browser -> search for channelUrl

channel_id = {'gymshark': 'UCma7hhYJ3bfEhZgw3xl77ww',
              'gopro': 'UCqhnX4jA0A5paNd1v-zEysw',
              'redbull': 'UCblfuW_4rakIf2h6aqANefA',
              'monsterenergy': 'UCg1uYO329KcAEN-PQdoQMKQ',
              'hyundai': 'UC5f97D60yHa7UE9rFfbej8g',
              'porsche': 'UC_BaxRhNREI_V0DVXjXDALA',
              'audi': 'UCO5ujNeWRIwP4DbCZqZWcLw',
              'bmw': 'UCYwrS5QvBY_JbSdbINLey6Q',
              'vw': 'UC0US_GEXVmwMH04OMcNuhpQ',
              'mercedes': 'UClj0L8WZrVydk5xKOscI6-A',
              'honda': 'UC22zQ9nBEk6KOjUWqR5XXZg'}

In [None]:
### Scrape ids ###
##################

# scraper for the shorts of the selected brand channel
shorts_ids_scraper = scrapetube.get_channel(channel_id[brand_name], content_type = 'shorts')

# build one shorts url per scraped video id
shorts_urls = []
for short in shorts_ids_scraper:
  shorts_urls.append('https://www.youtube.com/shorts/' + short['videoId'])

# number of extracted urls
len(shorts_urls)

58

### Download youtube short videos and create zip archive

In [None]:
### Download video files ###
############################

# urls whose download failed (checked and retried manually in the following cells)
missing_url = []
# exist_ok=True so that re-running the cell does not raise FileExistsError
os.makedirs('./Video', exist_ok=True)

for url in tqdm(shorts_urls):
  # the last 11 characters of the url (the video id) are used as file name
  video_path = os.path.join('./Video', url[-11:]) + '.mp4'
  try:
    # progressive mp4 streams, sorted by resolution in descending order -> take the first one
    stream = YouTube(url).streams.filter(progressive = True, file_extension='mp4').order_by('resolution').desc().first()
    stream.download(filename = video_path)
  except Exception as err:
    # Exception instead of a bare except: interrupting the kernel (KeyboardInterrupt)
    # still stops the loop, while failed downloads are recorded together with the reason
    missing_url.append(url)
    tqdm.write(f'download failed for {url}: {err!r}')

100%|██████████| 58/58 [04:59<00:00,  5.17s/it]


In [None]:
# check how many videos couldn't be downloaded
# only the last expression of a cell is displayed, so the list itself has to be printed explicitly
print(missing_url)
len(missing_url)

0

In [None]:
### download the missing ones manually -> sometimes multiple attempts help (depending on the state of pytube)
# (uncomment to retry the first url in missing_url; change the index for the other ones)
# mu = missing_url[0]
# print(mu)
# YouTube(mu).streams.filter(progressive = True, file_extension='mp4').order_by('resolution').desc().first().download(filename = os.path.join('./Video', mu[-11:]) + '.mp4')

In [None]:
# check number of videos
# vfs = names of all entries in ./Video; reused below to check the downloads and to build the url dataframe
vfs = os.listdir('./Video')
len(vfs)

58

In [None]:
# check if all videos are downloaded:
# count the expected file names (last 11 characters of the url + '.mp4') that exist in ./Video
expected_files = pd.Series([u[-11:] + '.mp4' for u in shorts_urls])
expected_files.isin(vfs).sum()

58

In [None]:
# display some videos
# (uncomment to embed the first listed file of ./Video in the notebook)
#Video(os.path.join('./Video', os.listdir('./Video')[0]), embed=True)

In [None]:
### Create zip archive and save ###
###################################

# zip the data folder
shutil.make_archive(f'./Video_{brand_name}', 'zip', './Video')

# save to gdrive
# create the target folder first: if it is missing, shutil.copy either fails or
# writes a file named like the folder instead of copying the archive into it
video_target_dir = os.path.join(save_dir, "0_Downloaded_Data", "1_Raw_Video_Data")
os.makedirs(video_target_dir, exist_ok=True)
shutil.copy(f'./Video_{brand_name}.zip', video_target_dir)

'/content/drive/MyDrive/VideoAdEngagement/0_Downloaded_Data/1_Raw_Video_Data/Video_porsche.zip'

### Save dataframe of shorts urls that could be downloaded

In [None]:
# create
# - urls are built with string formatting: os.path.join is meant for file system
#   paths and inserts the path separator of the operating system
# - only .mp4 files are used (os.listdir can return other entries as well) and the
#   extension is removed with os.path.splitext instead of cutting a fixed number of characters
downloaded_urls = [f'https://www.youtube.com/shorts/{os.path.splitext(v)[0]}' for v in vfs if v.endswith('.mp4')]
shorts_ids_df = pd.DataFrame({'video_url': downloaded_urls})
shorts_ids_df.head()

Unnamed: 0,video_url
0,https://www.youtube.com/shorts/YQEqtLoif2o
1,https://www.youtube.com/shorts/a6nnck41MKA
2,https://www.youtube.com/shorts/RUi_uKEp6lM
3,https://www.youtube.com/shorts/YweEKAXaInY
4,https://www.youtube.com/shorts/Yf0a0zqY-cU


In [None]:
# save as csv
shorts_ids_df.to_csv(f'./videos_{brand_name}.csv', index = False)

# save to gdrive
# create the target folder first: if it is missing, shutil.copy either fails or
# writes a file named like the folder instead of copying the csv into it
urls_target_dir = os.path.join(save_dir, "0_Downloaded_Data", "0_Video_Ad_Urls")
os.makedirs(urls_target_dir, exist_ok=True)
shutil.copy(f'./videos_{brand_name}.csv', urls_target_dir)

'/content/drive/MyDrive/VideoAdEngagement/0_Downloaded_Data/0_Video_Ad_Urls/videos_porsche.csv'

### Extract Audio Files from video and create zip archive

In [None]:
### Extract audio from video ###
################################

# exist_ok=True so that re-running the cell does not raise FileExistsError
os.makedirs('./Audio', exist_ok=True)

# video files for which no audio could be extracted (clip has no audio)
videos_without_audio = []

for video_file in tqdm(os.listdir('./Video')):
  # skip everything that is not a downloaded video file
  if not video_file.endswith('.mp4'):
    continue

  # same file name as the video, but with .wav extension
  audio_file = os.path.join('./Audio', os.path.splitext(video_file)[0]) + '.wav'

  with mp.VideoFileClip(os.path.join('./Video', video_file)) as video:
    # without this check a clip without audio would raise an AttributeError on None
    if video.audio is None:
      videos_without_audio.append(video_file)
      continue
    # logger = None additionally switches off the MoviePy messages / progress bars
    # per file; the progress is shown by the tqdm bar of the loop instead
    video.audio.write_audiofile(audio_file, verbose = False, logger = None)

MoviePy - Writing audio in ./Audio/YQEqtLoif2o.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/a6nnck41MKA.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/RUi_uKEp6lM.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/YweEKAXaInY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Yf0a0zqY-cU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/q_XGF9wQA0s.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/2DKZIleqOSg.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/epqeHyg06mU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/GLWgNHjiFXY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/lhkpVUB-omw.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/HIO7R7WrQnk.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/_MeBQyqf888.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/BiyEuiqtVNs.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/KYeShivFC38.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/QPly0w436ZU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/SanMXolBVjY.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/qrReywyb3Yw.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/jV4tD8LHSLE.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/hWHTTuEhjN8.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/TspJbmhJveg.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/2xsUBcgQT_Y.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/ysBvQRc8sKA.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/F6TpEb6peRc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/KWHUoVF4ON0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/oUdtDstNnJA.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/aszlqn4k6_s.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/xEHze-I7KfI.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Sb4axb02nnQ.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/v5zf-ld4HTo.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/gkjJEn2hN8Y.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/VKD0M2xzoNQ.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/wHJrCm3BqjI.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/puX5V0Qk5hg.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/KnS9enCh2JI.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/UcWJjJgD1Bg.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/ZI3KJGfJpIk.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/X282GWdRI9k.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/8QLXS6z4jQc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/o9KjbupIpKM.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in ./Audio/TbuG4Hybg1k.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/lF6feWtyyhc.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/yiIT7M757u4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/T4ikpe5DevM.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/aaYchECtLkE.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/CjHSB9nAChA.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/2IiEZ-UgL8E.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/4r3oUo5v0A8.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/azRxaFN-Sbo.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/123S66s_AS8.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/QwbcvvFJk1o.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/Cbeu06a4tcM.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/adTzHTXI2iU.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/8Y9feN1DWi4.wav


                                                                    

MoviePy - Done.




MoviePy - Writing audio in ./Audio/qSERBorSqj0.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/JowZED5_gq4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/G9gM82tqCcw.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/9lVFr5xniS4.wav




MoviePy - Done.
MoviePy - Writing audio in ./Audio/MaxVKhZvEJ8.wav


                                                                      

MoviePy - Done.




In [None]:
# check if number of audio files correct
# (afs = names of all entries in ./Audio; the count should match the number of videos)
afs = os.listdir('./Audio')
len(afs)

58

In [None]:
# listen to audio example
# (uncomment to play one of the extracted audio files in the notebook)
#ipd.Audio(os.path.join('./Audio', afs[21]))

In [None]:
### Create zip archive and save ###
###################################

# zip the data folder
shutil.make_archive(f'./Audio_{brand_name}', 'zip', './Audio')

# save to gdrive
# create the target folder first: if it is missing, shutil.copy either fails or
# writes a file named like the folder instead of copying the archive into it
audio_target_dir = os.path.join(save_dir, "0_Downloaded_Data", "2_Raw_Audio_Data")
os.makedirs(audio_target_dir, exist_ok=True)
shutil.copy(f'./Audio_{brand_name}.zip', audio_target_dir)

'/content/drive/MyDrive/VideoAdEngagement/0_Downloaded_Data/2_Raw_Audio_Data/Audio_porsche.zip'