In [1]:
# import youtube_dl
import yt_dlp
import re
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
WAV_DIR = 'wav_files/'
genre_dict = {
            '/m/064t9': 'Pop_music',
            '/m/0glt670': 'Hip_hop_music',
            '/m/06by7': 'Rock_music',
            '/m/06j6l': 'Rhythm_blues',
            '/m/06cqb': 'Reggae',
            '/m/0y4f8': 'Vocal',
            '/m/07gxw': 'Techno',
            }

genre_set = set(genre_dict.keys())

In [3]:
temp_str = []
with open('data-files/csv_files/unbalanced_train_segments.csv', 'r') as f:
    temp_str = f.readlines()

In [4]:
data = np.ones(shape=(1,4)) 
for line in tqdm(temp_str):
    line = re.sub('\s?"', '', line.strip())
    elements = line.split(',')
    common_elements = list(genre_set.intersection(elements[3:]))
    if  common_elements != []:
        data = np.vstack([data, np.array(elements[:3]
                                         + [genre_dict[common_elements[0]]]).reshape(1, 4)])

df = pd.DataFrame(data[1:], columns=['url', 'start_time', 'end_time', 'class_label'])

100%|██████████| 2041792/2041792 [01:04<00:00, 31694.62it/s]


In [5]:
print(df.shape)
df.head()
df['class_label'].value_counts()

(52107, 4)


class_label
Techno           17017
Pop_music         8489
Rock_music        8175
Hip_hop_music     7618
Rhythm_blues      4474
Vocal             3498
Reggae            2836
Name: count, dtype: int64

In [6]:
# Remove 10k Techno audio clips - to make the data more balanced
np.random.seed(10)
drop_indices = np.random.choice(df[df['class_label'] == 'Techno'].index, size=10000, replace=False)
df.drop(labels=drop_indices, axis=0, inplace=True)
df.reset_index(drop=True, inplace=False)

# Time to INT 
df['start_time'] = df['start_time'].map(lambda x: np.int32(float(x)))
df['end_time'] = df['end_time'].map(lambda x: np.int32(float(x)))

In [7]:
df['class_label'].value_counts()

class_label
Pop_music        8489
Rock_music       8175
Hip_hop_music    7618
Techno           7017
Rhythm_blues     4474
Vocal            3498
Reggae           2836
Name: count, dtype: int64

Example:<br>
Step 1:<br>
`ffmpeg -ss 5 -i $(youtube-dl -f 140 --get-url 'https://www.youtube.com/embed/---1_cCGK4M') -t 10 -c:v copy -c:a copy test.mp4`<br>
Starting time is 5 seconds, duration is 10s.

Refer: https://github.com/rg3/youtube-dl/issues/622

Step 2:<br>
`ffmpeg -i test.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 1 output.wav` <br>
PCM-16, 44k sampling, 1-channel (Mono)
<br>
Refer: https://superuser.com/questions/609740/extracting-wav-from-mp4-while-preserving-the-highest-possible-quality

In [8]:
for i, row in tqdm(df.iterrows()):
    url = "'https://www.youtube.com/embed/" + row['url'] + "'"
    file_name = str(i)+"_"+row['class_label']
    
    try:
        command_1 = "ffmpeg -ss " + str(row['start_time']) + " -i $(yt-dlp --verbose -f 140 --get-url " +\
                    url + ") -t 10 -c:v copy -c:a copy " + file_name + ".mp4"

        command_2 = "ffmpeg -i "+ file_name +".mp4 -vn -acodec pcm_s16le -ar 44100 -ac 1 " + WAV_DIR + file_name + ".wav"

        command_3 = 'rm ' + file_name + '.mp4' 

        # Run the 3 commands
        os.system(command_1 + ';' + command_2 + ';' + command_3 + ';')
    
    except:
        print(i, url)
        pass

0it [00:00, ?it/s][debug] Command-line config: ['--verbose', '-f', '140', '--get-url', 'https://www.youtube.com/embed/--1rvyPa8UM']
[debug] Encodings: locale UTF-8, fs utf-8, pref UTF-8, out utf-8 (No ANSI), error utf-8 (No ANSI), screen utf-8 (No ANSI)
[debug] yt-dlp version stable@2024.04.09 from yt-dlp/yt-dlp [ff0779267] (pip)
[debug] Python 3.10.13 (CPython arm64 64bit) - macOS-14.3-arm64-arm-64bit (OpenSSL 3.2.0 23 Nov 2023)
[debug] exe versions: ffmpeg 7.0 (setts), ffprobe 7.0
[debug] Optional libraries: Cryptodome-3.20.0, brotli-1.1.0, certifi-2024.02.02, mutagen-1.47.0, requests-2.31.0, sqlite3-3.44.2, urllib3-2.2.1, websockets-12.0
[debug] Proxy map: {}
[debug] Request Handlers: urllib, requests, websockets
[debug] Loaded 1810 extractors
[youtube] Extracting URL: https://www.youtube.com/embed/--1rvyPa8UM
[youtube] --1rvyPa8UM: Downloading webpage
[youtube] --1rvyPa8UM: Downloading ios player API JSON
[youtube] --1rvyPa8UM: Downloading android player API JSON
[youtube] --1rvyPa