# 06 Getting Data

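In this notebook, we will go over two ways of getting audio data: downloading a track from YouTube with `pytube`, and preparing a labelled sample of the Ludwig music data with Hugging Face `datasets`.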

## Part 1 - YouTube

In [1]:
from pytube import YouTube, Search
from datasets import Audio, load_dataset
from IPython.display import Audio as player
from pprint import pprint
from glob import glob
import pandas as pd

In [None]:
# Handle for the video that the rest of the YouTube section works with.
# NOTE(review): use_oauth / allow_oauth_cache presumably trigger pytube's
# interactive OAuth login and cache the token locally — confirm before
# running this cell unattended.
romeo = YouTube("https://youtu.be/p7ff5EntWsE", use_oauth=True, allow_oauth_cache=True)
# Alternative video, currently unused:
# yt = YouTube("http://youtube.com/watch?v=2lAe1cqCOXo")

In [None]:
# Build a pytube search for the artist name.
s = Search("Romeo Santos")

In [None]:
# Number of results the search returned.
len(s.results)

In [None]:
# Thumbnail URL reported for the video.
romeo.thumbnail_url

In [None]:
# Print every stream available for the video before filtering them.
print(romeo.streams)

In [None]:
# Narrow the stream list down to adaptive, audio-only entries.
st = romeo.streams.filter(adaptive=True, only_audio=True)

In [None]:
# Materialise the filtered query so each matching stream is displayed.
list(st)

In [None]:
# Select a single stream by its itag.
# NOTE(review): itag 140 is presumably an audio-only m4a/AAC stream — confirm
# against the stream list for this video.
stream = romeo.streams.get_by_itag('140')

In [None]:
# Save the selected stream into the bachata playlist folder.
# NOTE(review): the file is named .mp3 regardless of the stream's actual
# container — confirm that the downstream audio loader can decode it.
stream.download(
        output_path="../data/yt_playlists/bachata/",
        filename="romeo_santos_suegra.mp3"
    )

In [None]:
# Location of the downloaded track, relative to the notebook directory so the
# notebook does not depend on one machine's home folder.
sample = "../data/yt_playlists/bachata/romeo_santos_suegra.mp3"

In [None]:
# Load the audio files under the playlists folder with the "audiofolder"
# builder, keeping only the "train" split.
suegra = load_dataset("audiofolder", data_dir="../data/yt_playlists/", split="train")
suegra

In [None]:
# Index the row first, then the column: `suegra['audio'][0]` would decode every
# audio file in the column just to keep the first one.
sample = suegra[0]['audio']
sample

In [None]:
# Play the clip at the rate it was decoded with; a hard-coded 44.1 kHz would
# shift pitch and speed for any file stored at a different rate.
player(
    sample['array'],
    rate=sample['sampling_rate'],
)

## Part 2 - Ludwig Data Prep

In [9]:
# Draw a reproducible 200-track sample from the Ludwig mp3 folder: load the
# "train" split, shuffle with a fixed seed, keep the first 200 rows.
music_data = (
    load_dataset("audiofolder", data_dir="../data/ludwig_music_data/mp3/", split="train")
    .shuffle(seed=42)
    .select(range(200))
)
music_data

Resolving data files:   0%|          | 0/11294 [00:00<?, ?it/s]

Found cached dataset audiofolder (/home/ramonperez/.cache/huggingface/datasets/audiofolder/default-f8fe78b3c849ea22/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


Dataset({
    features: ['audio', 'label'],
    num_rows: 200
})

In [3]:
def get_the_id(data):
    """Add an ``idx`` field holding the audio file's name without its ``.mp3`` extension.

    Args:
        data: A dataset row (mapping) whose ``data['audio']['path']`` is a
            "/"-separated file path.

    Returns:
        The same ``data`` mapping, updated in place with the new ``idx`` key.
    """
    file_name = data['audio']['path'].split("/")[-1]
    # Drop only a trailing ".mp3"; a blanket replace() would also remove the
    # substring when it appears in the middle of the name.
    if file_name.endswith(".mp3"):
        file_name = file_name[:-len(".mp3")]
    data['idx'] = file_name
    return data

In [10]:
# Preview the id derived from the first row's file name (same expression that
# get_the_id applies to each row).
music_data[0]['audio']['path'].split("/")[-1].replace(".mp3", '')

'2FFXB4QJmzuwtRxj2G5IUP'

In [11]:
# Run get_the_id over every row, which adds the idx column.
music_data = music_data.map(get_the_id)
music_data

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'label', 'idx'],
    num_rows: 200
})

In [12]:
# Distinct label values present in the sample.
music_data.to_pandas().label.unique()

array([4, 9, 0, 3, 6, 7, 2, 5, 1, 8])

In [13]:
# Read the labels JSON stored next to the mp3 folder.
labels = pd.read_json("../data/ludwig_music_data/labels.json")

In [14]:
# Look at one raw track record (position 80) to see how the fields are nested.
labels['tracks'].iloc[80]

{'otherSubgenres': {'L': [{'S': 'pop---vocal'}]},
 'artist': {'S': 'Céline Dion'},
 'aggressive': {'N': '0.00000274005969914'},
 'happy': {'N': '0.668238401413'},
 'party': {'N': '0.167581662536'},
 'preview': {'S': 'https://p.scdn.co/mp3-preview/9a5c7bfbce322406f608513f803a223fe9286d81?cid=05d3a336ed33492e9e4cca1d5d7a48bd'},
 'genre': {'S': 'pop'},
 'name': {'S': 'Another Year Has Gone By'},
 'relaxed': {'N': '0.964192867279'},
 'mbid': {'S': '6a923265-38b1-4535-946a-7fc6aa79359c'},
 'album': {'S': 'These Are Special Times'},
 'popularity': {'N': '40'},
 'acoustic': {'N': '0.618483126163'},
 'electronic': {'N': '0.0580986514688'},
 'subgenres': {'L': [{'S': 'pop---ballad'}]},
 'sad': {'N': '0.767486810684'},
 'PK': {'S': '014An8mZt6hTR6bsN9fo2u'},
 'type': {'S': 'train'}}

In [16]:
# Each field is wrapped in a one-entry mapping such as {'S': ...}; take the
# first value to unwrap the artist of the record at position 17.
list(labels['tracks'].iloc[17]['artist'].values())[0]

'Portishead'

In [17]:
# Index the latin tracks on disk: one row per mp3, with the id taken from the
# file name.
files = glob("../data/ludwig_music_data/mp3/latin/*.mp3")
ids = [path.split('/')[-1].replace(".mp3", '') for path in files]
music_df = pd.DataFrame({"ids": ids, "files": files})
music_df.head()

Unnamed: 0,ids,files
0,2PaETSKl3w3IdtLIbDnQXJ,../data/ludwig_music_data/mp3/latin/2PaETSKl3w...
1,3Cu37dl54yhg2ZPrEnTx0O,../data/ludwig_music_data/mp3/latin/3Cu37dl54y...
2,4RTRzqkcvvkvuMK5IpFLmS,../data/ludwig_music_data/mp3/latin/4RTRzqkcvv...
3,5A32KQZznC2HSqr9qzTl2N,../data/ludwig_music_data/mp3/latin/5A32KQZznC...
4,2uPQvR5WBOI22Wj2gwwiT5,../data/ludwig_music_data/mp3/latin/2uPQvR5WBO...


In [18]:
def get_metadata(x):
    """Extract artist, genre and name from one raw ``tracks`` record.

    Each field in the record is a one-entry mapping such as ``{'S': 'pop'}``;
    the first value of that mapping is used as the field's value.

    Args:
        x: A track record (mapping) with ``artist``, ``genre`` and ``name``
            entries. Malformed records (missing keys, non-mapping values,
            empty wrappers, or a non-subscriptable placeholder such as NaN)
            are tolerated.

    Returns:
        pd.Series indexed by ``artist``, ``genre`` and ``name``. If any of the
        three fields cannot be read, all three are set to ``"Unknown"``.
    """
    fields = ['artist', 'genre', 'name']
    try:
        values = [list(x[field].values())[0] for field in fields]
    except (KeyError, IndexError, TypeError, AttributeError):
        # Only data-shape problems count as "unknown"; a bare except would
        # also swallow KeyboardInterrupt/SystemExit and hide genuine bugs.
        values = ["Unknown"] * len(fields)
    return pd.Series(values, index=fields)

In [20]:
# Flatten every raw track record into artist/genre/name columns; reset_index
# moves the original index into a regular "index" column.
clean_labels = labels['tracks'].apply(get_metadata).reset_index()
clean_labels.head()

Unnamed: 0,index,artist,genre,name
0,000QWvZpHrBIVrW4dGbaVI,047,electronic,General Error
1,0010BnyFuw94XFautS2uJp,Jimmy Buffett,latin,La Vie Dansante
2,0055LRFB7zfdCXDGodyIz3,New Order,rock,Doubts Even Here
3,005Dlt8Xaz3DkaXiRJgdiS,Ricardo Arjona,rock,Historia de Taxi
4,006RpKEKItNO4q8TkAUpOv,Worrytrain,electronic,They Will Make My Passage Easy


In [21]:
# How often each track name occurs, including the "Unknown" fallback.
clean_labels.name.value_counts()

Unknown                       801
Intro                          42
You                            16
Hold On                        14
Home                           13
                             ... 
Everywhere That I'm Not         1
You Can Close Your Eyes         1
I Made It Through the Rain      1
The Green Maid                  1
Anytime, Anyday, Anywhere       1
Name: name, Length: 31303, dtype: int64

In [31]:
# Left-join the track metadata onto the audio sample (idx <-> index), then
# drop the join key that came from the metadata table.
music_data1 = (
    music_data.to_pandas()
    .merge(clean_labels, how="left", left_on='idx', right_on='index')
    .drop(columns="index")
)
music_data1.head()

Unnamed: 0,audio,label,idx,artist,genre,name
0,{'bytes': b'RIFF$\\(\x00WAVEfmt \x10\x00\x00\x...,4,2FFXB4QJmzuwtRxj2G5IUP,Umse,hip hop,Menschen
1,{'bytes': b'RIFF$\\(\x00WAVEfmt \x10\x00\x00\x...,9,3tPJiyCky7ILbEc3LezFkT,Nick Cave & The Bad Seeds,rock,Stagger Lee
2,{'bytes': b'RIFF$\\(\x00WAVEfmt \x10\x00\x00\x...,0,7JFVYtNn72Kg1mSs5foYKv,Be Good Tanyas,blues,The Littlest Birds
3,{'bytes': b'RIFF$\\(\x00WAVEfmt \x10\x00\x00\x...,3,6uSuDZyZminauHXtQNMFen,Marla Glen,funk / soul,The Cost of Freedom
4,{'bytes': b'RIFF$\\(\x00WAVEfmt \x10\x00\x00\x...,9,34w8Wslcni7dXe4SfVX8rK,Dethklok,rock,The Lost Vikings


In [24]:
from datasets import Dataset

In [32]:
# Convert back to a Hugging Face dataset. preserve_index=False keeps the pandas
# index out of the result directly; removing "__index_level_0__" afterwards
# raises when the frame has a plain RangeIndex and that column is never created.
music_data2 = Dataset.from_pandas(music_data1, preserve_index=False)
music_data2

Dataset({
    features: ['audio', 'label', 'idx', 'artist', 'genre', 'name'],
    num_rows: 200
})

In [33]:
# Declare the audio column as an Audio feature with a 16 kHz sampling rate.
# NOTE(review): decoding presumably resamples to this rate on access — confirm
# against the datasets documentation.
music_data3 = music_data2.cast_column('audio', Audio(sampling_rate=16000))

In [34]:
# Inspect the first row after the cast.
music_data3[0]

{'audio': {'path': None,
  'array': array([-1.87032768e-07, -4.16136459e-08,  4.78425022e-07, ...,
          1.23822384e-01,  2.40178227e-01,  0.00000000e+00]),
  'sampling_rate': 16000},
 'label': 4,
 'idx': '2FFXB4QJmzuwtRxj2G5IUP',
 'artist': 'Umse',
 'genre': 'hip hop',
 'name': 'Menschen'}

In [35]:
# Persist the prepared sample dataset to disk.
music_data3.save_to_disk('../data/sample')

Saving the dataset (0/2 shards):   0%|          | 0/200 [00:00<?, ? examples/s]