In [1]:
import os
import sys
from collections import Counter
from time import sleep
import subprocess
import logging
import pandas as pd
import multiprocessing as mp

In [2]:
ONTOLOGY_PATH = "/Volumes/LittleJim/Datasets/AudioSet/ontology.json"
DATASET_PATH = "/Volumes/LittleJim/Datasets/AudioSet/unbalanced_train_segments.csv"
DOWNLOAD_DIR = "/Volumes/LittleJim/Datasets/youtube/"

In [3]:
ontology = pd.read_json(ONTOLOGY_PATH)

In [4]:
ontology = ontology.set_index('id')

In [5]:
genres = ontology[ontology.name.str.contains("genre")].child_ids.iloc[0]
moods = ontology[ontology.name.str.contains("mood")].child_ids.iloc[0]
instruments = ontology[ontology.name.str.contains("Musical instrument")].child_ids.iloc[0]
roles = ontology[ontology.name.str.contains("Music role")].child_ids.iloc[0]

In [6]:
genre_id = ontology[ontology.name.str.contains("genre")].index[0]
mood_id = ontology[ontology.name.str.contains("mood")].index[0]
instrument_id = ontology[ontology.name.str.contains("Musical instrument")].index[0]
role_id = ontology[ontology.name.str.contains("Music role")].index[0]

In [7]:
def ontology_leaves(root_id):
    children = []
    def dfs(current):
        child_ids = ontology.loc[current, "child_ids"]
        if not child_ids:
            children.append(current)
        else:
            for child in child_ids:
                dfs(child)
    dfs(root_id)
    return children

In [8]:
all_genres = ontology_leaves(genre_id)
all_moods = ontology_leaves(mood_id)
all_instruments = ontology_leaves(instrument_id)
all_roles = ontology_leaves(role_id)

In [9]:
genres_df = ontology[ontology.index.isin(all_genres)]
moods_df = ontology[ontology.index.isin(all_moods)]
instruments_df = ontology[ontology.index.isin(all_instruments)]
roles_df = ontology[ontology.index.isin(all_roles)]

In [10]:
for category in [genres_df, moods_df, instruments_df, roles_df]:
    print(category.name.values)

['Mantra' 'Pop music' 'Grime music' 'Trap music' 'Beatboxing'
 'Heavy metal' 'Punk rock' 'Grunge' 'Progressive rock' 'Rock and roll'
 'Psychedelic rock' 'Rhythm and blues' 'Soul music' 'Dub' 'Swing music'
 'Bluegrass' 'Funk' 'Folk music' 'Middle Eastern music' 'Jazz' 'Disco'
 'Opera' 'House music' 'Techno' 'Dubstep' 'Electro' 'Oldschool jungle'
 'Electronica' 'Electronic dance music' 'Drone music' 'Trance music'
 'Noise music' 'UK garage' 'Cumbia' 'Salsa music' 'Soca music' 'Kuduro'
 'Funk carioca' 'Flamenco' 'Blues' 'Music for children' 'New-age music'
 'A capella' 'Afrobeat' 'Kwaito' 'Gospel music' 'Carnatic music'
 'Music of Bollywood' 'Ska' 'Traditional music' 'Independent music']
['Happy music' 'Funny music' 'Sad music' 'Tender music' 'Exciting music'
 'Angry music' 'Scary music']
['Choir' 'Cowbell' 'Electric guitar' 'Bass guitar' 'Acoustic guitar'
 'Steel guitar, slide guitar' 'Tapping (guitar technique)' 'Strum' 'Banjo'
 'Sitar' 'Mandolin' 'Zither' 'Ukulele' 'Clavinet' 'Rhodes p

In [11]:
def split_and_set(string):
    return set(string.split(","))

def exactly_one_match(query, keys):
    """ Returns a matching key value iff exactly one of the query set is in the keys, otherwise pd.NA """
    matches = query.intersection(keys)
    return matches.pop() if len(matches) == 1 else pd.NA

In [12]:
dataset = pd.read_csv(DATASET_PATH, header=2, sep=",", skipinitialspace=True, quotechar='"')

In [13]:
dataset.positive_labels = dataset.positive_labels.map(split_and_set)

In [14]:
genres_set, moods_set, instruments_set, roles_set = map(lambda df: set(df.index), (genres_df, moods_df, instruments_df, roles_df))

In [15]:
def conditional_dataset(query_set):
    return dataset[~dataset.positive_labels.map(lambda lab: exactly_one_match(query_set, lab)).isna()]

In [16]:
genres_dataset, moods_dataset, instruments_dataset, roles_dataset = map(conditional_dataset, (genres_set, moods_set, instruments_set, roles_set))

In [17]:
instruments_dataset = instruments_dataset.sample(frac=1)

In [18]:
def download(url, filename, start, end):
    command = [
        "youtube-dl",
        "--no-overwrites",
        "-x",
        "--postprocessor-args",
        f'"-ss {start} -to {end}"',
        "-o",
        filename,
        f'"{url}"',
    ]
    to_run = " ".join(command)
    backoff = 0.5
    while True:
        stdout = !$to_run
        if any(["429:" in line for line in stdout]):
            print(stdout)
            sleep(backoff)
            backoff *= 2
        else:
            break
    return stdout

In [19]:
def write_batch_for_youtubedl(dataset_df):
    urls = list("https://www.youtube.com/watch?v=" + dataset_df["# YTID"])
    filenames = list(DOWNLOAD_DIR + dataset_df["# YTID"] + ".mp3")
    start = list(dataset_df["start_seconds"])
    end = list(dataset_df["end_seconds"])
    return zip(urls, filenames, start, end)

In [20]:
count = Counter(['done'])

In [21]:
def download_if_not_exists(url, filename, start, end):
    if count['done'] % 200 == 0:
        print(count['done'])
    if any(
        (os.path.exists(filename),
        os.path.exists(filename[:-3] + "opus"),
        os.path.exists(filename[:-3] + "m4a"),)
    ):
        count['done'] += 1
        return
    else:
        download(url, filename, start, end)
        count['done'] += 1
        return

In [22]:
list(map(lambda args: download_if_not_exists(*args), write_batch_for_youtubedl(instruments_dataset)));

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000
14200
14400
14600
14800
15000
15200
15400
15600
15800
16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
26600
26800
27000
27200
27400
27600
27800
28000
28200
28400
28600
28800
29000
29200
29400
29600
29800
30000
30200
30400
30600
30800
31000
31200
31400
31600
31800
32000
32200
32400
32600
32800
33000
33200
33400
33600
33800
34000
34200
34400
34600
34800
35000
352

In [149]:
write_batch_for_youtubedl(moods_dataset, 'moods', '.')

In [32]:
ontology[ontology.id.isin(moods)]

Unnamed: 0,id,name,description,citation_uri,positive_examples,child_ids,restrictions
334,/t/dd00031,Happy music,Music that evokes or conveys feelings of happi...,,[],[],[]
335,/t/dd00032,Funny music,Music that evokes or conveys amusement.,,[],[],[]
336,/t/dd00033,Sad music,Music that evokes or conveys feelings of sadness.,,[],[],[]
337,/t/dd00034,Tender music,Music that evokes or conveys feelings of tende...,,[],[],[]
338,/t/dd00035,Exciting music,Music that evokes or conveys feelings of excit...,,[],[],[]
339,/t/dd00036,Angry music,Music that evokes or conveys feelings of anger.,,[],[],[]
340,/t/dd00037,Scary music,Music that evokes or conveys feelings of fear.,,[],[],[]


In [39]:
ontology[ontology.id.isin(ontology[ontology.id.isin(instruments)].iloc[2, -2])]

Unnamed: 0,id,name,description,citation_uri,positive_examples,child_ids,restrictions
172,/m/05r5c,Piano,Sounds of a musical instrument played via a ro...,http://en.wikipedia.org/wiki/Piano,"[youtu.be/jETQAE_vBQI?start=30&end=40, youtu.b...",[/m/01s0ps],[]
176,/m/013y1f,Organ,Sounds of a keyboard instrument of one or more...,http://en.wikipedia.org/wiki/Organ_(music),"[youtu.be/ddoz52PLnnM?start=480&end=490, youtu...","[/m/03xq_f, /m/03gvt]",[]
179,/m/0l14qv,Synthesizer,Sounds associated with an electronic musical i...,http://en.wikipedia.org/wiki/Synthesizer,"[youtu.be/DQ5KwZ2qBMY?start=390&end=400, youtu...","[/m/01v1d8, /m/0gkd1]",[]
182,/m/03q5t,Harpsichord,Sounds of a keyboard-controlled instrument in ...,http://en.wikipedia.org/wiki/Harpsichord,"[youtu.be/3QyBsEwyI4g?start=230&end=240, youtu...",[],[]


In [34]:
ontology[ontology.id.isin(roles)]

Unnamed: 0,id,name,description,citation_uri,positive_examples,child_ids,restrictions
323,/m/025td0t,Background music,Styles of music or soundscapes primarily inten...,http://en.wikipedia.org/wiki/Background_music,[],[],[]
324,/m/02cjck,Theme music,Music often written specifically for a radio p...,http://en.wikipedia.org/wiki/Theme_music,[],[],[]
325,/m/03r5q_,Jingle (music),A short song or tune used in advertising and f...,http://en.wikipedia.org/wiki/Jingle,[],[],[]
326,/m/0l14gg,Soundtrack music,"Recorded music accompanying a movie, video, TV...",http://en.wikipedia.org/wiki/Soundtrack,[],[],[]
327,/m/07pkxdp,Lullaby,The act of singing a quiet song to lull a chil...,http://wordnetweb.princeton.edu/perl/webwn?s=l...,[],[],[]
328,/m/01z7dr,Video game music,The soundtrack that accompanies video games. E...,http://en.wikipedia.org/wiki/Video_game_music,[],[],[]
329,/m/0140xf,Christmas music,A variety of genres of music normally performe...,http://en.wikipedia.org/wiki/Christmas_music,[],[],[]
330,/m/0ggx5q,Dance music,Music composed specifically to facilitate or a...,http://en.wikipedia.org/wiki/Dance_music,[],[],[]
331,/m/04wptg,Wedding music,"Music played at wedding celebrations, includin...",http://en.wikipedia.org/wiki/Wedding_music,[],[],[]
332,/t/dd00029,Birthday music,Music specifically used or performed at birthd...,,[],[],[]
