# Split MedleyDB 

This notebook splits the MedleyDB dataset into train, validation and test sets, using only the multitracks that contain vocal melodies.

I used the MedleyDB API to inspect the multitracks and build the subsets.
The main dependency is:

 - MedleyDB API: [https://github.com/marl/medleydb](https://github.com/marl/medleydb)

In [1]:
import medleydb as mdb

# Ask the MedleyDB API for every multitrack it can load ...
mtrack_generator = mdb.load_all_multitracks()

# ... and keep only the identifier of each one.
all_tracks_id = [loaded.track_id for loaded in mtrack_generator]



In [2]:
# Fetch the instrument labels that MedleyDB accepts, to see which ones
# describe a singing voice.
instruments = mdb.get_valid_instrument_labels()
print(instruments)

{'gong', 'conga', 'trombone section', 'bass drum', 'recorder', 'violin section', 'brass section', 'banjo', 'electric bass', 'vocalists', 'trumpet', 'dulcimer', 'electronic organ', 'male rapper', 'chimes', 'harmonica', 'pipe organ', 'piano', 'male screamer', 'male singer', 'drum machine', 'clean electric guitar', 'maracas', 'bass clarinet', 'acoustic guitar', 'baritone saxophone', 'crowd', 'panpipes', 'bagpipe', 'claps', 'triangle', 'french horn', 'oboe', 'Main System', 'sleigh bells', 'erhu', 'beatboxing', 'female screamer', 'tuba', 'auxiliary percussion', 'cornet', 'bandoneon', 'darbuka', 'dilruba', 'kick drum', 'rattle', 'gu', 'male speaker', 'tack piano', 'theremin', 'melodica', 'harp', 'harmonium', 'zhongruan', 'flute', 'trombone', 'viola', 'female singer', 'cabasa', 'accordion', 'alto saxophone', 'bassoon', 'liuqin', 'harpsichord', 'ukulele', 'tabla', 'slide guitar', 'synthesizer', 'guiro', 'snare drum', 'tenor saxophone', 'double bass', 'piccolo', 'xylophone', 'euphonium', 'whist

In [3]:
# Probe a single multitrack to see what the instrument annotation of a
# melody stem looks like.
mtrack1 = mdb.MultiTrack('LizNelson_Rainfall')
first_melody_stem = mtrack1.melody_stems()[0]
print(first_melody_stem.instrument)

['female singer']


In [4]:
# Keep the multitracks in which at least one melody stem carries a
# singing-voice instrument label.
print('== List of musics with singing voice ==')
search_for = ['female singer', 'male singer', 'vocalists', 'choir']
vocal_tracks_id = []
for track_id in all_tracks_id:
    mtrack = mdb.MultiTrack(track_id)
    # A stem's `instrument` is a list of labels (e.g. ['female singer']),
    # so every label of every melody stem is checked.
    has_voice = any(
        label in search_for
        for stem in mtrack.melody_stems()
        for label in stem.instrument
    )
    if has_voice:
        vocal_tracks_id.append(track_id)
        print(track_id)

== List of musics with singing voice ==
AClassicEducation_NightOwl
AimeeNorwich_Child
AlexanderRoss_GoodbyeBolero
AlexanderRoss_VelvetCurtain
Auctioneer_OurFutureFaces
AvaLuna_Waterduct
BigTroubles_Phantom
BrandonWebster_DontHearAThing
BrandonWebster_YesSirICanFly
CelestialShore_DieForUs
ClaraBerryAndWooldog_AirTraffic
ClaraBerryAndWooldog_Boys
ClaraBerryAndWooldog_Stella
ClaraBerryAndWooldog_TheBadGuys
ClaraBerryAndWooldog_WaltzForMyVictims
Creepoid_OldTree
Debussy_LenfantProdigue
DreamersOfTheGhetto_HeavyLove
FacesOnFilm_WaitingForGa
FamilyBand_Again
Handel_TornamiAVagheggiar
HeladoNegro_MitadDelMundo
HezekiahJones_BorrowedHeart
HopAlong_SisterCities
InvisibleFamiliars_DisturbingWildlife
LizNelson_Coldwar
LizNelson_ImComingHome
LizNelson_Rainfall
MatthewEntwistle_DontYouEver
MatthewEntwistle_Lontano
Meaxic_TakeAStep
Meaxic_YouListen
Mozart_BesterJungling
Mozart_DiesBildnis
MusicDelta_80sRock
MusicDelta_Beatles
MusicDelta_Britpop
MusicDelta_Country1
MusicDelta_Country2
MusicDelta_Disc

In [5]:
# Report how many of the loaded multitracks were kept as vocal tracks
n_all_tracks = len(all_tracks_id)
n_vocal_tracks = len(vocal_tracks_id)
print("MedleyDB has", n_all_tracks, "multitracks files,", n_vocal_tracks, "have singing voice.")

MedleyDB has 122 multitracks files, 61 have singing voice.



### Split into train and test sets

In [7]:
# Number of independent train/test splits to generate. It is passed as
# `num_splits` to the splitter and reused as the loop bound when each
# split's train set is divided into train/validation.
splits = 10

In [8]:
# Build `splits` train/test partitions of the vocal tracks with MedleyDB's
# artist-conditional splitter; random_state is pinned to a fixed seed.
vocal_split = mdb.utils.artist_conditional_split(
    trackid_list=vocal_tracks_id,
    test_size=0.20,
    num_splits=splits,
    random_state=8526325,
)

In [9]:
# Show the train partition of the first split together with its size
first_train = vocal_split[0]['train']
print(first_train, "\nThere are", len(first_train), "songs on train set")

['AClassicEducation_NightOwl', 'AimeeNorwich_Child', 'AlexanderRoss_GoodbyeBolero', 'AlexanderRoss_VelvetCurtain', 'Auctioneer_OurFutureFaces', 'AvaLuna_Waterduct', 'BigTroubles_Phantom', 'BrandonWebster_DontHearAThing', 'BrandonWebster_YesSirICanFly', 'ClaraBerryAndWooldog_AirTraffic', 'ClaraBerryAndWooldog_Boys', 'ClaraBerryAndWooldog_Stella', 'ClaraBerryAndWooldog_TheBadGuys', 'ClaraBerryAndWooldog_WaltzForMyVictims', 'Creepoid_OldTree', 'Debussy_LenfantProdigue', 'DreamersOfTheGhetto_HeavyLove', 'FacesOnFilm_WaitingForGa', 'FamilyBand_Again', 'Handel_TornamiAVagheggiar', 'HeladoNegro_MitadDelMundo', 'HezekiahJones_BorrowedHeart', 'HopAlong_SisterCities', 'LizNelson_Coldwar', 'LizNelson_ImComingHome', 'LizNelson_Rainfall', 'MatthewEntwistle_DontYouEver', 'MatthewEntwistle_Lontano', 'Meaxic_TakeAStep', 'Meaxic_YouListen', 'Mozart_BesterJungling', 'Mozart_DiesBildnis', 'MusicDelta_80sRock', 'MusicDelta_Beatles', 'MusicDelta_Britpop', 'MusicDelta_Disco', 'MusicDelta_Grunge', 'MusicDelt

In [10]:
# Show the test partition of the first split together with its size
first_test = vocal_split[0]['test']
print(first_test, "\nThere are", len(first_test), "songs on test set")

['CelestialShore_DieForUs', 'InvisibleFamiliars_DisturbingWildlife', 'MusicDelta_Country1', 'MusicDelta_Country2', 'MusicDelta_Gospel', 'MusicDelta_Rock', 'PortStWillow_StayEven', 'Snowmine_Curfews', 'StrandOfOaks_Spacestation', 'SweetLights_YouLetMeDown'] 
There are 10 songs on test set


In [16]:
# Inspect one complete split (index 3): a dict holding the 'test' and 'train' track-id lists
print(vocal_split[3])

{'test': ['AimeeNorwich_Child', 'ClaraBerryAndWooldog_AirTraffic', 'ClaraBerryAndWooldog_Boys', 'ClaraBerryAndWooldog_Stella', 'ClaraBerryAndWooldog_TheBadGuys', 'ClaraBerryAndWooldog_WaltzForMyVictims', 'Debussy_LenfantProdigue', 'FamilyBand_Again', 'HopAlong_SisterCities', 'LizNelson_ImComingHome', 'Mozart_DiesBildnis', 'MusicDelta_Punk', 'PortStWillow_StayEven', 'Schubert_Erstarrung', 'SecretMountains_HighHorse', 'StrandOfOaks_Spacestation'], 'train': ['AClassicEducation_NightOwl', 'AlexanderRoss_GoodbyeBolero', 'AlexanderRoss_VelvetCurtain', 'Auctioneer_OurFutureFaces', 'AvaLuna_Waterduct', 'BigTroubles_Phantom', 'BrandonWebster_DontHearAThing', 'BrandonWebster_YesSirICanFly', 'CelestialShore_DieForUs', 'Creepoid_OldTree', 'DreamersOfTheGhetto_HeavyLove', 'FacesOnFilm_WaitingForGa', 'Handel_TornamiAVagheggiar', 'HeladoNegro_MitadDelMundo', 'HezekiahJones_BorrowedHeart', 'InvisibleFamiliars_DisturbingWildlife', 'LizNelson_Coldwar', 'LizNelson_Rainfall', 'MatthewEntwistle_DontYouEver

### Split the train set into train/validation sets

In [18]:
# Divide each split's train set into train/validation, using the same
# artist-conditional splitter (and seed) as for the train/test split.
for spl in range(splits):
    fold = vocal_split[spl]

    # Idempotency guard: this cell rewrites vocal_split in place, so running
    # it a second time would otherwise split the already-reduced train set
    # again and overwrite the validation set produced by the first run.
    if 'validation' not in fold:
        vocal_train_split = mdb.utils.artist_conditional_split(
            trackid_list=fold['train'], test_size=0.20,
            num_splits=1, random_state=8526325)

        # The inner 'test' partition becomes this split's validation set;
        # the outer 'test' list of the split is left untouched.
        fold['train'] = vocal_train_split[0]['train']
        fold['validation'] = vocal_train_split[0]['test']

    print("There are", len(fold['train']), "songs on train set and",
          len(fold['validation']), "songs on validation set")

There are 38 songs on train set and 13 songs on validation set
There are 33 songs on train set and 10 songs on validation set
There are 40 songs on train set and 11 songs on validation set
There are 33 songs on train set and 12 songs on validation set
There are 37 songs on train set and 13 songs on validation set
There are 39 songs on train set and 10 songs on validation set
There are 34 songs on train set and 13 songs on validation set
There are 40 songs on train set and 11 songs on validation set
There are 36 songs on train set and 12 songs on validation set
There are 34 songs on train set and 12 songs on validation set


In [19]:
import json
# Write the whole vocal_split structure to a JSON file in the working directory
with open('split_voiced_medleydb.json', 'w') as split_file:
    json.dump(vocal_split, split_file)