# Split MedleyDB 

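This notebook splits the MedleyDB dataset into train, validation and test sets. The multitracks that contain vocal melodies and the multitracks that do not are split separately and then merged, so that every subset contains both kinds of tracks.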

I used the MedleyDB API to manipulate the files and build the subsets.
The main dependency is:

 - MedleyDB API: [https://github.com/marl/medleydb](https://github.com/marl/medleydb)

In [1]:
import medleydb as mdb

# Walk over every multitrack known to MedleyDB and keep only its track id
mtrack_generator = mdb.load_all_multitracks()

all_tracks_id = list(track.track_id for track in mtrack_generator)



In [2]:
# Fetch and show all valid instrument labels
instruments = mdb.get_valid_instrument_labels()
print(instruments)

{'cello section', 'bongo', 'liuqin', 'horn section', 'vibraphone', 'kick drum', 'dilruba', 'bamboo flute', 'double bass', 'claps', 'sitar', 'electric piano', 'french horn', 'alto saxophone', 'trumpet', 'darbuka', 'bassoon', 'doumbek', 'zhongruan', 'trombone', 'electronic organ', 'theremin', 'marimba', 'melodica', 'violin', 'tack piano', 'bass clarinet', 'glockenspiel', 'choir', 'female rapper', 'drum machine', 'cabasa', 'dulcimer', 'Main System', 'string section', 'yangqin', 'beatboxing', 'chimes', 'violin section', 'female speaker', 'tambourine', 'cornet', 'tuba', 'guzheng', 'auxiliary percussion', 'mandolin', 'euphonium', 'gu', 'guiro', 'slide guitar', 'fx/processed sound', 'bagpipe', 'electric bass', 'piccolo', 'synthesizer', 'shaker', 'lap steel guitar', 'maracas', 'xylophone', 'tenor saxophone', 'harmonium', 'ukulele', 'erhu', 'viola', 'conga', 'male screamer', 'pipe organ', 'Unlabeled', 'panpipes', 'harp', 'french horn section', 'bandoneon', 'cello', 'female screamer', 'bass drum

In [3]:
# Inspect one track: show the instrument label(s) of its first melody stem
mtrack1 = mdb.MultiTrack('LizNelson_Rainfall')
first_melody_stem = mtrack1.melody_stems()[0]
print(first_melody_stem.instrument)

['female singer']


In [4]:
# Collect the ids of every track that has at least one melody stem carrying a
# singing-voice label. Each stem's `instrument` is a list of labels
# (e.g. ['female singer']), so every label of every melody stem is checked.
print('== List of musics with singing voice ==')
vocal_labels = {'female singer', 'male singer', 'vocalists', 'choir'}
vocal_tracks_id = []
for music in all_tracks_id:
    mtrack = mdb.MultiTrack(music)
    # One matching label on any melody stem is enough to call the track vocal.
    if any(label in vocal_labels
           for stem in mtrack.melody_stems()
           for label in stem.instrument):
        vocal_tracks_id.append(music)
        print(music)

== List of musics with singing voice ==
AClassicEducation_NightOwl
AimeeNorwich_Child
AlexanderRoss_GoodbyeBolero
AlexanderRoss_VelvetCurtain
Auctioneer_OurFutureFaces
AvaLuna_Waterduct
BigTroubles_Phantom
BrandonWebster_DontHearAThing
BrandonWebster_YesSirICanFly
CelestialShore_DieForUs
ClaraBerryAndWooldog_AirTraffic
ClaraBerryAndWooldog_Boys
ClaraBerryAndWooldog_Stella
ClaraBerryAndWooldog_TheBadGuys
ClaraBerryAndWooldog_WaltzForMyVictims
Creepoid_OldTree
Debussy_LenfantProdigue
DreamersOfTheGhetto_HeavyLove
FacesOnFilm_WaitingForGa
FamilyBand_Again
Handel_TornamiAVagheggiar
HeladoNegro_MitadDelMundo
HezekiahJones_BorrowedHeart
HopAlong_SisterCities
InvisibleFamiliars_DisturbingWildlife
LizNelson_Coldwar
LizNelson_ImComingHome
LizNelson_Rainfall
MatthewEntwistle_DontYouEver
MatthewEntwistle_Lontano
Meaxic_TakeAStep
Meaxic_YouListen
Mozart_BesterJungling
Mozart_DiesBildnis
MusicDelta_80sRock
MusicDelta_Beatles
MusicDelta_Britpop
MusicDelta_Country1
MusicDelta_Country2
MusicDelta_Disc

In [5]:
# Number of track ids collected from all loaded multitracks
len(all_tracks_id)

122

In [6]:
# Number of tracks selected as containing a singing-voice melody
len(vocal_tracks_id)

61

In [7]:
# Split the vocal tracks with MedleyDB's artist-conditional splitter,
# holding out test_size=0.20; the seed is fixed so the split is repeatable.
vocal_split = mdb.utils.artist_conditional_split(
    trackid_list=vocal_tracks_id,
    test_size=0.20,
    num_splits=1,
    random_state=8526325,
)

In [8]:
# Vocal tracks assigned to the training side of the first (only) split
print(vocal_split[0]['train'])

['AClassicEducation_NightOwl', 'AimeeNorwich_Child', 'AlexanderRoss_GoodbyeBolero', 'AlexanderRoss_VelvetCurtain', 'Auctioneer_OurFutureFaces', 'AvaLuna_Waterduct', 'BigTroubles_Phantom', 'BrandonWebster_DontHearAThing', 'BrandonWebster_YesSirICanFly', 'ClaraBerryAndWooldog_AirTraffic', 'ClaraBerryAndWooldog_Boys', 'ClaraBerryAndWooldog_Stella', 'ClaraBerryAndWooldog_TheBadGuys', 'ClaraBerryAndWooldog_WaltzForMyVictims', 'Creepoid_OldTree', 'Debussy_LenfantProdigue', 'DreamersOfTheGhetto_HeavyLove', 'FacesOnFilm_WaitingForGa', 'FamilyBand_Again', 'Handel_TornamiAVagheggiar', 'HeladoNegro_MitadDelMundo', 'HezekiahJones_BorrowedHeart', 'HopAlong_SisterCities', 'LizNelson_Coldwar', 'LizNelson_ImComingHome', 'LizNelson_Rainfall', 'MatthewEntwistle_DontYouEver', 'MatthewEntwistle_Lontano', 'Meaxic_TakeAStep', 'Meaxic_YouListen', 'Mozart_BesterJungling', 'Mozart_DiesBildnis', 'MusicDelta_80sRock', 'MusicDelta_Beatles', 'MusicDelta_Britpop', 'MusicDelta_Disco', 'MusicDelta_Grunge', 'MusicDelt

In [9]:
# Vocal tracks held out for testing in the first (only) split
print(vocal_split[0]['test'])

['CelestialShore_DieForUs', 'InvisibleFamiliars_DisturbingWildlife', 'MusicDelta_Country1', 'MusicDelta_Country2', 'MusicDelta_Gospel', 'MusicDelta_Rock', 'PortStWillow_StayEven', 'Snowmine_Curfews', 'StrandOfOaks_Spacestation', 'SweetLights_YouLetMeDown']


## Combine the tracks without singing voice with the singing-voice subsets

In [10]:
# Tracks without a singing-voice melody: every track id that was not selected
# as vocal. A set gives constant-time membership tests instead of scanning the
# vocal list once per track; the order of `all_tracks_id` is preserved.
vocal_ids = set(vocal_tracks_id)
li_dif = [track_id for track_id in all_tracks_id if track_id not in vocal_ids]
li_dif

['AimeeNorwich_Flying',
 'AmarLal_Rest',
 'AmarLal_SpringDay1',
 'ChrisJacoby_BoothShotLincoln',
 'ChrisJacoby_PigsFoot',
 'CroqueMadame_Oil',
 'CroqueMadame_Pilot',
 'EthanHein_1930sSynthAndUprightBass',
 'EthanHein_BluesForNofi',
 'EthanHein_GirlOnABridge',
 'EthanHein_HarmonicaFigure',
 'Grants_PunchDrunk',
 'JoelHelander_Definition',
 'JoelHelander_ExcessiveResistancetoChange',
 'JoelHelander_IntheAtticBedroom',
 'KarimDouaidy_Hopscotch',
 'KarimDouaidy_Yatora',
 'Lushlife_ToynbeeSuite',
 'MatthewEntwistle_AnEveningWithOliver',
 'MatthewEntwistle_FairerHopes',
 'MatthewEntwistle_ImpressionsOfSaturn',
 'MatthewEntwistle_TheArch',
 'MatthewEntwistle_TheFlaxenField',
 'MichaelKropf_AllGoodThings',
 'MusicDelta_BebopJazz',
 'MusicDelta_Beethoven',
 'MusicDelta_ChineseChaoZhou',
 'MusicDelta_ChineseDrama',
 'MusicDelta_ChineseHenan',
 'MusicDelta_ChineseJiangNan',
 'MusicDelta_ChineseXinJing',
 'MusicDelta_ChineseYaoZu',
 'MusicDelta_CoolJazz',
 'MusicDelta_FreeJazz',
 'MusicDelta_FunkJ

In [11]:
# Same splitter and seed as for the vocal tracks, applied to the
# non-vocal tracks (test_size=0.20).
instr_split = mdb.utils.artist_conditional_split(
    trackid_list=li_dif,
    test_size=0.20,
    num_splits=1,
    random_state=8526325,
)

In [12]:
# Number of non-vocal tracks held out here; they are later appended to the
# vocal test tracks to complete the test set.
len(instr_split[0]['test'])

15

In [13]:
# Number of non-vocal tracks kept on the training side; they are split again
# below into train/validation parts.
len(instr_split[0]['train'])

46

### Split the train set into train/validation sets

In [14]:
# Second-level split: divide the vocal training tracks again; the 'test'
# part of this inner split is later renamed to 'validation'.
vocal_train_split = mdb.utils.artist_conditional_split(
    trackid_list=vocal_split[0]['train'],
    test_size=0.20,
    num_splits=1,
    random_state=8526325,
)

In [15]:
# Second-level split of the non-vocal training tracks.
# Only the first split (index 0) is read further down, so a single split is
# requested, consistent with the other artist_conditional_split calls.
# NOTE(review): split 0 is expected to be the same as with num_splits=5 for a
# fixed random_state — confirm against the splitter's documentation.
instr_train_split = mdb.utils.artist_conditional_split(
    trackid_list=instr_split[0]['train'],
    test_size=0.20,
    num_splits=1,
    random_state=8526325,
)

In [16]:
# Sizes of the two parts of the inner vocal split, before anything is merged
tuple(len(vocal_train_split[0][part]) for part in ('train', 'test'))

(38, 13)

In [17]:
# Merge the non-vocal train/validation parts into the vocal ones.
# The dict is updated in place, so the guard keeps this cell safe to re-run:
# without it a second execution would append the non-vocal tracks again and,
# once a 'test' key has been put back, move the test set into validation.
merged_split = vocal_train_split[0]
if 'validation' not in merged_split:
    merged_split['train'] = merged_split['train'] + instr_train_split[0]['train']
    # The held-out part of the inner split becomes the validation set.
    merged_split['validation'] = merged_split.pop('test') + instr_train_split[0]['test']
len(merged_split['train']), len(merged_split['validation']), merged_split

(68,
 29,
 {'train': ['AClassicEducation_NightOwl',
   'AlexanderRoss_GoodbyeBolero',
   'AlexanderRoss_VelvetCurtain',
   'Auctioneer_OurFutureFaces',
   'AvaLuna_Waterduct',
   'BigTroubles_Phantom',
   'BrandonWebster_DontHearAThing',
   'BrandonWebster_YesSirICanFly',
   'ClaraBerryAndWooldog_AirTraffic',
   'ClaraBerryAndWooldog_Boys',
   'ClaraBerryAndWooldog_Stella',
   'ClaraBerryAndWooldog_TheBadGuys',
   'ClaraBerryAndWooldog_WaltzForMyVictims',
   'Creepoid_OldTree',
   'Debussy_LenfantProdigue',
   'DreamersOfTheGhetto_HeavyLove',
   'FacesOnFilm_WaitingForGa',
   'FamilyBand_Again',
   'HeladoNegro_MitadDelMundo',
   'HezekiahJones_BorrowedHeart',
   'HopAlong_SisterCities',
   'LizNelson_ImComingHome',
   'MatthewEntwistle_DontYouEver',
   'MatthewEntwistle_Lontano',
   'Mozart_DiesBildnis',
   'MusicDelta_80sRock',
   'MusicDelta_Beatles',
   'MusicDelta_Disco',
   'MusicDelta_Grunge',
   'MusicDelta_Hendrix',
   'MusicDelta_Punk',
   'MusicDelta_Reggae',
   'MusicDelta_

In [18]:
# Final test set: held-out vocal tracks plus held-out non-vocal tracks from
# the first-level splits.
final_split = vocal_train_split[0]
final_split['test'] = vocal_split[0]['test'] + instr_split[0]['test']

# Sanity check before exporting: no track id may appear twice, neither inside
# one subset nor across train/validation/test.
subset_sizes = [len(final_split[name]) for name in ('train', 'validation', 'test')]
all_ids = final_split['train'] + final_split['validation'] + final_split['test']
assert len(set(all_ids)) == sum(subset_sizes), 'train/validation/test overlap or contain duplicates'

len(final_split['train']), len(final_split['validation']), len(final_split['test']), final_split

(68,
 29,
 25,
 {'test': ['CelestialShore_DieForUs',
   'InvisibleFamiliars_DisturbingWildlife',
   'MusicDelta_Country1',
   'MusicDelta_Country2',
   'MusicDelta_Gospel',
   'MusicDelta_Rock',
   'PortStWillow_StayEven',
   'Snowmine_Curfews',
   'StrandOfOaks_Spacestation',
   'SweetLights_YouLetMeDown',
   'AmarLal_Rest',
   'AmarLal_SpringDay1',
   'Grants_PunchDrunk',
   'Lushlife_ToynbeeSuite',
   'MusicDelta_GriegTrolltog',
   'MusicDelta_SwingJazz',
   'TablaBreakbeatScience_Animoog',
   'TablaBreakbeatScience_CaptainSky',
   'TablaBreakbeatScience_MiloVsMongo',
   'TablaBreakbeatScience_MoodyPlucks',
   'TablaBreakbeatScience_PhaseTransition',
   'TablaBreakbeatScience_RockSteady',
   'TablaBreakbeatScience_Scorpio',
   'TablaBreakbeatScience_Vger',
   'TablaBreakbeatScience_WhoIsIt'],
  'train': ['AClassicEducation_NightOwl',
   'AlexanderRoss_GoodbyeBolero',
   'AlexanderRoss_VelvetCurtain',
   'Auctioneer_OurFutureFaces',
   'AvaLuna_Waterduct',
   'BigTroubles_Phantom',
 

In [19]:
import json
# Persist the merged train/validation/test dictionary as a JSON file
with open('split_all_medleydb.json', 'w') as split_file:
    split_file.write(json.dumps(vocal_train_split[0]))