In [1]:
import os
from tqdm.notebook import tqdm
from deepmusic import MusicRepr, Constants
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import pickle

In [2]:
# Representation settings shared by every file processed in this notebook:
# unit=4, with 20 bins each for tempo and velocity.
const = Constants(unit=4, num_tempo_bins=20, num_velocity_bins=20)

# Directory holding the MIDI files. The trailing slash matters because file
# names are appended to this string directly when a file is loaded.
path = '/home/soroosh/data/MIDI/lmd_processed/'

# Keep only the directory entries whose name ends in '.mid'.
files = [entry for entry in os.listdir(path) if entry.endswith('.mid')]
len(files)

22945

In [3]:
def process_file(file):
    """Summarise one MIDI file as a flat record.

    Loads ``path + file`` with ``MusicRepr.from_file`` (using the notebook-level
    ``const``), splits it with ``separate_tracks()`` and returns a dict with:

    * ``'file'``  -- the file name exactly as passed in,
    * ``'n_bar'`` -- the value of ``get_bar_count()`` for the loaded sequence,
    * one key per entry of ``const.instruments`` -- ``len()`` of that
      instrument's separated track, or 0 when the instrument is not present.
      (Presumably this length is an event/note count -- TODO confirm against
      ``MusicRepr.separate_tracks``.)
    """
    music = MusicRepr.from_file(path + file, const=const)
    per_instrument = music.separate_tracks()

    record = {'file': file, 'n_bar': music.get_bar_count()}
    # Emit every known instrument so all records share the same set of keys.
    record.update(
        (name, len(per_instrument[name]) if name in per_instrument else 0)
        for name in const.instruments
    )
    return record
    
# Apply process_file to every listed MIDI file using 20 joblib workers.
# tqdm wraps the iterable of file names that Parallel consumes.
res = Parallel(n_jobs=20)(
    delayed(process_file)(midi_name) for midi_name in tqdm(files)
)

  0%|          | 0/22945 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [15]:
# One row per processed file; columns come from the keys of each record in `res`.
df = pd.DataFrame(data=res)
df.head(n=5)

Unnamed: 0,file,n_bar,piano,percussion,organ,guitar,bass,strings,ensemble,brass,reed,pipe,synth-lead,synth-pad,synth-effects,ethnic,percussive,sound-effects,drums
0,db3e9dc1cfe4bd92a83fccba6e71ce7c.mid,52,146,0,0,1194,319,0,509,0,0,1029,0,0,0,0,0,0,1189
1,98b6a94a047276d573621a1e772a131d.mid,147,0,0,0,3516,2137,0,0,0,0,748,2505,0,0,0,0,0,2834
2,2b56bdbe74fd736f4ba1ebd0fe39b3fc.mid,35,0,0,0,0,278,1117,289,0,0,0,0,0,0,0,0,0,1020
3,d8b5d21746970f85b4dda031715fcbb4.mid,110,1803,0,0,472,784,0,0,0,0,0,0,0,0,0,0,0,2457
4,dc460123023c30c77e08ee65e0021b5e.mid,87,858,293,0,1397,449,0,0,0,0,0,0,0,0,0,0,0,2087


In [26]:
# Order the files by ascending bar count, then renumber the rows from 0
# (the old index is discarded rather than kept as a column).
df = df.sort_values(by='n_bar')
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,file,n_bar,piano,percussion,organ,guitar,bass,strings,ensemble,brass,reed,pipe,synth-lead,synth-pad,synth-effects,ethnic,percussive,sound-effects,drums
0,845142e563abf25fc6e2bbd14243657b.mid,7,461,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,cc9046a014b743c01ea86394b6e106d3.mid,8,178,0,0,0,52,0,58,0,52,0,0,0,0,0,0,0,168
2,ad6b632b41263603dd46b2d609e898ca.mid,8,83,0,0,127,0,0,29,0,0,72,0,0,0,0,15,0,278
3,c4ae8c97333e806dda55b92a18b75024.mid,9,80,0,0,0,55,0,51,0,0,0,0,0,101,0,0,0,209
4,367d0f4718ba4e3dd28c5674bcc78dfc.mid,10,538,0,0,0,261,0,0,0,0,0,0,0,0,0,0,0,398


In [27]:
# Save the per-file statistics to the working directory, without the row index.
df.to_csv(path_or_buf='lmd_data.csv', index=False)

In [31]:
# Keep only the files where piano, guitar, drums and bass all have a
# non-zero count.
df_filtered = df.loc[
    df['piano'].gt(0)
    & df['guitar'].gt(0)
    & df['drums'].gt(0)
    & df['bass'].gt(0)
]
df_filtered.shape

(10808, 19)

In [37]:
# Names of the files kept in df_filtered, in the frame's current row order.
# NOTE: this rebinds `files`, which earlier held the full directory listing.
files = df_filtered['file'].to_list()

# Write the list through a context manager so the handle is flushed and closed
# as soon as the dump finishes, instead of being left open until garbage
# collection.
with open('files.pkl', 'wb') as pkl_out:
    pickle.dump(files, pkl_out)