In [11]:
import glob
import os
import librosa
import numpy as np
import pretty_midi

In [12]:
# Constants

MIDInotes_Range = [librosa.note_to_midi('A0'), librosa.note_to_midi('C8')]
Sample_Rate = 16000
Bins_per_octave = 36
n_octaves = 7
Hop_length = 512
spec_fmin = librosa.note_to_hz('A0')
val_rate = 1 / 7
n_Bins = n_octaves * Bins_per_octave
win_width = 32
kernel_size = 7
overlap = True

'''''''''
Allow pretty_midi to read MIDI files with absurdly high tick rates.
Useful for reading the MAPS dataset.
https://github.com/craffel/pretty-midi/issues/112
'''''''''

pretty_midi.pretty_midi.MAX_TICK = 1e10

In [13]:
def MIDI_to_mat(MIDI_path_train, length, len_of_CQT, Range_MIDI=MIDInotes_Range):
    MIDI_data = pretty_midi.PrettyMIDI(MIDI_path_train)
    pianoRoll = MIDI_data.instruments[0].get_piano_roll(fs=len_of_CQT * Sample_Rate / length)
    Ground_truth_Mat = (pianoRoll[Range_MIDI[0]:Range_MIDI[1] + 1, :len_of_CQT] > 0)
    return Ground_truth_Mat

In [19]:
dataset = ['Valid', 'Test', 'Train']

for data in dataset:

    S_Path = os.path.join('/Users/ronit/Documents/AIML_Project/DataSet', data)
    D_path = os.path.join('/Users/ronit/Documents/AIML_Project/DataSet_P', data)

    if not os.path.exists(D_path):
        os.makedirs(D_path)

    print(S_Path)

    WAVS = [file[:-4] for file in os.listdir(S_Path) if file.endswith('.wav')]
    MIDI = [file[:-4] for file in os.listdir(S_Path) if file.endswith('.mid')]

    pairs = []
    for file in WAVS:
        assert file in MIDI, 'Both the MIDI and WAV files not found'
        pairs.append((file + '.wav', file + '.mid'))
    
    cnt = 0
    for wav, midi in pairs:
        wav_path = os.path.join(S_Path, wav)
        x, sr = librosa.load(wav_path, sr=Sample_Rate)

        # Load the audio from specific path and compute Constant-Q Transform.
        CQT_file = librosa.cqt(x,
                               sr=Sample_Rate,
                               fmin=spec_fmin,
                               hop_length=Hop_length,
                               n_bins=n_Bins,
                               bins_per_octave=Bins_per_octave,
                               scale=False
                               )
        # Taking abs of Constant-Q Transform of given audio as 2D NumPy array
        CQT_abs = np.abs(CQT_file)
        # Converting the frequency data on a logarthmic scale for better visualization
        CQT = np.transpose(librosa.amplitude_to_db(CQT_abs))

        midi_path = os.path.join(S_Path, midi)
        Ground_Truth_mat = MIDI_to_mat(midi_path, len(x), CQT.shape[0])
        MIDI_Train = np.transpose(Ground_Truth_mat)

        # Length of MIDI < length of CQT, then cut CQT
        if MIDI_Train.shape[0] < CQT.shape[0]:
            CQT = CQT[:MIDI_Train.shape[0],:]
        # --> TODO : ADD one more condition


        # --> Processing WAV files
        W_temp_path = os.path.join(D_path, wav, 'WAV')
        if not os.path.exists(W_temp_path):
            os.makedirs(W_temp_path)
            
        W_path = os.path.join(W_temp_path, wav[:-4]) + '_CQT.npy'
        np.save(W_path, CQT)
        
        CQT_matrix = np.array(np.load(W_path))
        len_CQT = CQT_matrix.shape[0]
        nb_win = int(len_CQT / win_width)
        
        W = CQT_matrix.shape[1]
        # Adding Padding
        Mat = np.concatenate([np.zeros([int(kernel_size / 2), W]), CQT_matrix, np.zeros([int(kernel_size / 2), W])], axis = 0)
        cut_Mat = [Mat[i * win_width:(i + 1) * win_width+kernel_size-1,:] for i in range(nb_win)]
        
        CQT_cut_Mat = np.asarray(cut_Mat)
        os.remove(W_path)
        
        X = CQT_cut_Mat if cnt == 0 else np.concatenate((X, CQT_cut_Mat), axis = 0)
        
        # --> Processing MIDI files   
        M_temp_path = os.path.join(D_path, midi, 'MIDI')
        if not os.path.exists(M_temp_path):
            os.makedirs(M_temp_path)
            
        M_path = os.path.join(M_temp_path, midi[:-4]) + '_LABEL.npy'
        np.save(M_path, MIDI_Train)
        
        MIDI_matrix = np.array(np.load(M_path))
        len_MIDI = MIDI_matrix.shape[0]
        nb_win_ = int(len_MIDI / win_width)
        
        M = MIDI_matrix.shape[1]
        # Adding Padding
        Mat_ = np.concatenate([np.zeros([int(kernel_size / 2), M]), MIDI_matrix, np.zeros([int(kernel_size / 2), M])], axis = 0)
        cut_Mat_ = [Mat_[i * win_width:(i + 1) * win_width+kernel_size-1,:] for i in range(nb_win_)]
        
        MIDI_cut_Mat = np.asarray(cut_Mat_)
        os.remove(M_path)

        Y = MIDI_cut_Mat if cnt == 0 else np.concatenate((Y, MIDI_cut_Mat), axis = 0)
        
        print('Processed', wav[:-4], "File No:", cnt)
        cnt += 1
        
        os.rmdir(W_temp_path)
        os.rmdir(M_temp_path)
        os.rmdir(os.path.join(D_path, wav))
        os.rmdir(os.path.join(D_path, midi))
    
    X_ = np.expand_dims(X, axis = -2)
    Y_ = np.expand_dims(Y, axis = -2)
    
    CQT_final_Path = os.path.join(D_path, 'X_final_') + 'CQT_.npy'
    MIDI_final_Path = os.path.join(D_path, 'Y_final_') + 'Label_.npy'
    np.save(CQT_final_Path, X_)
    np.save(MIDI_final_Path, Y_)
    

/Users/ronit/Documents/AIML_Project/DataSet/Valid
Processed MAPS_MUS-mond_2_SptkBGCl File No: 0
Processed MAPS_MUS-mond_1_SptkBGAm File No: 1
Processed MAPS_MUS-chpn_op10_e05_SptkBGAm File No: 2
Processed MAPS_MUS-liz_et3_AkPnStgb File No: 3
Processed MAPS_MUS-mendel_op53_5_AkPnCGdD File No: 4
Processed MAPS_MUS-scn15_13_AkPnStgb File No: 5
Processed MAPS_MUS-chp_op18_AkPnCGdD File No: 6
Processed MAPS_MUS-grieg_walzer_SptkBGCl File No: 7
Processed MAPS_MUS-waldstein_3_SptkBGCl File No: 8
/Users/ronit/Documents/AIML_Project/DataSet/Test
Processed MAPS_MUS-schu_143_1_ENSTDkAm File No: 0
Processed MAPS_MUS-liz_rhap12_ENSTDkAm File No: 1
Processed MAPS_MUS-mz_333_3_ENSTDkCl File No: 2
Processed MAPS_MUS-schub_d760_3_ENSTDkAm File No: 3
Processed MAPS_MUS-scn15_11_ENSTDkAm File No: 4
Processed MAPS_MUS-ty_maerz_ENSTDkAm File No: 5
Processed MAPS_MUS-scn15_12_ENSTDkCl File No: 6
Processed MAPS_MUS-grieg_butterfly_ENSTDkCl File No: 7
Processed MAPS_MUS-mz_332_2_ENSTDkCl File No: 8
Processed 

In [22]:
X_data_Valid = np.load('DataSet_P/Valid/X_final_CQT_.npy')
X_data_Valid.shape

(1998, 38, 1, 252)

In [23]:
Y_data_Valid = np.load('DataSet_P/Valid/Y_final_Label_.npy')
Y_data_Valid.shape

(1998, 38, 1, 88)

In [24]:
X_data_Test = np.load('DataSet_P/Test/X_final_CQT_.npy')
X_data_Test.shape

(15219, 38, 1, 252)

In [25]:
Y_data_Test = np.load('DataSet_P/Test/Y_final_Label_.npy')
Y_data_Test.shape

(15219, 38, 1, 88)

In [27]:
X_data_Train = np.load('DataSet_P/Train/X_final_CQT_.npy')
X_data_Train.shape

(28721, 38, 1, 252)

In [26]:
Y_data_Train = np.load('DataSet_P/Train/Y_final_Label_.npy')
Y_data_Train.shape

(28721, 38, 1, 88)

In [30]:
a3 = np.load('DataSet_P/Train/X_final_CQT_.npy', mmap_mode='r')
a3

memmap([[[[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

         [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

         [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
            0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

         ...,

         [[-2.24933319e+01, -2.33259106e+01, -2.11815224e+01, ...,
           -3.65054741e+01, -3.65054741e+01, -3.65054741e+01]],

         [[-2.20094490e+01, -2.35525684e+01, -2.09465504e+01, ...,
           -3.65054741e+01, -3.65054741e+01, -3.65054741e+01]],

         [[-2.15774345e+01, -2.38304596e+01, -2.07589703e+01, ...,
           -3.22830162e+01, -2.27744694e+01, -2.49282036e+01]]],


        [[[-2.39244061e+01, -2.33881092e+01, -2.23313541e+01, ...,
           -3.65054741e+01, -3.65054741e+01, -3.65054741e+01]],

         [[-2.34484310e+01, -2.32849808e+01, -2.19118881e+0

In [31]:
X_data_Train

array([[[[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

        [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

        [[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
           0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],

        ...,

        [[-2.24933319e+01, -2.33259106e+01, -2.11815224e+01, ...,
          -3.65054741e+01, -3.65054741e+01, -3.65054741e+01]],

        [[-2.20094490e+01, -2.35525684e+01, -2.09465504e+01, ...,
          -3.65054741e+01, -3.65054741e+01, -3.65054741e+01]],

        [[-2.15774345e+01, -2.38304596e+01, -2.07589703e+01, ...,
          -3.22830162e+01, -2.27744694e+01, -2.49282036e+01]]],


       [[[-2.39244061e+01, -2.33881092e+01, -2.23313541e+01, ...,
          -3.65054741e+01, -3.65054741e+01, -3.65054741e+01]],

        [[-2.34484310e+01, -2.32849808e+01, -2.19118881e+01, ...,
        