# In [25]:
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

def compute_melgram(audio_path):
    """
    Compute a log-amplitude mel-spectrogram for one audio file.

    The file is loaded at its native sampling rate, resampled to SR,
    zero-padded or center-cropped to exactly DUR seconds, and turned into a
    mel-spectrogram that is then converted with librosa.logamplitude.

    audio_path: path of the audio file, handed to librosa.load.

    Returns a 2-D np array of shape (#mel-bins, #time frames); with the
    parameters below that is (96, 938). The shape is also printed.
    """

    # Audio and mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = N_FFT // 2  # overlap 50%; floor division keeps the hop an int on Python 3
    DUR = 20              # in seconds

    # Load audio and downsample
    # NOTE(review): positional resample arguments and librosa.logamplitude
    # (used below) follow the older librosa API this cell was written
    # against - confirm against the installed librosa version.
    src, orig_sr = librosa.load(audio_path, sr=None)  # whole signal at native sampling rate
    src = librosa.core.resample(src, orig_sr, SR)     # downsample down to SR

    # Force the signal to exactly DUR seconds so every output has the same
    # number of time frames.
    n_sample = src.shape[0]
    n_sample_fit = int(DUR * SR)
    if n_sample < n_sample_fit:                       # if too short, pad with zeros
        src = np.hstack((src, np.zeros((n_sample_fit - n_sample,))))
    elif n_sample > n_sample_fit:                     # if too long, take middle section of length DUR seconds
        # Floor division: slice indices must be integers (plain `/` yields a
        # float on Python 3 and raises when used as an index).
        start = (n_sample - n_sample_fit) // 2
        src = src[start:start + n_sample_fit]

    # Compute log mel spectrogram
    # NOTE(review): melspectrogram returns a power spectrogram by default, so
    # the extra **2 squares it again before the dB conversion. Left as-is
    # because downstream consumers may rely on this scaling - confirm.
    logam = librosa.logamplitude
    melgram = librosa.feature.melspectrogram
    ret = logam(melgram(y=src, sr=SR, hop_length=HOP_LEN,
                        n_fft=N_FFT, n_mels=N_MELS)**2,
                ref_power=1.0)

    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(ret.shape)
    return ret

# Compute the log mel-spectrogram of one sample clip and plot it.
mel = compute_melgram('01GbSQ6e7OeneJULQ4sJT0.mp3')
# mel = compute_melgram('0NAxC72UOJWe369LHh2Ivq.mp3')

# specshow derives its axis labels from sr / hop_length / fmax, so they must
# match the values compute_melgram used (SR = 12000, HOP_LEN = 256). Without
# them specshow falls back to its own defaults and mis-scales the time axis.
# fmax is the Nyquist frequency SR / 2 = 6000 Hz, the upper edge of the mel
# filterbank when melspectrogram is called without an explicit fmax.
librosa.display.specshow(mel, sr=12000, hop_length=256,
                         y_axis='mel', fmax=6000, x_axis='time')
plt.colorbar(format='%+2.0f dB')
plt.title('Mel spectrogram')
plt.tight_layout()
plt.show()

# Output: (96, 938)
