-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
47 lines (42 loc) · 1.7 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import librosa
import numpy as np
from pathlib import Path
# common parameters
DATA_LOCATION = Path('datasets/SecklowSounds')
DOCS_LOCATION = DATA_LOCATION / 'docs'
CHUNKS_LOCATION = DOCS_LOCATION / 'chunks'
GOLD_LOCATION = DATA_LOCATION / 'gold.csv'
MELGRAM_LOCATION = DATA_LOCATION / 'melgrams'
# mel-spectrogram parameters
SR = 12000
N_FFT = 512
N_MELS = 96
HOP_LEN = 256
DURA = 29.12 # to make it 1366 frame..
N_FRAMES = 1366 # TODO discover where those numbers come from #int(DURA * SR)
# TODO see https://github.com/keras-team/keras/blame/5a48df22f0d9dd5b365ed5afc9923424d2e8c2c9/keras/layers/core.py#L76
# the shapes are ok float until the Dense layer
def compute_melgram(audio_path):
''' Compute a mel-spectrogram and returns it in a shape of (1,96,1366), where
96 == #mel-bins and 1366 == #time frame
parameters
----------
audio_path: path for the audio file.
Any format supported by audioread will work.
More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load
'''
src, sr = librosa.load(audio_path, sr=SR) # whole signal
n_sample = src.shape[0]
n_sample_fit = int(DURA*SR)
# TODO remove padding / truncating, real variable length
if n_sample < n_sample_fit: # if too short
src = np.hstack((src, np.zeros((int(DURA*SR) - n_sample,))))
elif n_sample > n_sample_fit: # if too long
src = src[(n_sample-n_sample_fit)//2:(n_sample+n_sample_fit)//2]
logam = librosa.power_to_db
melgram = librosa.feature.melspectrogram
ret = logam(melgram(y=src, sr=SR, hop_length=HOP_LEN,
n_fft=N_FFT, n_mels=N_MELS)**2,
ref=1.0)
ret = ret[np.newaxis, :]
return ret