In [5]:
import scipy.io.wavfile
sample_rate, signal = scipy.io.wavfile.read('./wav/docs_feature_extraction_example.wav')

In [6]:
sample_rate

16000

In [7]:
signal

array([36, 37, 60, ...,  7,  9,  8], dtype=int16)

In [8]:
len(signal)

183280

In [10]:
len(signal) / sample_rate

11.455

In [11]:
signal = signal[0:int(3.5 * sample_rate)]

In [12]:
len(signal)

56000

In [13]:
import numpy as np
pre_emphasis = 0.97
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

In [15]:
emphasized_signal

array([  36.  ,    2.08,   24.11, ..., -233.76, -262.5 ,  -61.87])

In [16]:
frame_size = 0.025
frame_stride = 0.01
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z)
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
          np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]

In [17]:
pad_signal

array([36.  ,  2.08, 24.11, ...,  0.  ,  0.  ,  0.  ])

In [18]:
len(pad_signal)

56080

In [19]:
indices

array([[    0,     1,     2, ...,   397,   398,   399],
       [  160,   161,   162, ...,   557,   558,   559],
       [  320,   321,   322, ...,   717,   718,   719],
       ...,
       [55200, 55201, 55202, ..., 55597, 55598, 55599],
       [55360, 55361, 55362, ..., 55757, 55758, 55759],
       [55520, 55521, 55522, ..., 55917, 55918, 55919]])

In [20]:
indices.shape

(348, 400)

In [21]:
frames

array([[  36.  ,    2.08,   24.11, ...,    4.56,    3.74,    2.89],
       [  16.43,  -32.15,  -47.2 , ...,  -13.06,  -16.45,    2.07],
       [  -9.  ,   -9.27,   11.46, ...,   -5.09,   -7.24,   -2.45],
       ...,
       [ 315.7 ,  130.65,  211.81, ..., -121.15,  -17.69, -195.02],
       [ 283.62, 1098.42,  815.34, ...,   20.53,  136.92,  150.79],
       [ -59.03, -212.81, -289.18, ..., -157.35,  -81.12,   24.54]])

In [22]:
frames *= np.array([0.54 - 0.46 * np.cos((2 * np.pi * n) / (frame_length - 1)) for n in range(frame_length)])

In [23]:
NFFT = 512
dft_frames = np.fft.rfft(frames, NFFT)

In [24]:
mag_frames = np.absolute(dft_frames)

In [25]:
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))

In [26]:
dft_frames

array([[  18.58681572  +0.j        ,  -14.01988178 -84.91698726j,
         -24.70392533+106.88963159j, ...,  -26.66158624  -5.85474324j,
           0.92680879 +28.72849855j,   32.82338322  +0.j        ],
       [-142.51526149  +0.j        ,   85.6674828 +108.25845827j,
          13.38303476-108.51765447j, ...,  -10.58513364  +4.31215777j,
           7.05534013  -2.14342983j,   -3.03115655  +0.j        ],
       [ -19.2843489   +0.j        ,  -15.14198098 -16.21735682j,
         -40.12895986 +59.02120051j, ...,   -4.25775098 -14.59761671j,
         -10.25228518  +7.21787503j,    8.21971695  +0.j        ],
       ...,
       [ -20.99872977  +0.j        , -173.80587746 -75.85843408j,
        -120.10047358+121.33988075j, ...,  -36.66157943-135.12987296j,
        -150.42479757 -89.13659856j,  214.56477173  +0.j        ],
       [ 223.33492956  +0.j        ,  268.55310205 +78.36772313j,
         177.86153856 +13.40968462j, ..., -471.85194623-511.29061637j,
          45.11248225+373.13984952j

In [27]:
pow_frames

array([[6.74745544e-01, 1.44676793e+01, 2.35071822e+01, ...,
        1.45530898e+00, 1.61364376e+00, 2.10424704e+00],
       [3.96691402e+01, 3.72242410e+01, 2.33499745e+01, ...,
        2.55155779e-01, 1.06195539e-01, 1.79451368e-02],
       [7.26340063e-01, 9.61488770e-01, 9.94889752e+00, ...,
        4.51599330e-01, 3.07045061e-01, 1.31960443e-01],
       ...,
       [8.61223929e-01, 7.02402052e+01, 5.69286922e+01, ...,
        3.82893632e+01, 5.97127987e+01, 8.99180494e+01],
       [9.74189273e+01, 1.52855993e+02, 6.21377862e+01, ...,
        9.45434284e+02, 2.75915007e+02, 5.44215190e+01],
       [9.82519496e+00, 4.31041255e+01, 5.27870198e+00, ...,
        2.90334711e+02, 6.55008748e+01, 2.66659520e-01]])

In [28]:
nfilt = 40
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
bin = np.floor((NFFT + 1) * hz_points / sample_rate)

fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
    f_m_minus = int(bin[m - 1])   # left
    f_m = int(bin[m])             # center
    f_m_plus = int(bin[m + 1])    # right
    for k in range(f_m_minus, f_m):
        fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
    for k in range(f_m, f_m_plus):
        fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])


In [29]:
len(fbank[0])

257

In [30]:
len(fbank[39])

257

In [31]:
fbank[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [32]:
fbank[39]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [33]:
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability

In [34]:
filter_banks = 20 * np.log10(filter_banks)  # dB

In [35]:
filter_banks

array([[23.20797748, 28.43010144, 12.25398612, ..., 40.6257818 ,
        39.71303889, 47.31537907],
       [31.41651703, 27.64136052,  6.05671685, ..., 33.94876132,
        31.78398749, 35.21455301],
       [-0.34111568, 23.76028182, 27.63086377, ..., 36.05665245,
        36.21223957, 38.67813892],
       ...,
       [36.93171543, 37.49741118, 79.87932729, ..., 85.35981132,
        65.0781584 , 75.43769198],
       [43.68564944, 37.76530122, 80.36305299, ..., 81.93777749,
        73.32696043, 84.03679199],
       [32.69037676, 24.13884613, 80.23978358, ..., 86.63723702,
        71.04168507, 78.27677254]])

In [36]:
filter_banks.shape

(348, 40)

In [37]:
from scipy.fftpack import dct
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1 : (num_ceps + 1)] # Keep 2-13

In [38]:
mfcc.shape

(348, 12)

In [39]:
mfcc

array([[-70.61457095, -73.42417413,   6.03918874, ...,   0.41193953,
          0.52327877,   1.33707611],
       [-56.42592116, -68.28832959,   8.2060342 , ...,   8.15586847,
          0.12371646,  15.13425081],
       [-49.63784465, -62.84072546,  -1.38257895, ...,  -0.14776772,
         -0.92732454,  -7.98662188],
       ...,
       [-10.47629573, -43.35025103,  -2.78813316, ..., -15.00487819,
         -8.44861337, -18.41546277],
       [-13.00736419, -37.74980874,  -3.52627102, ...,  -9.43215238,
        -11.52338732, -14.32990337],
       [-14.05078172, -48.15574966,  -6.33121662, ..., -17.82431596,
        -10.26252646, -20.6654707 ]])

In [40]:
(nframes, ncoeff) = mfcc.shape
cep_lifter = 22
n = np.arange(ncoeff)
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
mfcc *= lift

In [41]:
mfcc

array([[ -70.61457095, -188.36701827,   24.75498569, ...,    4.75972343,
           6.22075677,   16.04491329],
       [ -56.42592116, -175.191198  ,   33.63701118, ...,   94.23635187,
           1.4707457 ,  181.61100977],
       [ -49.63784465, -161.21556994,   -5.66727146, ...,   -1.70737067,
         -11.02406743,  -95.83946253],
       ...,
       [ -10.47629573, -111.21347463,  -11.4287199 , ..., -173.37270531,
        -100.43741869, -220.98555328],
       [ -13.00736419,  -96.84574591,  -14.45438987, ..., -108.98307567,
        -136.99044168, -171.95884042],
       [ -14.05078172, -123.54180462,  -25.95202492, ..., -205.94968109,
        -122.00128262, -247.98564834]])

여기까지가 MFCC post processing

In [42]:
filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)

In [43]:
filter_banks

array([[ -5.51767373,  -3.4808014 , -44.47846101, ..., -24.56746926,
        -21.40441976, -13.11285479],
       [  2.69086582,  -4.26954232, -50.67573028, ..., -31.24448974,
        -29.33347116, -25.21368086],
       [-29.06676688,  -8.15062102, -29.10158336, ..., -29.13659861,
        -24.90521909, -21.75009495],
       ...,
       [  8.20606423,   5.58650835,  23.14688016, ...,  20.16656026,
          3.96069974,  15.00945812],
       [ 14.95999823,   5.85439839,  23.63060586, ...,  16.74452643,
         12.20950178,  23.60855813],
       [  3.96472556,  -7.7720567 ,  23.50733646, ...,  21.44398596,
          9.92422641,  17.84853868]])

여기는 mean normalization