/
mfcc.py
92 lines (76 loc) · 3.04 KB
/
mfcc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#
# Copyright (C) 2016-2019 by Nathan Lovato, Daniel Oakey, Razvan Radulescu, and contributors
#
# This file is part of Power Sequencer.
#
# Power Sequencer is free software: you can redistribute it and/or modify it under the terms of the
# GNU General Public License as published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Power Sequencer is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
# without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with Power Sequencer. If
# not, see <https://www.gnu.org/licenses/>.
#
import numpy as np
from .trfbank import trfbank
from .segment_axis import segment_axis
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed
    nwin: int
        length, in samples, of the Hamming window applied to each frame
    nfft: int
        number of points of the FFT computed on each windowed frame
    fs: int
        sampling rate of `input`, forwarded to the filter-bank builder
    nceps: int
        number of cepstral coefficients kept for each frame

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients (the first `nceps` DCT coefficients of
        each row of `mspec`)
    mspec: ndarray
        Log-spectrum in the mel-domain.
    spec: ndarray
        Magnitude of the FFT of the windowed frames.

    Notes
    -----
    MFCC are computed as follows:

    * Pre-processing in time-domain (pre-emphasizing)
    * Compute the spectrum amplitude by windowing with a Hamming window
    * Filter the signal in the spectral domain with a triangular
      filter-bank, whose filters are approximately linearly spaced on the
      mel scale, and have equal bandwidth in the mel scale
    * Compute the DCT of the log-spectrum

    Filter-bank outputs that are exactly zero (e.g. digital silence) are
    clamped to the smallest positive normal float before taking the
    logarithm, so `mspec` and `ceps` stay finite instead of holding
    -inf / NaN.

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    # SciPy is imported when the function is called, not when the module is loaded.
    # The window functions live in scipy.signal.windows: the scipy.signal.hamming
    # alias was removed in SciPy 1.13. dct is taken from scipy.fftpack directly
    # because the scipy.fftpack.realtransforms namespace is deprecated.
    from scipy.signal import lfilter
    from scipy.signal.windows import hamming
    from scipy.fftpack import dct, fft

    # MFCC parameters: taken from auditory toolbox
    # Value handed to segment_axis as its third argument (presumably the overlap
    # between consecutive frames, i.e. a 160-sample advance -- confirm against
    # segment_axis).
    over = nwin - 160
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97

    # Filter-bank layout: 13 linearly spaced filters followed by 27
    # logarithmically spaced ones, starting at roughly 400 / 3 Hz.
    lowfreq = 133.33
    linsc = 200 / 3.0
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    # sym=False requests the periodic form of the window.
    w = hamming(nwin, sym=False)
    # Only the first element returned by trfbank (the filter matrix) is needed.
    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    # ------------------
    # Compute the MFCC
    # ------------------
    # Pre-emphasis: y[n] = x[n] - prefac * x[n - 1]
    extract = lfilter([1.0, -prefac], 1, input)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))

    # Filter the spectrum through the triangle filterbank
    mel_energy = np.dot(spec, fbank.T)
    # log10(0) is -inf, which the DCT below would spread into NaN for the whole
    # frame; clamping to the smallest positive normal value only affects entries
    # that are zero (or subnormal).
    floor = np.finfo(mel_energy.dtype).tiny
    mspec = np.log10(np.maximum(mel_energy, floor))

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm="ortho", axis=-1)[:, :nceps]

    return ceps, mspec, spec