diff --git a/realbook/__init__.py b/realbook/__init__.py index 3c114bf..193e4ea 100644 --- a/realbook/__init__.py +++ b/realbook/__init__.py @@ -16,7 +16,7 @@ # limitations under the License. __author__ = "Spotify" -__version__ = "1.0.1" +__version__ = "1.0.2" __email__ = "realbook@spotify.com" __description__ = "Python libraries for easier machine learning on audio" __url__ = "https://github.com/spotify/realbook" diff --git a/realbook/callbacks/spectrogram_visualization.py b/realbook/callbacks/spectrogram_visualization.py index fbca66c..e700a31 100644 --- a/realbook/callbacks/spectrogram_visualization.py +++ b/realbook/callbacks/spectrogram_visualization.py @@ -106,7 +106,7 @@ def on_train_begin(self, logs: Any = None) -> None: with self.tensorboard_writer.as_default(): # Pull n random batches from the dataset and send them to TensorBoard. - for (data, _) in self.example_batches: + for data, _ in self.example_batches: assert tf.rank(data) == 2, "Expected input data to be of rank 2, with shape (batch, audio)." 
assert tf.shape(data)[0] < tf.shape(data)[1], ( "Expected input data to be of rank 2, with shape (batch, audio), but got shape" diff --git a/realbook/layers/signal.py b/realbook/layers/signal.py index 95848ca..9499423 100644 --- a/realbook/layers/signal.py +++ b/realbook/layers/signal.py @@ -18,11 +18,11 @@ import warnings from typing import Any, Callable, Dict, Optional, Union -import librosa import tensorflow as tf import numpy as np from realbook.layers.math import log_base_b +from realbook.vendor import librosa_filters def _create_padded_window( @@ -209,7 +209,7 @@ def build(self, input_shape: tf.TensorShape) -> None: self.fft_length ) # type: ignore - self.window_sum = librosa.filters.window_sumsquare( + self.window_sum = librosa_filters.window_sumsquare( # type: ignore window=self.window.numpy(), n_frames=input_shape[0] if input_shape.rank == 2 else input_shape[1], win_length=self.window_length, @@ -353,7 +353,7 @@ def build(self, input_shape: tf.TensorShape) -> None: super().build(input_shape) self.mel_weight_matrix = tf.constant( - librosa.filters.mel( + librosa_filters.mel( # type: ignore sr=self.sample_rate, n_fft=self.fft_length, n_mels=self.n_mels, diff --git a/realbook/vendor/__init__.py b/realbook/vendor/__init__.py new file mode 100644 index 0000000..d984363 --- /dev/null +++ b/realbook/vendor/__init__.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright 2023 Spotify AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/realbook/vendor/librosa_filters.py b/realbook/vendor/librosa_filters.py new file mode 100644 index 0000000..0a6d0f1 --- /dev/null +++ b/realbook/vendor/librosa_filters.py @@ -0,0 +1,918 @@ +#!/usr/bin/env python +# encoding: utf-8 +# +# Copyright (c) 2013--2023, librosa development team. +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +# AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +# INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +# OF THIS SOFTWARE. + +# This is all copied from Librosa: don't bother type checking with MyPy. +# type: ignore + +import warnings +import numpy as np + +from numpy.typing import ArrayLike, DTypeLike +from typing import Optional, Union, Tuple, Any, Callable, Sequence, TypeVar +from typing_extensions import Literal + +_BoolLike_co = Union[bool, np.bool_] +_IntLike_co = Union[_BoolLike_co, int, "np.integer[Any]"] +_FloatLike_co = Union[_IntLike_co, float, "np.floating[Any]"] + +_WindowSpec = Union[str, Tuple[Any, ...], float, Callable[[int], np.ndarray], ArrayLike] +_T = TypeVar("_T") +_SequenceLike = Union[Sequence[_T], np.ndarray] +_ScalarOrSequence = Union[_T, _SequenceLike[_T]] + + +def tiny(x: Union[float, np.ndarray]) -> _FloatLike_co: + """Compute the tiny-value corresponding to an input's data type. + + This is the smallest "usable" number representable in ``x.dtype`` + (e.g., float32). 
+ + This is primarily useful for determining a threshold for + numerical underflow in division or multiplication operations. + + Parameters + ---------- + x : number or np.ndarray + The array to compute the tiny-value for. + All that matters here is ``x.dtype`` + + Returns + ------- + tiny_value : float + The smallest positive usable number for the type of ``x``. + If ``x`` is integer-typed, then the tiny value for ``np.float32`` + is returned instead. + + See Also + -------- + numpy.finfo + + Examples + -------- + For a standard double-precision floating point number: + + >>> librosa.util.tiny(1.0) + 2.2250738585072014e-308 + + Or explicitly as double-precision + + >>> librosa.util.tiny(np.asarray(1e-5, dtype=np.float64)) + 2.2250738585072014e-308 + + Or complex numbers + + >>> librosa.util.tiny(1j) + 2.2250738585072014e-308 + + Single-precision floating point: + + >>> librosa.util.tiny(np.asarray(1e-5, dtype=np.float32)) + 1.1754944e-38 + + Integer + + >>> librosa.util.tiny(5) + 1.1754944e-38 + """ + + # Make sure we have an array view + x = np.asarray(x) + + # Only floating types generate a tiny + if np.issubdtype(x.dtype, np.floating) or np.issubdtype(x.dtype, np.complexfloating): + dtype = x.dtype + else: + dtype = np.dtype(np.float32) + + return np.finfo(dtype).tiny + + +def normalize( + S: np.ndarray, + *, + norm: Optional[float] = np.inf, + axis: Optional[int] = 0, + threshold: Optional[_FloatLike_co] = None, + fill: Optional[bool] = None, +) -> np.ndarray: + """Normalize an array along a chosen axis. + + Given a norm (described below) and a target axis, the input + array is scaled so that:: + + norm(S, axis=axis) == 1 + + For example, ``axis=0`` normalizes each column of a 2-d array + by aggregating over the rows (0-axis). + Similarly, ``axis=1`` normalizes each row of a 2-d array. 
+ + This function also supports thresholding small-norm slices: + any slice (i.e., row or column) with norm below a specified + ``threshold`` can be left un-normalized, set to all-zeros, or + filled with uniform non-zero values that normalize to 1. + + Note: the semantics of this function differ from + `scipy.linalg.norm` in two ways: multi-dimensional arrays + are supported, but matrix-norms are not. + + Parameters + ---------- + S : np.ndarray + The array to normalize + + norm : {np.inf, -np.inf, 0, float > 0, None} + - `np.inf` : maximum absolute value + - `-np.inf` : minimum absolute value + - `0` : number of non-zeros (the support) + - float : corresponding l_p norm + See `scipy.linalg.norm` for details. + - None : no normalization is performed + + axis : int [scalar] + Axis along which to compute the norm. + + threshold : number > 0 [optional] + Only the columns (or rows) with norm at least ``threshold`` are + normalized. + + By default, the threshold is determined from + the numerical precision of ``S.dtype``. + + fill : None or bool + If None, then columns (or rows) with norm below ``threshold`` + are left as is. + + If False, then columns (rows) with norm below ``threshold`` + are set to 0. + + If True, then columns (rows) with norm below ``threshold`` + are filled uniformly such that the corresponding norm is 1. + + .. note:: ``fill=True`` is incompatible with ``norm=0`` because + no uniform vector exists with l0 "norm" equal to 1. + + Returns + ------- + S_norm : np.ndarray [shape=S.shape] + Normalized array + + Raises + ------ + ValueError + If ``norm`` is not among the valid types defined above + + If ``S`` is not finite + + If ``fill=True`` and ``norm=0`` + + See Also + -------- + scipy.linalg.norm + + Notes + ----- + This function caches at level 40. 
+ + Examples + -------- + >>> # Construct an example matrix + >>> S = np.vander(np.arange(-2.0, 2.0)) + >>> S + array([[-8., 4., -2., 1.], + [-1., 1., -1., 1.], + [ 0., 0., 0., 1.], + [ 1., 1., 1., 1.]]) + >>> # Max (l-infinity)-normalize the columns + >>> librosa.util.normalize(S) + array([[-1. , 1. , -1. , 1. ], + [-0.125, 0.25 , -0.5 , 1. ], + [ 0. , 0. , 0. , 1. ], + [ 0.125, 0.25 , 0.5 , 1. ]]) + >>> # Max (l-infinity)-normalize the rows + >>> librosa.util.normalize(S, axis=1) + array([[-1. , 0.5 , -0.25 , 0.125], + [-1. , 1. , -1. , 1. ], + [ 0. , 0. , 0. , 1. ], + [ 1. , 1. , 1. , 1. ]]) + >>> # l1-normalize the columns + >>> librosa.util.normalize(S, norm=1) + array([[-0.8 , 0.667, -0.5 , 0.25 ], + [-0.1 , 0.167, -0.25 , 0.25 ], + [ 0. , 0. , 0. , 0.25 ], + [ 0.1 , 0.167, 0.25 , 0.25 ]]) + >>> # l2-normalize the columns + >>> librosa.util.normalize(S, norm=2) + array([[-0.985, 0.943, -0.816, 0.5 ], + [-0.123, 0.236, -0.408, 0.5 ], + [ 0. , 0. , 0. , 0.5 ], + [ 0.123, 0.236, 0.408, 0.5 ]]) + + >>> # Thresholding and filling + >>> S[:, -1] = 1e-308 + >>> S + array([[ -8.000e+000, 4.000e+000, -2.000e+000, + 1.000e-308], + [ -1.000e+000, 1.000e+000, -1.000e+000, + 1.000e-308], + [ 0.000e+000, 0.000e+000, 0.000e+000, + 1.000e-308], + [ 1.000e+000, 1.000e+000, 1.000e+000, + 1.000e-308]]) + + >>> # By default, small-norm columns are left untouched + >>> librosa.util.normalize(S) + array([[ -1.000e+000, 1.000e+000, -1.000e+000, + 1.000e-308], + [ -1.250e-001, 2.500e-001, -5.000e-001, + 1.000e-308], + [ 0.000e+000, 0.000e+000, 0.000e+000, + 1.000e-308], + [ 1.250e-001, 2.500e-001, 5.000e-001, + 1.000e-308]]) + >>> # Small-norm columns can be zeroed out + >>> librosa.util.normalize(S, fill=False) + array([[-1. , 1. , -1. , 0. ], + [-0.125, 0.25 , -0.5 , 0. ], + [ 0. , 0. , 0. , 0. ], + [ 0.125, 0.25 , 0.5 , 0. ]]) + >>> # Or set to constant with unit-norm + >>> librosa.util.normalize(S, fill=True) + array([[-1. , 1. , -1. , 1. ], + [-0.125, 0.25 , -0.5 , 1. ], + [ 0. 
, 0. , 0. , 1. ], + [ 0.125, 0.25 , 0.5 , 1. ]]) + >>> # With an l1 norm instead of max-norm + >>> librosa.util.normalize(S, norm=1, fill=True) + array([[-0.8 , 0.667, -0.5 , 0.25 ], + [-0.1 , 0.167, -0.25 , 0.25 ], + [ 0. , 0. , 0. , 0.25 ], + [ 0.1 , 0.167, 0.25 , 0.25 ]]) + """ + + # Avoid div-by-zero + if threshold is None: + threshold = tiny(S) + + elif threshold <= 0: + raise ValueError(f"threshold={threshold} must be strictly positive") + + if fill not in [None, False, True]: + raise ValueError(f"fill={fill} must be None or boolean") + + if not np.all(np.isfinite(S)): + raise ValueError("Input must be finite") + + # All norms only depend on magnitude, let's do that first + mag = np.abs(S).astype(float) + + # For max/min norms, filling with 1 works + fill_norm = 1 + + if norm is None: + return S + + elif norm == np.inf: + length = np.max(mag, axis=axis, keepdims=True) + + elif norm == -np.inf: + length = np.min(mag, axis=axis, keepdims=True) + + elif norm == 0: + if fill is True: + raise ValueError("Cannot normalize with norm=0 and fill=True") + + length = np.sum(mag > 0, axis=axis, keepdims=True, dtype=mag.dtype) + + elif np.issubdtype(type(norm), np.number) and norm > 0: + length = np.sum(mag**norm, axis=axis, keepdims=True) ** (1.0 / norm) + + if axis is None: + fill_norm = mag.size ** (-1.0 / norm) + else: + fill_norm = mag.shape[axis] ** (-1.0 / norm) + + else: + raise ValueError(f"Unsupported norm: {repr(norm)}") + + # indices where norm is below the threshold + small_idx = length < threshold + + Snorm = np.empty_like(S) + if fill is None: + # Leave small indices un-normalized + length[small_idx] = 1.0 + Snorm[:] = S / length + + elif fill: + # If we have a non-zero fill value, we locate those entries by + # doing a nan-divide. + # If S was finite, then length is finite (except for small positions) + length[small_idx] = np.nan + Snorm[:] = S / length + Snorm[np.isnan(Snorm)] = fill_norm + else: + # Set small values to zero by doing an inf-divide. 
+ # This is safe (by IEEE-754) as long as S is finite. + length[small_idx] = np.inf + Snorm[:] = S / length + + return Snorm + + +def fft_frequencies(*, sr: float = 22050, n_fft: int = 2048) -> np.ndarray: + """Alternative implementation of `np.fft.fftfreq` + + Parameters + ---------- + sr : number > 0 [scalar] + Audio sampling rate + n_fft : int > 0 [scalar] + FFT window size + + Returns + ------- + freqs : np.ndarray [shape=(1 + n_fft/2,)] + Frequencies ``(0, sr/n_fft, 2*sr/n_fft, ..., sr/2)`` + + Examples + -------- + >>> librosa.fft_frequencies(sr=22050, n_fft=16) + array([ 0. , 1378.125, 2756.25 , 4134.375, + 5512.5 , 6890.625, 8268.75 , 9646.875, 11025. ]) + + """ + + return np.fft.rfftfreq(n=n_fft, d=1.0 / sr) + + +def hz_to_mel(frequencies: _ScalarOrSequence[_FloatLike_co], *, htk: bool = False) -> np.ndarray: + """Convert Hz to Mels + + Examples + -------- + >>> librosa.hz_to_mel(60) + 0.9 + >>> librosa.hz_to_mel([110, 220, 440]) + array([ 1.65, 3.3 , 6.6 ]) + + Parameters + ---------- + frequencies : number or np.ndarray [shape=(n,)] , float + scalar or array of frequencies + htk : bool + use HTK formula instead of Slaney + + Returns + ------- + mels : number or np.ndarray [shape=(n,)] + input frequencies in Mels + + See Also + -------- + mel_to_hz + """ + + frequencies = np.asanyarray(frequencies) + + if htk: + mels: np.ndarray = 2595.0 * np.log10(1.0 + frequencies / 700.0) + return mels + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (frequencies - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region + + if frequencies.ndim: + # If we have array data, vectorize + log_t = frequencies >= min_log_hz + mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep + elif frequencies >= min_log_hz: + # If we have scalar data, check directly + mels = 
min_log_mel + np.log(frequencies / min_log_hz) / logstep + + return mels + + +def mel_to_hz(mels: _ScalarOrSequence[_FloatLike_co], *, htk: bool = False) -> np.ndarray: + """Convert mel bin numbers to frequencies + + Examples + -------- + >>> librosa.mel_to_hz(3) + 200. + + >>> librosa.mel_to_hz([1,2,3,4,5]) + array([ 66.667, 133.333, 200. , 266.667, 333.333]) + + Parameters + ---------- + mels : np.ndarray [shape=(n,)], float + mel bins to convert + htk : bool + use HTK formula instead of Slaney + + Returns + ------- + frequencies : np.ndarray [shape=(n,)] + input mels in Hz + + See Also + -------- + hz_to_mel + """ + + mels = np.asanyarray(mels) + + if htk: + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = np.log(6.4) / 27.0 # step size for log region + + if mels.ndim: + # If we have vector data, vectorize + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel)) + elif mels >= min_log_mel: + # If we have scalar data, check directly + freqs = min_log_hz * np.exp(logstep * (mels - min_log_mel)) + + return freqs + + +def mel_frequencies(n_mels: int = 128, *, fmin: float = 0.0, fmax: float = 11025.0, htk: bool = False) -> np.ndarray: + """Compute an array of acoustic frequencies tuned to the mel scale. + + The mel scale is a quasi-logarithmic function of acoustic frequency + designed such that perceptually similar pitch intervals (e.g. octaves) + appear equal in width over the full hearing range. + + Because the definition of the mel scale is conditioned by a finite number + of subjective psychoacoustical experiments, several implementations coexist + in the audio signal processing literature [#]_.
By default, librosa replicates + the behavior of the well-established MATLAB Auditory Toolbox of Slaney [#]_. + According to this default implementation, the conversion from Hertz to mel is + linear below 1 kHz and logarithmic above 1 kHz. Another available implementation + replicates the Hidden Markov Toolkit [#]_ (HTK) according to the following formula:: + + mel = 2595.0 * np.log10(1.0 + f / 700.0). + + The choice of implementation is determined by the ``htk`` keyword argument: setting + ``htk=False`` leads to the Auditory toolbox implementation, whereas setting it ``htk=True`` + leads to the HTK implementation. + + .. [#] Umesh, S., Cohen, L., & Nelson, D. Fitting the mel scale. + In Proc. International Conference on Acoustics, Speech, and Signal Processing + (ICASSP), vol. 1, pp. 217-220, 1998. + + .. [#] Slaney, M. Auditory Toolbox: A MATLAB Toolbox for Auditory + Modeling Work. Technical Report, version 2, Interval Research Corporation, 1998. + + .. [#] Young, S., Evermann, G., Gales, M., Hain, T., Kershaw, D., Liu, X., + Moore, G., Odell, J., Ollason, D., Povey, D., Valtchev, V., & Woodland, P. + The HTK book, version 3.4. Cambridge University, March 2009. + + See Also + -------- + hz_to_mel + mel_to_hz + librosa.feature.melspectrogram + librosa.feature.mfcc + + Parameters + ---------- + n_mels : int > 0 [scalar] + Number of mel bins. + fmin : float >= 0 [scalar] + Minimum frequency (Hz). + fmax : float >= 0 [scalar] + Maximum frequency (Hz). + htk : bool + If True, use HTK formula to convert Hz to mel. + Otherwise (False), use Slaney's Auditory Toolbox. + + Returns + ------- + bin_frequencies : ndarray [shape=(n_mels,)] + Vector of ``n_mels`` frequencies in Hz which are uniformly spaced on the Mel + axis. + + Examples + -------- + >>> librosa.mel_frequencies(n_mels=40) + array([ 0. 
, 85.317, 170.635, 255.952, + 341.269, 426.586, 511.904, 597.221, + 682.538, 767.855, 853.173, 938.49 , + 1024.856, 1119.114, 1222.042, 1334.436, + 1457.167, 1591.187, 1737.532, 1897.337, + 2071.84 , 2262.393, 2470.47 , 2697.686, + 2945.799, 3216.731, 3512.582, 3835.643, + 4188.417, 4573.636, 4994.285, 5453.621, + 5955.205, 6502.92 , 7101.009, 7754.107, + 8467.272, 9246.028, 10096.408, 11025. ]) + + """ + + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(fmin, htk=htk) + max_mel = hz_to_mel(fmax, htk=htk) + + mels = np.linspace(min_mel, max_mel, n_mels) + + hz: np.ndarray = mel_to_hz(mels, htk=htk) + return hz + + +def mel( + *, + sr: float, + n_fft: int, + n_mels: int = 128, + fmin: float = 0.0, + fmax: Optional[float] = None, + htk: bool = False, + norm: Optional[Union[Literal["slaney"], float]] = "slaney", + dtype: DTypeLike = np.float32, +) -> np.ndarray: + """Create a Mel filter-bank. + + This produces a linear transformation matrix to project + FFT bins onto Mel-frequency bins. + + Parameters + ---------- + sr : number > 0 [scalar] + sampling rate of the incoming signal + + n_fft : int > 0 [scalar] + number of FFT components + + n_mels : int > 0 [scalar] + number of Mel bands to generate + + fmin : float >= 0 [scalar] + lowest frequency (in Hz) + + fmax : float >= 0 [scalar] + highest frequency (in Hz). + If `None`, use ``fmax = sr / 2.0`` + + htk : bool [scalar] + use HTK formula instead of Slaney + + norm : {None, 'slaney', or number} [scalar] + If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). + + If numeric, use `librosa.util.normalize` to normalize each filter to unit l_p norm. + See `librosa.util.normalize` for a full description of supported norm values + (including `+-np.inf`). + + Otherwise, leave all the triangles aiming for a peak value of 1.0 + + dtype : np.dtype + The data type of the output basis.
+ By default, uses 32-bit (single-precision) floating point. + + Returns + ------- + M : np.ndarray [shape=(n_mels, 1 + n_fft/2)] + Mel transform matrix + + See Also + -------- + librosa.util.normalize + + Notes + ----- + This function caches at level 10. + + Examples + -------- + >>> melfb = librosa.filters.mel(sr=22050, n_fft=2048) + >>> melfb + array([[ 0. , 0.016, ..., 0. , 0. ], + [ 0. , 0. , ..., 0. , 0. ], + ..., + [ 0. , 0. , ..., 0. , 0. ], + [ 0. , 0. , ..., 0. , 0. ]]) + + Clip the maximum frequency to 8KHz + + >>> librosa.filters.mel(sr=22050, n_fft=2048, fmax=8000) + array([[ 0. , 0.02, ..., 0. , 0. ], + [ 0. , 0. , ..., 0. , 0. ], + ..., + [ 0. , 0. , ..., 0. , 0. ], + [ 0. , 0. , ..., 0. , 0. ]]) + + >>> import matplotlib.pyplot as plt + >>> fig, ax = plt.subplots() + >>> img = librosa.display.specshow(melfb, x_axis='linear', ax=ax) + >>> ax.set(ylabel='Mel filter', title='Mel filter bank') + >>> fig.colorbar(img, ax=ax) + """ + + if fmax is None: + fmax = float(sr) / 2 + + # Initialize the weights + n_mels = int(n_mels) + weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk) + + fdiff = np.diff(mel_f) + ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. 
then intersect them with each other and zero + weights[i] = np.maximum(0, np.minimum(lower, upper)) + + if isinstance(norm, str): + if norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels]) + weights *= enorm[:, np.newaxis] + else: + raise ValueError(f"Unsupported norm={norm}") + else: + weights = normalize(weights, norm=norm, axis=-1) + + # Only check weights if f_mel[0] is positive + if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)): + # This means we have an empty channel somewhere + warnings.warn( + "Empty filters detected in mel frequency basis. " + "Some channels will produce empty responses. " + "Try increasing your sampling rate (and fmax) or " + "reducing n_mels.", + stacklevel=2, + ) + + return weights + + +def pad_center(data: np.ndarray, *, size: int, axis: int = -1, **kwargs: Any) -> np.ndarray: + """Pad an array to a target length along a target axis. + + This differs from `np.pad` by centering the data prior to padding, + analogous to `str.center` + + Examples + -------- + >>> # Generate a vector + >>> data = np.ones(5) + >>> librosa.util.pad_center(data, size=10, mode='constant') + array([ 0., 0., 1., 1., 1., 1., 1., 0., 0., 0.]) + + >>> # Pad a matrix along its first dimension + >>> data = np.ones((3, 5)) + >>> librosa.util.pad_center(data, size=7, axis=0) + array([[ 0., 0., 0., 0., 0.], + [ 0., 0., 0., 0., 0.], + [ 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1.], + [ 1., 1., 1., 1., 1.], + [ 0., 0., 0., 0., 0.], + [ 0., 0., 0., 0., 0.]]) + >>> # Or its second dimension + >>> librosa.util.pad_center(data, size=7, axis=1) + array([[ 0., 1., 1., 1., 1., 1., 0.], + [ 0., 1., 1., 1., 1., 1., 0.], + [ 0., 1., 1., 1., 1., 1., 0.]]) + + Parameters + ---------- + data : np.ndarray + Vector to be padded and centered + size : int >= len(data) [scalar] + Length to pad ``data`` + axis : int + Axis along which to pad and center the data + **kwargs : additional 
keyword arguments + arguments passed to `np.pad` + + Returns + ------- + data_padded : np.ndarray + ``data`` centered and padded to length ``size`` along the + specified axis + + Raises + ------ + ValueError + If ``size < data.shape[axis]`` + + See Also + -------- + numpy.pad + """ + + kwargs.setdefault("mode", "constant") + + n = data.shape[axis] + + lpad = int((size - n) // 2) + + lengths = [(0, 0)] * data.ndim + lengths[axis] = (lpad, int(size - n - lpad)) + + if lpad < 0: + raise ValueError(f"Target size ({size:d}) must be at least input size ({n:d})") + + return np.pad(data, lengths, **kwargs) + + +def __window_ss_fill(x, win_sq, n_frames, hop_length): # pragma: no cover + """Helper function for window sum-square calculation.""" + + n = len(x) + n_fft = len(win_sq) + for i in range(n_frames): + sample = i * hop_length + x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] + + +def get_window( + window: _WindowSpec, + Nx: int, + *, + fftbins: Optional[bool] = True, +) -> np.ndarray: + """Compute a window function. + + This is a wrapper for `scipy.signal.get_window` that additionally + supports callable or pre-computed windows. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + The window specification: + + - If string, it's the name of the window function (e.g., `'hann'`) + - If tuple, it's the name of the window function and any parameters + (e.g., `('kaiser', 4.0)`) + - If numeric, it is treated as the beta parameter of the `'kaiser'` + window, as in `scipy.signal.get_window`. + - If callable, it's a function that accepts one integer argument + (the window length) + - If list-like, it's a pre-computed window of the correct length `Nx` + + Nx : int > 0 + The length of the window + + fftbins : bool, optional + If True (default), create a periodic window for use with FFT + If False, create a symmetric window for filter design applications. 
+ + Returns + ------- + get_window : np.ndarray + A window of length `Nx` and type `window` + + See Also + -------- + scipy.signal.get_window + + Notes + ----- + This function caches at level 10. + + Raises + ------ + ValueError + If `window` is supplied as a vector of length != `n_fft`, + or is otherwise mis-specified. + """ + if callable(window): + return window(Nx) + + elif isinstance(window, (str, tuple)) or np.isscalar(window): + # TODO: if we add custom window functions in librosa, call them here + + try: + import scipy + except ImportError: + raise NotImplementedError("Scipy not included in Realbook's vendored Librosa code.") + + win: np.ndarray = scipy.signal.get_window(window, Nx, fftbins=fftbins) + return win + + elif isinstance(window, (np.ndarray, list)): + if len(window) == Nx: + return np.asarray(window) + + raise ValueError(f"Window size mismatch: {len(window):d} != {Nx:d}") + else: + raise ValueError(f"Invalid window specification: {window!r}") + + +def window_sumsquare( + *, + window: _WindowSpec, + n_frames: int, + hop_length: int = 512, + win_length: Optional[int] = None, + n_fft: int = 2048, + dtype: DTypeLike = np.float32, + norm: Optional[float] = None, +) -> np.ndarray: + """Compute the sum-square envelope of a window function at a given hop length. + + This is used to estimate modulation effects induced by windowing observations + in short-time Fourier transforms. + + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + n_frames : int > 0 + The number of analysis frames + hop_length : int > 0 + The number of samples to advance between frames + win_length : [optional] + The length of the window function. By default, this matches ``n_fft``. + n_fft : int > 0 + The length of each analysis frame. + dtype : np.dtype + The data type of the output + norm : {np.inf, -np.inf, 0, float > 0, None} + Normalization mode used in window construction. 
+ Note that this does not affect the squaring operation. + + Returns + ------- + wss : np.ndarray, shape=``(n_fft + hop_length * (n_frames - 1))`` + The sum-squared envelope of the window function + + Examples + -------- + For a fixed frame length (2048), compare modulation effects for a Hann window + at different hop lengths: + + >>> n_frames = 50 + >>> wss_256 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=256) + >>> wss_512 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=512) + >>> wss_1024 = librosa.filters.window_sumsquare(window='hann', n_frames=n_frames, hop_length=1024) + + >>> import matplotlib.pyplot as plt + >>> fig, ax = plt.subplots(nrows=3, sharey=True) + >>> ax[0].plot(wss_256) + >>> ax[0].set(title='hop_length=256') + >>> ax[1].plot(wss_512) + >>> ax[1].set(title='hop_length=512') + >>> ax[2].plot(wss_1024) + >>> ax[2].set(title='hop_length=1024') + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length) + win_sq = normalize(win_sq, norm=norm) ** 2 + win_sq = pad_center(win_sq, size=n_fft) + + # Fill the envelope + __window_ss_fill(x, win_sq, n_frames, hop_length) + + return x diff --git a/setup.cfg b/setup.cfg index 660e65c..67ad185 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.1 +current_version = 1.0.2 commit = True tag = True @@ -35,20 +35,20 @@ include_package_data = True install_requires = tensorflow>=2.4; sys_platform != 'darwin' or platform.machine != 'arm64' tensorflow-macos>=2.4; sys_platform == 'darwin' and platform.machine == 'arm64' - tensorboard>=2.4 - librosa>=0.9,<0.10 + tensorboard types-protobuf + numpy + typing_extensions [options.extras_require] dev = realbook[tensorboard,test] bumpversion>=0.5.3 - ipython - ipdb tensorboard = matplotlib psutil nvsmi + 
librosa>=0.9,<0.10 test = coverage>=5.0.2 pytest>=7.1.1 @@ -57,8 +57,13 @@ test = tox torch nnaudio + numpy==1.21.6 + librosa>=0.9,<0.10 + tensorflow>=2.4,<2.11; sys_platform != 'darwin' or platform.machine != 'arm64' + tensorflow-macos>=2.4,<2.11; sys_platform == 'darwin' and platform.machine == 'arm64' [bumpversion:file:realbook/__init__.py] [bdist_wheel] universal = 1 + diff --git a/tests/callbacks/test_spectrogram_visualization.py b/tests/callbacks/test_spectrogram_visualization.py index 7762978..62d514c 100644 --- a/tests/callbacks/test_spectrogram_visualization.py +++ b/tests/callbacks/test_spectrogram_visualization.py @@ -17,11 +17,19 @@ from typing import Any +import platform import pytest import numpy as np import tensorflow as tf -from realbook.callbacks.spectrogram_visualization import SpectrogramVisualizationCallback +try: + from realbook.callbacks.spectrogram_visualization import SpectrogramVisualizationCallback +except ImportError as e: + if "numpy.core.multiarray failed to import" in str(e) and platform.system() == "Windows": + SpectrogramVisualizationCallback = None # type: ignore + else: + raise + from realbook.layers.signal import Spectrogram @@ -52,6 +60,10 @@ def flush(self) -> None: TEST_AUDIO = np.linspace(0, 1, num=DEFAULT_SAMPLE_RATE * 10) +@pytest.mark.skipif( + SpectrogramVisualizationCallback is None, + reason="SpectrogramVisualizationCallback import fails on this platform", +) def test_spectrogram_visualization_callback() -> None: fake_data = tf.data.Dataset.zip( ( @@ -80,6 +92,10 @@ def test_spectrogram_visualization_callback() -> None: assert True +@pytest.mark.skipif( + SpectrogramVisualizationCallback is None, + reason="SpectrogramVisualizationCallback import fails on this platform", +) def test_callback_fails_on_unbatched_input() -> None: fake_data = tf.data.Dataset.zip( ( @@ -110,6 +126,10 @@ def test_callback_fails_on_unbatched_input() -> None: assert "shape" in str(excinfo.value) +@pytest.mark.skipif( + 
SpectrogramVisualizationCallback is None, + reason="SpectrogramVisualizationCallback import fails on this platform", +) def test_callback_logs_but_doesnt_throw_by_default(caplog: pytest.LogCaptureFixture) -> None: fake_data = tf.data.Dataset.zip( ( @@ -133,6 +153,10 @@ def test_callback_logs_but_doesnt_throw_by_default(caplog: pytest.LogCaptureFixt assert "shape" in caplog.text +@pytest.mark.skipif( + SpectrogramVisualizationCallback is None, + reason="SpectrogramVisualizationCallback import fails on this platform", +) def test_fails_on_no_image_like_layers() -> None: fake_data = tf.data.Dataset.zip( ( @@ -162,6 +186,10 @@ def test_fails_on_no_image_like_layers() -> None: assert "spectrogram" in str(excinfo.value) +@pytest.mark.skipif( + SpectrogramVisualizationCallback is None, + reason="SpectrogramVisualizationCallback import fails on this platform", +) def test_flexible_with_input_shapes() -> None: fake_data = tf.data.Dataset.zip( ( @@ -192,6 +220,10 @@ def test_flexible_with_input_shapes() -> None: assert True +@pytest.mark.skipif( + SpectrogramVisualizationCallback is None, + reason="SpectrogramVisualizationCallback import fails on this platform", +) def test_keras_functional_api_with_tfop_lambda() -> None: fake_data = tf.data.Dataset.zip( ( diff --git a/tests/layers/test_nnaudio.py b/tests/layers/test_nnaudio.py index 9726d01..8e0ce86 100644 --- a/tests/layers/test_nnaudio.py +++ b/tests/layers/test_nnaudio.py @@ -19,50 +19,27 @@ import torch import numpy as np import pytest -import librosa -import librosa.display +import platform -from typing import List, Tuple, Union +try: + import librosa + from realbook.layers import nnaudio as our_nnaudio + from nnAudio.Spectrogram import CQT2010v2 +except ImportError as e: + if "numpy.core.multiarray failed to import" in str(e) and platform.system() == "Windows": + librosa = None + our_nnaudio = None # type: ignore + CQT2010v2 = None + else: + raise -from realbook.layers import nnaudio as our_nnaudio -from 
nnAudio.Spectrogram import CQT2010v2 - -TEST_SAMPLE_RATE = 22050 +from typing import Tuple, Union -# Test using this model directly, as well as wrapping it in a Lambda layer. -def get_parameterized_model_variants( - match_torch_exactly_values: Tuple[bool, bool] = (True, False) -) -> List[tf.keras.layers.Layer]: - possible_models = [ - our_nnaudio.CQT(match_torch_exactly=v, trainable=trainable) - for v in match_torch_exactly_values - for trainable in (True, False) - ] - - return [ - item - for models in [ - [tf.keras.Sequential([tf.keras.layers.InputLayer((TEST_SAMPLE_RATE,)), model])] - + ( - [ - tf.keras.Sequential( - [ - tf.keras.layers.InputLayer((TEST_SAMPLE_RATE,)), - tf.keras.layers.Lambda(lambda x: model(x)), - ] - ) - ] - # Using a layer with trainable weights inside a Lambda layer isn't supported. - if not model.trainable - else [] - ) - for model in possible_models - ] - for item in models - ] +TEST_SAMPLE_RATE = 22050 +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "match_torch_exactly,threshold,trainable", ( @@ -83,12 +60,14 @@ def test_cqt(match_torch_exactly: bool, threshold: float, trainable: bool) -> No def build_layer( - layer: tf.keras.layers.Layer, input_shape: Union[Tuple[int], Tuple[int, int]] = (1, TEST_SAMPLE_RATE) + layer: tf.keras.layers.Layer, + input_shape: Union[Tuple[int], Tuple[int, int]] = (1, TEST_SAMPLE_RATE), ) -> tf.keras.layers.Layer: layer.build(input_shape) return layer +@pytest.mark.skipif(our_nnaudio is None, reason="nnaudio failed to import on this platform.") def test_cqt_trainable_weights() -> None: assert not build_layer(our_nnaudio.CQT(trainable=False)).trainable assert not build_layer(our_nnaudio.CQT(trainable=False)).trainable_weights @@ -98,6 +77,8 @@ def test_cqt_trainable_weights() -> None: assert len(build_layer(our_nnaudio.CQT(trainable=True)).trainable_weights) == 2 +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this 
platform.") +@pytest.mark.skipif(our_nnaudio is None, reason="nnaudio failed to import on this platform.") @pytest.mark.parametrize("train", (True, False)) def test_cqt_trainable_layers_change_on_training(train: bool) -> None: # Make a model that's trainable, then train it and ensure the weights change from the default. diff --git a/tests/layers/test_signal.py b/tests/layers/test_signal.py index b1f7dfb..c9c8419 100644 --- a/tests/layers/test_signal.py +++ b/tests/layers/test_signal.py @@ -17,12 +17,20 @@ from typing import Optional, Union, List -import librosa +import platform import numpy as np import pytest import tensorflow as tf -from librosa.core.spectrum import _spectrogram -from librosa.feature.spectral import melspectrogram + +try: + import librosa + from librosa.core.spectrum import _spectrogram + from librosa.feature.spectral import melspectrogram +except ImportError as e: + if "numpy.core.multiarray failed to import" in str(e) and platform.system() == "Windows": + librosa = None + else: + raise from realbook.layers import signal @@ -39,6 +47,7 @@ def test_stft_channels_should_raise() -> None: )(x) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "center,input_length,fft_length,hop_length,win_length", [ @@ -68,6 +77,7 @@ def test_stft(center: bool, input_length: int, fft_length: int, hop_length: int, assert np.allclose(librosa_stft.imag, rgp_stft.imag, atol=1e-3, rtol=0) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") def test_stft_batch() -> None: x = np.random.normal(0, 1, 1024) librosa_stft = librosa.stft( @@ -105,6 +115,7 @@ def test_istft_channels_should_raise() -> None: )(tf.expand_dims(x_stft, -1)) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "center,input_length,fft_length,hop_length,win_length", [ @@ -185,6 +196,7 @@ def test_istft_batch() -> None: assert 
np.allclose(x, np.squeeze(x_istft), atol=1e-3, rtol=0) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") def test_spectrogram() -> None: x = np.random.normal(0, 1, 1024).astype(np.float32) librosa_spec, _ = _spectrogram( @@ -205,6 +217,7 @@ def test_spectrogram() -> None: assert np.allclose(librosa_spec, rgp_spec, atol=1e-2, rtol=0) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "center,normalization,fmin,htk", [ @@ -248,6 +261,7 @@ def test_mel_spectrogram( assert np.allclose(librosa_spec, rgp_spec, atol=1e-2, rtol=0) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "input_spec", [ @@ -287,6 +301,7 @@ def test_magnitude(input_spec: Optional[List[np.complex64]]) -> None: assert np.allclose(np_magnitude, layer_magnitude, atol=1e-3, rtol=0) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "input_spec", [ @@ -326,6 +341,7 @@ def test_phase(input_spec: Optional[List[np.complex64]]) -> None: assert np.allclose(np_phase, layer_phase, atol=1e-3, rtol=0) +@pytest.mark.skipif(librosa is None, reason="Librosa failed to import on this platform.") @pytest.mark.parametrize( "ref,amin,top_db", [