/
gmm.py
247 lines (209 loc) · 9.62 KB
/
gmm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
import numpy as np
from nnmnkwii.paramgen import mlpg
from scipy import linalg
from sklearn.mixture import GaussianMixture
# ref: https://github.com/scikit-learn/scikit-learn/blob/0.24.1/sklearn/mixture/
def _compute_precision_cholesky(covariances, covariance_type):
estimate_precision_error_message = (
"Fitting the mixture model failed because some components have "
"ill-defined empirical covariance (for instance caused by singleton "
"or collapsed samples). Try to decrease the number of components, "
"or increase reg_covar."
)
if covariance_type == "full":
n_components, n_features, _ = covariances.shape
precisions_chol = np.empty((n_components, n_features, n_features))
for k, covariance in enumerate(covariances):
try:
cov_chol = linalg.cholesky(covariance, lower=True)
except linalg.LinAlgError:
raise ValueError(estimate_precision_error_message)
precisions_chol[k] = linalg.solve_triangular(
cov_chol, np.eye(n_features), lower=True
).T
elif covariance_type == "tied":
_, n_features = covariances.shape
try:
cov_chol = linalg.cholesky(covariances, lower=True)
except linalg.LinAlgError:
raise ValueError(estimate_precision_error_message)
precisions_chol = linalg.solve_triangular(
cov_chol, np.eye(n_features), lower=True
).T
else:
if np.any(np.less_equal(covariances, 0.0)):
raise ValueError(estimate_precision_error_message)
precisions_chol = 1.0 / np.sqrt(covariances)
return precisions_chol
# TODO: this can be refactored to be more flexible
# e.g. take `swap` and `diff` out of the class
class MLPGBase(object):
    """Base class for GMM-based feature conversion.

    Splits a joint source/target GMM into per-speaker statistics and provides
    frame-wise minimum mean squared error mapping from source to target.
    """

    def __init__(self, gmm, swap=False, diff=False):
        assert gmm.covariance_type == "full"
        # Joint features are [source; target]; each half has dimension D
        # (static + delta).
        D = gmm.means_.shape[1] // 2
        self.num_mixtures = gmm.means_.shape[0]
        self.weights = gmm.weights_

        # Partition the joint means/covariances into source (X) and
        # target (Y) blocks.
        self.src_means = gmm.means_[:, :D]
        self.tgt_means = gmm.means_[:, D:]
        self.covarXX = gmm.covariances_[:, :D, :D]
        self.covarXY = gmm.covariances_[:, :D, D:]
        self.covarYX = gmm.covariances_[:, D:, :D]
        self.covarYY = gmm.covariances_[:, D:, D:]

        if diff:
            # Re-parameterize as a DIFFGMM that models (y - x).
            # NOTE: covarYY must use the pre-update covarXY/covarYX, and
            # covarYX is derived from the *updated* covarXY — keep this order.
            self.tgt_means = self.tgt_means - self.src_means
            self.covarYY = (
                self.covarXX + self.covarYY - self.covarXY - self.covarYX
            )
            self.covarXY = self.covarXY - self.covarXX
            self.covarYX = self.covarXY.transpose(0, 2, 1)

        if swap:
            # Exchange the roles of source and target parameters.
            self.src_means, self.tgt_means = self.tgt_means, self.src_means
            self.covarXX, self.covarYY = self.covarYY, self.covarXX
            self.covarXY, self.covarYX = self.covarYX, self.covarXY

        # Marginal p(x), used at mapping time to compute the mixture
        # posterior for a given source spectral feature.
        px = GaussianMixture(
            n_components=self.num_mixtures, covariance_type="full"
        )
        px.means_ = self.src_means
        px.covariances_ = self.covarXX
        px.weights_ = self.weights
        px.precisions_cholesky_ = _compute_precision_cholesky(
            px.covariances_, "full"
        )
        self.px = px

    def transform(self, src):
        """Convert a single frame (1-D) or a sequence of frames (2-D)."""
        if src.ndim != 2:
            return self._transform_frame(src)
        tgt = np.zeros_like(src)
        for frame_idx, frame in enumerate(src):
            converted = self._transform_frame(frame)
            tgt[frame_idx][: len(converted)] = converted
        return tgt

    def _transform_frame(self, src):
        """Map a source spectral feature x to a target spectral feature y
        minimizing the mean least squared error, i.e. return E[p(y|x)].

        Args:
            src (array): shape (`order of spectral feature`,) source
                speaker's spectral feature to be transformed.

        Returns:
            array: converted spectral feature.
        """
        D = len(src)
        # Eq.(11): per-mixture conditional means
        # E_m = mu_y + Sigma_YX Sigma_XX^{-1} (x - mu_x)
        E = np.zeros((self.num_mixtures, D))
        for m in range(self.num_mixtures):
            whitened = np.linalg.solve(
                self.covarXX[m], src - self.src_means[m]
            )
            E[m] = self.tgt_means[m] + self.covarYX[m].dot(whitened)
        # Eq.(9): posterior p(m|x)
        posterior = self.px.predict_proba(np.atleast_2d(src))
        # Eq.(13): conditional mean E[p(y|x)]
        return posterior.dot(E).flatten()
class MLPG(MLPGBase):
    """Maximum likelihood parameter generation (MLPG) for GMM-based voice
    conversion [1]_.

    Notes:
        - Source speaker's feature: ``X = {x_t}, 0 <= t < T``
        - Target speaker's feature: ``Y = {y_t}, 0 <= t < T``

        where T is the number of time frames. See paper [1]_ for details.

    The code was adapted from https://gist.github.com/r9y9/88bda659c97f46f42525.

    Args:
        gmm (sklearn.mixture.GaussianMixture): Gaussian mixture model of the
            joint source and target features.
        windows (list): List of windows. See :func:`nnmnkwii.functions.mlpg`
            for details.
        swap (bool): If True, source -> target, otherwise target -> source.
        diff (bool): Convert GMM -> DIFFGMM if True.

    Attributes:
        num_mixtures (int): Number of Gaussian mixtures.
        weights (array): shape (`num_mixtures`,), mixture weights.
        src_means (array): shape (`num_mixtures`, `order of spectral feature`)
            GMM means for the source speaker.
        tgt_means (array): shape (`num_mixtures`, `order of spectral feature`)
            GMM means for the target speaker.
        covarXX (array): shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`) covariance of the source features.
        covarXY (array): same shape; cross-covariance of source and target.
        covarYX (array): same shape; cross-covariance of target and source.
        covarYY (array): same shape; covariance of the target features.
        px (sklearn.mixture.GaussianMixture): Gaussian mixture model of the
            source speaker's features.

    Examples:
        >>> from sklearn.mixture import GaussianMixture
        >>> from nnmnkwii.baseline.gmm import MLPG
        >>> import numpy as np
        >>> static_dim, T = 24, 10
        >>> windows = [
        ...     (0, 0, np.array([1.0])),
        ...     (1, 1, np.array([-0.5, 0.0, 0.5])),
        ...     (1, 1, np.array([1.0, -2.0, 1.0])),
        ... ]
        >>> src = np.random.rand(T, static_dim * len(windows))
        >>> tgt = np.random.rand(T, static_dim * len(windows))
        >>> XY = np.concatenate((src, tgt), axis=-1) # pseudo parallel data
        >>> gmm = GaussianMixture(n_components=4)
        >>> _ = gmm.fit(XY)
        >>> paramgen = MLPG(gmm, windows=windows)
        >>> generated = paramgen.transform(src)
        >>> assert generated.shape == (T, static_dim)

    See also:
        :class:`nnmnkwii.preprocessing.alignment.IterativeDTWAligner`.

    .. [1] [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
      of Spectral Parameter Trajectory.
    """

    def __init__(self, gmm, windows=None, swap=False, diff=False):
        super(MLPG, self).__init__(gmm, swap, diff)
        if windows is None:
            # Default: static + a single delta window.
            windows = [
                (0, 0, np.array([1.0])),
                (1, 1, np.array([-0.5, 0.0, 0.5])),
            ]
        self.windows = windows
        self.static_dim = gmm.means_.shape[-1] // 2 // len(windows)

    def transform(self, src):
        """Map a source feature sequence x to a target feature sequence y so
        as to maximize the likelihood of y given x.

        Args:
            src (array): shape (`the number of frames`, `the order of spectral
                feature`), sequence of source speaker's spectral features to
                be transformed.

        Returns:
            array: sequence of transformed features.
        """
        T, feature_dim = src.shape[0], src.shape[1]
        # Static-only input: fall back to plain frame-wise conversion.
        if feature_dim == self.static_dim:
            return super(MLPG, self).transform(src)

        # Suboptimum mixture sequence (Eq. 37): one mixture index per frame.
        optimum_mix = self.px.predict(src)

        # Per-frame conditional means E (Eq. 40 / Eq. 22) and variances D
        # (Eq. 23). Variances are approximated by their diagonals so MLPG
        # can run efficiently in a dimension-wise manner.
        E = np.empty((T, feature_dim))
        D = np.empty((T, feature_dim))
        for t, m in enumerate(optimum_mix):
            # Eq. (22)
            whitened = np.linalg.solve(
                self.covarXX[m], src[t] - self.src_means[m]
            )
            E[t] = self.tgt_means[m] + np.dot(self.covarYX[m], whitened)
            # Eq. (23), approximating covariances as diagonals
            D[t] = np.diag(self.covarYY[m]) - np.diag(
                self.covarYX[m]
            ) / np.diag(self.covarXX[m]) * np.diag(self.covarXY[m])

        # With per-frame means and variances, run MLPG over the sequence.
        return mlpg(E, D, self.windows)