/
_entropy.py
340 lines (274 loc) · 11.4 KB
/
_entropy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 2 09:06:05 2021
@author: matth
"""
from __future__ import annotations
import math
import numpy as np
from scipy import special
from typing import Optional, Union
__all__ = ['entropy', 'differential_entropy']
def entropy(pk: np.typing.ArrayLike,
qk: Optional[np.typing.ArrayLike] = None,
base: Optional[float] = None,
axis: int = 0
) -> Union[np.number, np.ndarray]:
"""Calculate the entropy of a distribution for given probability values.
If only probabilities `pk` are given, the entropy is calculated as
``S = -sum(pk * log(pk), axis=axis)``.
If `qk` is not None, then compute the Kullback-Leibler divergence
``S = sum(pk * log(pk / qk), axis=axis)``.
This routine will normalize `pk` and `qk` if they don't sum to 1.
Parameters
----------
pk : array_like
Defines the (discrete) distribution. Along each axis-slice of ``pk``,
element ``i`` is the (possibly unnormalized) probability of event
``i``.
qk : array_like, optional
Sequence against which the relative entropy is computed. Should be in
the same format as `pk`.
base : float, optional
The logarithmic base to use, defaults to ``e`` (natural logarithm).
axis : int, optional
The axis along which the entropy is calculated. Default is 0.
Returns
-------
S : {float, array_like}
The calculated entropy.
Examples
--------
>>> from scipy.stats import entropy
Bernoulli trial with different p.
The outcome of a fair coin is the most uncertain:
>>> entropy([1/2, 1/2], base=2)
1.0
The outcome of a biased coin is less uncertain:
>>> entropy([9/10, 1/10], base=2)
0.46899559358928117
Relative entropy:
>>> entropy([1/2, 1/2], qk=[9/10, 1/10])
0.5108256237659907
"""
if base is not None and base <= 0:
raise ValueError("`base` must be a positive number or `None`.")
pk = np.asarray(pk)
pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
if qk is None:
vec = special.entr(pk)
else:
qk = np.asarray(qk)
pk, qk = np.broadcast_arrays(pk, qk)
qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)
vec = special.rel_entr(pk, qk)
S = np.sum(vec, axis=axis)
if base is not None:
S /= np.log(base)
return S
def differential_entropy(
values: np.typing.ArrayLike,
*,
window_length: Optional[int] = None,
base: Optional[float] = None,
axis: int = 0,
method: str = "auto",
) -> Union[np.number, np.ndarray]:
r"""Given a sample of a distribution, estimate the differential entropy.
Several estimation methods are available using the `method` parameter. By
default, a method is selected based the size of the sample.
Parameters
----------
values : sequence
Sample from a continuous distribution.
window_length : int, optional
Window length for computing Vasicek estimate. Must be an integer
between 1 and half of the sample size. If ``None`` (the default), it
uses the heuristic value
.. math::
\left \lfloor \sqrt{n} + 0.5 \right \rfloor
where :math:`n` is the sample size. This heuristic was originally
proposed in [2]_ and has become common in the literature.
base : float, optional
The logarithmic base to use, defaults to ``e`` (natural logarithm).
axis : int, optional
The axis along which the differential entropy is calculated.
Default is 0.
method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
The method used to estimate the differential entropy from the sample.
Default is ``'auto'``. See Notes for more information.
Returns
-------
entropy : float
The calculated differential entropy.
Notes
-----
This function will converge to the true differential entropy in the limit
.. math::
n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
The optimal choice of ``window_length`` for a given sample size depends on
the (unknown) distribution. Typically, the smoother the density of the
distribution, the larger the optimal value of ``window_length`` [1]_.
The following options are available for the `method` parameter.
* ``'vasicek'`` uses the estimator presented in [1]_. This is
one of the first and most influential estimators of differential entropy.
* ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
is not only consistent but, under some conditions, asymptotically normal.
* ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
in simulation to have smaller bias and mean squared error than
the Vasicek estimator.
* ``'correa'`` uses the estimator presented in [5]_ based on local linear
regression. In a simulation study, it had consistently smaller mean
square error than the Vasiceck estimator, but it is more expensive to
compute.
* ``'auto'`` selects the method automatically (default). Currently,
this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
samples, but this behavior is subject to change in future versions.
All estimators are implemented as described in [6]_.
References
----------
.. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
Journal of the Royal Statistical Society:
Series B (Methodological), 38(1), 54-59.
.. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
goodness-of-fit test for exponentiality. Communications in
Statistics-Theory and Methods, 28(5), 1183-1202.
.. [3] Van Es, B. (1992). Estimating functionals related to a density by a
class of statistics based on spacings. Scandinavian Journal of
Statistics, 61-72.
.. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
.. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
in Statistics-Theory and Methods, 24(10), 2439-2449.
.. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
Annals of Data Science, 2(2), 231-241.
https://link.springer.com/article/10.1007/s40745-015-0045-9
Examples
--------
>>> from scipy.stats import differential_entropy, norm
Entropy of a standard normal distribution:
>>> rng = np.random.default_rng()
>>> values = rng.standard_normal(100)
>>> differential_entropy(values)
1.3407817436640392
Compare with the true entropy:
>>> float(norm.entropy())
1.4189385332046727
For several sample sizes between 5 and 1000, compare the accuracy of
the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
compare the root mean squared error (over 1000 trials) between the estimate
and the true differential entropy of the distribution.
>>> from scipy import stats
>>> import matplotlib.pyplot as plt
>>>
>>>
>>> def rmse(res, expected):
... '''Root mean squared error'''
... return np.sqrt(np.mean((res - expected)**2))
>>>
>>>
>>> a, b = np.log10(5), np.log10(1000)
>>> ns = np.round(np.logspace(a, b, 10)).astype(int)
>>> reps = 1000 # number of repetitions for each sample size
>>> expected = stats.expon.entropy()
>>>
>>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
>>> for method in method_errors:
... for n in ns:
... rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
... res = stats.differential_entropy(rvs, method=method, axis=-1)
... error = rmse(res, expected)
... method_errors[method].append(error)
>>>
>>> for method, errors in method_errors.items():
... plt.loglog(ns, errors, label=method)
>>>
>>> plt.legend()
>>> plt.xlabel('sample size')
>>> plt.ylabel('RMSE (1000 trials)')
>>> plt.title('Entropy Estimator Error (Exponential Distribution)')
"""
values = np.asarray(values)
values = np.moveaxis(values, axis, -1)
n = values.shape[-1] # number of observations
if window_length is None:
window_length = math.floor(math.sqrt(n) + 0.5)
if not 2 <= 2 * window_length < n:
raise ValueError(
f"Window length ({window_length}) must be positive and less "
f"than half the sample size ({n}).",
)
if base is not None and base <= 0:
raise ValueError("`base` must be a positive number or `None`.")
sorted_data = np.sort(values, axis=-1)
methods = {"vasicek": _vasicek_entropy,
"van es": _van_es_entropy,
"correa": _correa_entropy,
"ebrahimi": _ebrahimi_entropy,
"auto": _vasicek_entropy}
method = method.lower()
if method not in methods:
message = f"`method` must be one of {set(methods)}"
raise ValueError(message)
if method == "auto":
if n <= 10:
method = 'van es'
elif n <= 1000:
method = 'ebrahimi'
else:
method = 'vasicek'
res = methods[method](sorted_data, window_length)
if base is not None:
res /= np.log(base)
return res
def _pad_along_last_axis(X, m):
"""Pad the data for computing the rolling window difference."""
# scales a bit better than method in _vasicek_like_entropy
shape = np.array(X.shape)
shape[-1] = m
Xl = np.broadcast_to(X[..., [0]], shape) # [0] vs 0 to maintain shape
Xr = np.broadcast_to(X[..., [-1]], shape)
return np.concatenate((Xl, X, Xr), axis=-1)
def _vasicek_entropy(X, m):
"""Compute the Vasicek estimator as described in [6] Eq. 1.3."""
n = X.shape[-1]
X = _pad_along_last_axis(X, m)
differences = X[..., 2 * m:] - X[..., : -2 * m:]
logs = np.log(n/(2*m) * differences)
return np.mean(logs, axis=-1)
def _van_es_entropy(X, m):
"""Compute the van Es estimator as described in [6]."""
# No equation number, but referred to as HVE_mn.
# Typo: there should be a log within the summation.
n = X.shape[-1]
difference = X[..., m:] - X[..., :-m]
term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
k = np.arange(m, n+1)
return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)
def _ebrahimi_entropy(X, m):
"""Compute the Ebrahimi estimator as described in [6]."""
# No equation number, but referred to as HE_mn
n = X.shape[-1]
X = _pad_along_last_axis(X, m)
differences = X[..., 2 * m:] - X[..., : -2 * m:]
i = np.arange(1, n+1).astype(float)
ci = np.ones_like(i)*2
ci[i <= m] = 1 + (i[i <= m] - 1)/m
ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m
logs = np.log(n * differences / (ci * m))
return np.mean(logs, axis=-1)
def _correa_entropy(X, m):
"""Compute the Correa estimator as described in [6]."""
# No equation number, but referred to as HC_mn
n = X.shape[-1]
X = _pad_along_last_axis(X, m)
i = np.arange(1, n+1)
dj = np.arange(-m, m+1)[:, None]
j = i + dj
j0 = j + m - 1 # 0-indexed version of j
Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
difference = X[..., j0] - Xibar
num = np.sum(difference*dj, axis=-2) # dj is d-i
den = n*np.sum(difference**2, axis=-2)
return -np.mean(np.log(num/den), axis=-1)