-
-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Statistics.h
517 lines (461 loc) · 17 KB
/
Statistics.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* Written (W) 2011-2016 Heiko Strathmann
* Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
*/
#ifndef __STATISTICS_H_
#define __STATISTICS_H_
#include <shogun/lib/config.h>
#include <shogun/base/SGObject.h>
#include <shogun/lib/SGVector.h>
#include <shogun/io/SGIO.h>
namespace shogun
{
template<class T> class SGMatrix;
template<class T> class SGSparseMatrix;
/** @brief Class that contains certain functions related to statistics, such as
* probability/cumulative distribution functions, different statistics, etc.
*/
class CStatistics: public CSGObject
{
public:
/** Calculates mean of given values. Given \f$\{x_1, ..., x_m\}\f$, this
* is \f$\frac{1}{m}\sum_{i=1}^m x_i\f$
*
* @param vec vector of values
* @return mean of given values
*/
template<class T>
static floatmax_t mean(SGVector<T> vec)
{
floatmax_t sum = 0;
for ( index_t i = 0 ; i < vec.vlen ; ++i )
sum += vec[i];
return sum/vec.vlen;
}
/** Calculates median of given values. The median is the value that one
* gets when the input vector is sorted and then selects the middle value.
*
* @param values vector of values
* @param modify if false, array is modified while median is computed
* (Using QuickSelect).
* If true, median is computed without modifications, which is slower.
* There are two methods to choose from.
* @param in_place if set false, the vector is copied and then computed
* using QuickSelect. If set true, median is computed in-place using
* Torben method.
* @return median of given values
*/
static float64_t median(SGVector<float64_t> values, bool modify=false,
bool in_place=false);
/** Calculates median of given values. Matrix is seen as a long vector for
* this. The median is the value that one
* gets when the input vector is sorted and then selects the middle value.
*
* This method is just a wrapper for median(). See this method for license
* of QuickSelect and Torben.
*
* @param values vector of values
* @param modify if false, array is modified while median is computed
* (Using QuickSelect).
* If true, median is computed without modifications, which is slower.
* There are two methods to choose from.
* @param in_place if set false, the vector is copied and then computed
* using QuickSelect. If set true, median is computed in-place using
* Torben method.
* @return median of given values
*/
static float64_t matrix_median(SGMatrix<float64_t> values,
bool modify=false, bool in_place=false);
/** Calculates unbiased empirical variance estimator of given values. Given
* \f$\{x_1, ..., x_m\}\f$, this is
* \f$\frac{1}{m-1}\sum_{i=1}^m (x-\bar{x})^2\f$ where
* \f$\bar x=\frac{1}{m}\sum_{i=1}^m x_i\f$
*
* @param values vector of values
* @return variance of given values
*/
static float64_t variance(SGVector<float64_t> values);
/** Calculates unbiased empirical standard deviation estimator of given
* values. Given \f$\{x_1, ..., x_m\}\f$, this is
* \f$\sqrt{\frac{1}{m-1}\sum_{i=1}^m (x-\bar{x})^2}\f$ where
* \f$\bar x=\frac{1}{m}\sum_{i=1}^m x_i\f$
*
* @param values vector of values
* @return variance of given values
*/
static float64_t std_deviation(SGVector<float64_t> values);
/** Calculates mean of given values. Given \f$\{x_1, ..., x_m\}\f$, this
* is \f$\frac{1}{m}\sum_{i=1}^m x_i\f$
*
* Computes the mean for each row/col of matrix
*
* @param values vector of values
* @param col_wise if true, every column vector will be used, row vectors
* otherwise
* @return mean of given values
*/
static SGVector<float64_t> matrix_mean(SGMatrix<float64_t> values,
bool col_wise=true);
/** Calculates unbiased empirical variance estimator of given values. Given
* \f$\{x_1, ..., x_m\}\f$, this is
* \f$\frac{1}{m-1}\sum_{i=1}^m (x-\bar{x})^2\f$ where
* \f$\bar x=\frac{1}{m}\sum_{i=1}^m x_i\f$
*
* Computes the variance for each row/col of matrix
*
* @param values vector of values
* @param col_wise if true, every column vector will be used, row vectors
* otherwise
* @return variance of given values
*/
static SGVector<float64_t> matrix_variance(SGMatrix<float64_t> values,
bool col_wise=true);
/** Calculates unbiased empirical standard deviation estimator of given
* values. Given \f$\{x_1, ..., x_m\}\f$, this is
* \f$\sqrt{\frac{1}{m-1}\sum_{i=1}^m (x-\bar{x})^2}\f$ where
* \f$\bar x=\frac{1}{m}\sum_{i=1}^m x_i\f$
*
* Computes the variance for each row/col of matrix
*
* @param values vector of values
* @param col_wise if true, every column vector will be used, row vectors
* otherwise
* @return variance of given values
*/
static SGVector<float64_t> matrix_std_deviation(
SGMatrix<float64_t> values, bool col_wise=true);
/** Computes the empirical estimate of the covariance matrix of the given
* data which is organized as num_cols variables with num_rows observations.
* Normalizes by N-1 for N observations
*
* Data is centered before matrix is computed. May be done in place.
* In this case, the observation matrix is changed (centered).
*
* Given sample matrix \f$X\f$, first, column mean is removed to create
* \f$\bar X\f$. Then \f$\text{cov}(X)=(X-\bar X)^T(X - \bar X)\f$ is
* returned.
*
* @param observations data matrix organized as one variable per column
* @param in_place optional, if set to true, observations matrix will be
* centered, if false, a copy will be created an centered.
* @return covariance matrix empirical estimate
*/
static SGMatrix<float64_t> covariance_matrix(
SGMatrix<float64_t> observations, bool in_place=false);
/** Inverse of Normal cumulative distribution function
*
* Returns the argument, \f$x\f$, for which the area under the
* Gaussian probability density function (integrated from
* minus infinity to \f$x\f$) is equal to \f$y\f$.
*
* @param y0 Output of normal CDF for which parameter is returned.
* @param mean Mean of normal distribution.
* @param std_dev Standard deviation of normal distribution.
* @return Parameter that produces given output.
*/
static float64_t inverse_normal_cdf(float64_t y0);
/** Inverse of Normal cumulative distribution function.
*
* Same as ::inverse_normal_cdf with mean and standard deviation parameters.
*
* @param y0 Output of normal CDF for which parameter is returned.
* @param mean Mean of normal distribution.
* @param std_dev Standard deviation of normal distribution.
* @return Parameter that produces given output.
*/
static float64_t inverse_normal_cdf(float64_t y0, float64_t mean,
float64_t std_dev);
static float64_t rational_approximation(float64_t t);
/** @return natural logarithm of the gamma function of input */
static inline float64_t lgamma(float64_t x)
{
return ::lgamma((double) x);
}
/** @return natural logarithm of the gamma function of input for large
* numbers */
static inline floatmax_t lgammal(floatmax_t x)
{
#ifdef HAVE_LGAMMAL
return ::lgammal((long double) x);
#else
return ::lgamma((double) x);
#endif // HAVE_LGAMMAL
}
/** @return gamma function of input */
static inline float64_t tgamma(float64_t x)
{
return ::tgamma((double) x);
}
/** Incomplete gamma integral
*
* Given \f$p\f$, the function finds \f$x\f$ such that
*
* \f[
* \text{incomplete\_gamma}(a,x)=\frac{1}{\Gamma(a)}}\int_0^x e^{-t} t^{a-1} dt.
* \f]
*
*
* In this implementation both arguments must be positive.
* The integral is evaluated by either a power series or
* continued fraction expansion, depending on the relative
* values of \f$a\f$ and \f$x\f$.
*
* Taken from ALGLIB under gpl2+
*/
static float64_t incomplete_gamma(float64_t a, float64_t x);
/** Evaluates the CDF of the gamma distribution with given parameters \f$a, b\f$
* at \f$x\f$.
*
* @param x position to evaluate
* @param a shape parameter
* @param b scale parameter
* @return gamma CDF at \f$x\f$
*/
static float64_t gamma_cdf(float64_t x, float64_t a, float64_t b);
/** Evaluates the inverse CDF of the gamma distribution with given
* parameters \f$a\f$, \f$b\f$ at \f$x\f$, such that result equals
* \f$\text{gamma\_cdf}(x,a,b)\f$.
*
* @param p Position to evaluate
* @param a Shape parameter
* @param b Scale parameter
* @return \f$x\f$ such that result equals \f$\text{gamma\_cdf}(x,a,b)\f$.
*/
static float64_t inverse_gamma_cdf(float64_t p, float64_t a, float64_t b);
/** Functional inverse of Student's t distribution
*
* Given probability \f$p\f$, finds the argument \f$t\f$ such that
* \f$\text{student\_t}(k,t)=p\f$
*
* @param k Degrees of freedom.
* @param p Point to evaluate.
* @return Value of pdf at given point.
*/
static float64_t inverse_student_t(int32_t k, float64_t p);
/** Normal distribution function
*
* Returns the area under the Gaussian probability density
* function, integrated from minus infinity to \f$x\f$:
*
* \f[
* \text{normal\_cdf}(x)=\frac{1}{\sqrt{2\pi}} \int_{-\infty}^x
* \exp \left( -\frac{t^2}{2} \right) dt = \frac{1+\text{error\_function}(z) }{2}
* \f]
*
* where \f$ z = \frac{x}{\sqrt{2} \sigma}\f$ and \f$ \sigma \f$ is the standard
* deviation. Computation is via the functions \f$\text{error\_function}\f$
* and \f$\text{error\_function\_completement}\f$.
*
*/
static float64_t normal_cdf(float64_t x, float64_t std_dev=1);
/** Returns logarithm of the cumulative distribution function
* (CDF) of Gaussian distribution \f$N(0, 1)\f$:
*
* \f[
* \text{lnormal\_cdf}(x)=log\left(\frac{1}{2}+
* \frac{1}{2}\text{error\_function}(\frac{x}{\sqrt{2}})\right)
* \f]
*
* @param x Evaluate CDF here
* @return \f$log(\text{normal\_cdf}(x))\f$
*/
static float64_t lnormal_cdf(float64_t x);
/** Evaluates the CDF of the chi square distribution with
* parameter k at \f$x\f$.
*
* @param x position to evaluate
* @param k parameter
* @return chi square CDF at \f$x\f$
*/
static float64_t chi2_cdf(float64_t x, float64_t k);
/** Incomplete beta integral
*
* Returns incomplete beta integral of the arguments, evaluated
* from zero to \f$x\f$. The function is defined as
* \f[
* \frac{\Gamma(a+b)}{\Gamma(a)\Gamma(b)}\int_0^x t^{a-1} (1-t)^{b-1} dt.
* \f]
*
* The domain of definition is \f$0 \leq x \leq 1\f$.
* The integral from \f$x\f$ to \f$1\f$ may be obtained by the symmetry
* relation
*
* \f[
* 1-\text{incomplete\_beta}(a,b,x)=\text{incomplete\_beta}(b,a,1-x).
* \f]
*
* @param a Shape parameter
* @param b Shape parameter
* @param x Evaluation position
* @return Value of integral
*
*/
// static float64_t incomplete_beta(float64_t a, float64_t b, float64_t x);
/** Evaluates the CDF of the F-distribution with parameters
* \f$d1,d2\f$ at \f$x\f$. Based on Wikipedia definition.
*
* @param x position to evaluate
* @param d1 parameter 1
* @param d2 parameter 2
* @return F-distribution CDF at \f$x\f$
*/
// static float64_t fdistribution_cdf(float64_t x, float64_t d1, float64_t d2);
/** Use to estimates erfc(x) valid for -100 < x < -8
*
* @param x real value
* @return weighted sum
*/
static float64_t erfc8_weighted_sum(float64_t x);
/** @return mutual information of \f$p\f$ which is given in logspace
* where \f$p,q\f$ are given in logspace */
static float64_t mutual_info(float64_t* p1, float64_t* p2, int32_t len);
/** @return relative entropy \f$H(P||Q)\f$
* where \f$p,q\f$ are given in logspace */
static float64_t relative_entropy(
float64_t* p, float64_t* q, int32_t len);
/** @return entropy of \f$p\f$ which is given in logspace */
static float64_t entropy(float64_t* p, int32_t len);
/** fisher's test for multiple 2x3 tables
* @param tables
*/
static SGVector<float64_t> fishers_exact_test_for_multiple_2x3_tables(SGMatrix<float64_t> tables);
/** fisher's test for 2x3 table
* @param table
*/
static float64_t fishers_exact_test_for_2x3_table(SGMatrix<float64_t> table);
/** sample indices
* @param sample_size size of sample to pick
* @param N total number of indices
*/
static SGVector<int32_t> sample_indices(int32_t sample_size, int32_t N);
/** @return object name */
virtual const char* get_name() const
{
return "Statistics";
}
/** Derivative of the log gamma function.
*
* @param x input
* @return derivative of the log gamma input
*/
static float64_t dlgamma(float64_t x);
/** Representation of a Sigmoid function for the fit_sigmoid function */
struct SigmoidParamters
{
/** parameter a */
float64_t a;
/** parameter b */
float64_t b;
};
/** Converts a given vector of scores to calibrated probabilities by fitting a
* sigmoid function using the method described in
* Lin, H., Lin, C., and Weng, R. (2007).
* A note on Platt's probabilistic outputs for support vector machines.
*
* This can be used to transform scores to probabilities as setting
* \f$pf=x*a+b\f$ for a given score \f$x\f$ and computing
* \f$\frac{\exp(-f)}{1+}exp(-f)}\f$ if \f$f\geq 0\f$ and
* \f$\frac{1}{(1+\exp(f)}\f$ otherwise
*
* @param scores scores to fit the sigmoid to
* @return struct containing the sigmoid's shape parameters a and b
*/
static SigmoidParamters fit_sigmoid(SGVector<float64_t> scores);
/** The log determinant of a dense matrix
*
* If determinant of the input matrix is positive, it returns the logarithm of the value.
* If not, it returns CMath::INFTY
* Note that the input matrix is not required to be symmetric positive definite.
* This method is slower than log_det() if input matrix is known to be symmetric positive definite
*
* It is adapted from Gaussian Process Machine Learning Toolbox
* http://www.gaussianprocess.org/gpml/code/matlab/doc/
*
* @param A input matrix
* @return the log determinant value
*/
static float64_t log_det_general(const SGMatrix<float64_t> A);
/** The log determinant of a dense matrix
*
* The log determinant of a positive definite symmetric real valued
* matrix is calculated as
* \f[
* \text{log\_determinant}(M)
* = \text{log}(\text{determinant}(L)\times\text{determinant}(L'))
* = 2\times \sum_{i}\text{log}(L_{i,i})
* \f]
* Where, \f$M = L\times L'\f$ as per Cholesky decomposition.
*
* @param m input matrix
* @return the log determinant value
*/
static float64_t log_det(SGMatrix<float64_t> m);
/** The log determinant of a sparse matrix
*
* The log determinant of symmetric positive definite sparse matrix
* is calculated in a similar way as the dense case. But using
* cholesky decomposition on sparse matrices may suffer from fill-in
* phenomenon, i.e. the factors may not be as sparse. The
* SimplicialCholesky module for sparse matrix in eigen3 library
* uses an approach called approximate minimum degree reordering,
* or amd, which permutes the matrix beforehand and results in much
* sparser factors. If \f$P\f$ is the permutation matrix, it computes
* \f$\text{LLT}(P\times M\times P^{-1}) = L\times L'\f$.
*
* @param m input sparse matrix
* @return the log determinant value
*/
static float64_t log_det(const SGSparseMatrix<float64_t> m);
/** Sampling from a multivariate Gaussian distribution with
* dense covariance matrix
*
* Sampling is performed by taking samples from \f$N(0, I)\f$, then
* using cholesky factor of the covariance matrix, \f$\Sigma\f$ and
* performing
* \f[S_{N(\mu,\Sigma)}=S_{N(0,I)}*L^{T}+\mu\f]
* where \f$\Sigma=L*L^{T}\f$ and \f$\mu\f$ is the mean vector.
*
* @param mean the mean vector
* @param cov the covariance matrix
* @param N number of samples
* @param precision_matrix if true, sample from N(mu,C^-1)
* @return the sample matrix of size \f$N\times dim\f$
*/
static SGMatrix<float64_t> sample_from_gaussian(SGVector<float64_t> mean,
SGMatrix<float64_t> cov, int32_t N=1, bool precision_matrix=false);
/** Sampling from a multivariate Gaussian distribution with
* sparse covariance matrix
*
* Sampling is performed in similar way as of dense covariance
* matrix, but direct cholesky factorization of sparse matrices
* could be inefficient. So, this method uses permutation matrix
* for factorization and then permutes back the final samples
* before adding the mean.
*
* @param mean the mean vector
* @param cov the covariance matrix
* @param N number of samples
* @param precision_matrix if true, sample from N(mu,C^-1)
* @return the sample matrix of size \f$N\times dim\f$
*/
static SGMatrix<float64_t> sample_from_gaussian(SGVector<float64_t> mean,
SGSparseMatrix<float64_t> cov, int32_t N=1, bool precision_matrix=false);
/** Magic number for computing lnormal_cdf */
static const float64_t ERFC_CASE1;
/** Magic number for computing lnormal_cdf */
static const float64_t ERFC_CASE2;
};
/// mean not implemented for complex128_t, returns 0.0 instead
template <>
inline floatmax_t CStatistics::mean<complex128_t>(SGVector<complex128_t> vec)
{
SG_SNOTIMPLEMENTED
return floatmax_t(0.0);
}
}
#endif /* __STATISTICS_H_ */