-
Notifications
You must be signed in to change notification settings - Fork 89
/
ksamp.py
139 lines (118 loc) · 5.23 KB
/
ksamp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import numpy as np
from numba import njit
from .._utils import check_inputs_distmat, euclidean
from .base import KSampleTest
from ..independence import CCA, Dcorr, HHG, RV, Hsic
from ._utils import _CheckInputs, k_sample_transform
class KSample(KSampleTest):
r"""
Class for calculating the *k*-sample test statistic and p-value.
A *k*-sample test tests equality in distribution among groups. Groups
can be of different sizes, but generally have the same dimensionality.
There are not many non-parametric *k*-sample tests, but this version
cleverly leverages the power of some of the implemented independence
tests to test this equality of distribution.
Parameters
----------
indep_test : {"CCA", "Dcorr", "HHG", "RV", "Hsic"}
A string corresponding to the desired independence test from
``mgc.independence``.
compute_distance : callable(), optional (default: euclidean)
A function that computes the distance among the samples within each
data matrix. Set to `None` if `x` and `y` are already distance
matrices. To call a custom function, either create the distance matrix
before-hand or create a function of the form ``compute_distance(x)``
where `x` is the data matrix for which pairwise distances are
calculated.
Notes
-----
The ideas behind this can be found in an upcoming paper:
The *k*-sample testing problem can be thought of as a generalization of
the two sample testing problem. Define
:math:`\{ u_i \stackrel{iid}{\sim} F_U,\ i = 1, ..., n \}` and
:math:`\{ v_j \stackrel{iid}{\sim} F_V,\ j = 1, ..., m \}` as two groups
of samples deriving from different distributions with the same
dimensionality. Then, problem that we are testing is thus,
.. math::
H_0: F_U &= F_V \\
H_A: F_U &\neq F_V
The closely related independence testing problem can be generalized
similarly: Given a set of paired data
:math:`\{\left(x_i, y_i \right) \stackrel{iid}{\sim} F_{XY},
\ i = 1, ..., N\}`, the problem that we are testing is,
.. math::
H_0: F_{XY} &= F_X F_Y \\
H_A: F_{XY} &\neq F_X F_Y
By manipulating the inputs of the *k*-sample test, we can create
concatenated versions of the inputs and another label matrix which are
necessarily paired. Then, any nonparametric test can be performed on
this data.
"""
def __init__(self, indep_test, compute_distance=euclidean):
test_names = {
RV.__name__: RV,
CCA.__name__: CCA,
HHG.__name__: HHG,
Hsic.__name__: Hsic,
Dcorr.__name__: Dcorr,
}
if indep_test not in test_names.keys():
raise ValueError("Test is not a valid independence test")
indep_test = test_names[indep_test]
KSampleTest.__init__(self, indep_test, compute_distance=compute_distance)
def test(self, *args, reps=1000, workers=1, random_state=None):
r"""
Calculates the *k*-sample test statistic and p-value.
Parameters
----------
*args : ndarrays
Variable length input data matrices. All inputs must have the same
number of samples. That is, the shapes must be `(n, p)` and
`(m, p)` where `n` and `m` are the number of samples and `p` are
the number of dimensions. Alternatively, inputs can be distance
matrices, where the shapes must all be `(n, n)`.
reps : int, optional (default: 1000)
The number of replications used to estimate the null distribution
when using the permutation test used to calculate the p-value.
workers : int, optional (default: 1)
The number of cores to parallelize the p-value computation over.
Supply -1 to use all cores available to the Process.
random_state : int or np.random.RandomState instance, optional
If already a RandomState instance, use it.
If seed is an int, return a new RandomState instance seeded with seed.
If None, use np.random.RandomState. Default is None.
Returns
-------
stat : float
The computed *k*-Sample statistic.
pvalue : float
The computed *k*-Sample p-value.
Examples
--------
>>> import numpy as np
>>> from mgc.ksample import KSample
>>> x = np.arange(7)
>>> y = x
>>> z = np.arange(10)
>>> stat, pvalue = KSample("Dcorr").test(x, y)
>>> '%.3f, %.1f' % (stat, pvalue)
'-0.136, 1.0'
The number of replications can give p-values with higher confidence
(greater alpha levels).
>>> import numpy as np
>>> from mgc.ksample import KSample
>>> x = np.arange(7)
>>> y = x
>>> z = np.ones(7)
>>> stat, pvalue = KSample("Dcorr").test(x, y, z, reps=10000)
>>> '%.3f, %.1f' % (stat, pvalue)
'0.224, 0.0'
"""
inputs = list(args)
check_input = _CheckInputs(
inputs=inputs,
indep_test=self.indep_test,
compute_distance=self.compute_distance,
)
inputs = check_input()
return super(KSample, self).test(inputs, reps, workers, random_state)