# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------
from warnings import warn

import numpy as np
import pandas as pd
from numpy import dot, hstack
from numpy.linalg import qr, svd
from numpy.random import standard_normal
from scipy.linalg import eigh

from skbio.stats.distance import DistanceMatrix

from ._ordination_results import OrdinationResults
from ._utils import center_distance_matrix, scale


def pcoa(distance_matrix, method="eigh", number_of_dimensions=0, inplace=False):
r"""Perform Principal Coordinate Analysis.
Principal Coordinate Analysis (PCoA) is a method similar
to Principal Components Analysis (PCA) with the difference that PCoA
operates on distance matrices, typically with non-euclidian and thus
ecologically meaningful distances like UniFrac in microbiome research.
In ecology, the euclidean distance preserved by Principal
Component Analysis (PCA) is often not a good choice because it
deals poorly with double zeros (Species have unimodal
distributions along environmental gradients, so if a species is
absent from two sites at the same site, it can't be known if an
environmental variable is too high in one of them and too low in
the other, or too low in both, etc. On the other hand, if an
species is present in two sites, that means that the sites are
similar.).
Note that the returned eigenvectors are not normalized to unit length.
Parameters
----------
distance_matrix : DistanceMatrix
A distance matrix.
method : str, optional
Eigendecomposition method to use in performing PCoA.
By default, uses SciPy's `eigh`, which computes exact
eigenvectors and eigenvalues for all dimensions. The alternate
method, `fsvd`, uses faster heuristic eigendecomposition but loses
accuracy. The magnitude of accuracy lost is dependent on dataset.
number_of_dimensions : int, optional
Dimensions to reduce the distance matrix to. This number determines
how many eigenvectors and eigenvalues will be returned.
By default, equal to the number of dimensions of the distance matrix,
as default eigendecomposition using SciPy's `eigh` method computes
all eigenvectors and eigenvalues. If using fast heuristic
eigendecomposition through `fsvd`, a desired number of dimensions
should be specified. Note that the default eigendecomposition
method `eigh` does not natively support a specifying number of
dimensions to reduce a matrix to, so if this parameter is specified,
all eigenvectors and eigenvalues will be simply be computed with no
speed gain, and only the number specified by `number_of_dimensions`
will be returned. Specifying a value of `0`, the default, will
set `number_of_dimensions` equal to the number of dimensions of the
specified `distance_matrix`.
inplace : bool, optional
If true, centers a distance matrix in-place in a manner that reduces
memory consumption.
Returns
-------
OrdinationResults
Object that stores the PCoA results, including eigenvalues, the
proportion explained by each of them, and transformed sample
coordinates.
See Also
--------
OrdinationResults
Notes
-----
.. note:: If the distance is not euclidean (for example if it is a
semimetric and the triangle inequality doesn't hold),
negative eigenvalues can appear. There are different ways
to deal with that problem (see Legendre & Legendre 1998, \S
9.2.3), but none are currently implemented here.
However, a warning is raised whenever negative eigenvalues
appear, allowing the user to decide if they can be safely
ignored.
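
    Examples
    --------
    A minimal usage sketch (illustrative; the distance values below are
    made up for demonstration):

    >>> from skbio.stats.distance import DistanceMatrix
    >>> from skbio.stats.ordination import pcoa
    >>> dm = DistanceMatrix([[0.0, 1.0, 2.0],
    ...                      [1.0, 0.0, 1.5],
    ...                      [2.0, 1.5, 0.0]],
    ...                     ids=['A', 'B', 'C'])
    >>> results = pcoa(dm)
    >>> results.samples.shape
    (3, 3)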
"""
distance_matrix = DistanceMatrix(distance_matrix)
# Center distance matrix, a requirement for PCoA here
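    # (This is the Gower centering: the centered matrix is
    # -0.5 * J @ (D ** 2) @ J with J = I - ones((n, n)) / n; its
    # eigenvectors, scaled by the square roots of the corresponding
    # eigenvalues, give the principal coordinates.)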
matrix_data = center_distance_matrix(distance_matrix.data, inplace=inplace)
    # If no number of dimensions is specified, compute all eigenvectors
    # and eigenvalues by default.
if number_of_dimensions == 0:
if method == "fsvd" and matrix_data.shape[0] > 10:
warn(
"FSVD: since no value for number_of_dimensions is specified, "
"PCoA for all dimensions will be computed, which may "
"result in long computation time if the original "
"distance matrix is large.",
RuntimeWarning,
)
# distance_matrix is guaranteed to be square
number_of_dimensions = matrix_data.shape[0]
elif number_of_dimensions < 0:
raise ValueError(
"Invalid operation: cannot reduce distance matrix "
"to negative dimensions using PCoA. Did you intend "
'to specify the default value "0", which sets '
"the number_of_dimensions equal to the "
"dimensionality of the given distance matrix?"
)
# Perform eigendecomposition
if method == "eigh":
# eigh does not natively support specifying number_of_dimensions, i.e.
# there are no speed gains unlike in FSVD. Later, we slice off unwanted
# dimensions to conform the result of eigh to the specified
# number_of_dimensions.
eigvals, eigvecs = eigh(matrix_data)
long_method_name = "Principal Coordinate Analysis"
elif method == "fsvd":
eigvals, eigvecs = _fsvd(matrix_data, number_of_dimensions)
long_method_name = "Approximate Principal Coordinate Analysis " "using FSVD"
else:
raise ValueError(
"PCoA eigendecomposition method {} not supported.".format(method)
)
    # cogent makes eigenvalues positive by taking the abs value, but
    # that doesn't seem to be an approach accepted by L&L to deal with
    # negative eigenvalues. We raise a warning in that case instead.
    # First, we coerce values close to 0 (of either sign) to exactly 0.
    close_to_zero = np.isclose(eigvals, 0)
    eigvals[close_to_zero] = 0
if np.any(eigvals < 0):
warn(
"The result contains negative eigenvalues."
" Please compare their magnitude with the magnitude of some"
" of the largest positive eigenvalues. If the negative ones"
" are smaller, it's probably safe to ignore them, but if they"
" are large in magnitude, the results won't be useful. See the"
" Notes section for more details. The smallest eigenvalue is"
" {0} and the largest is {1}.".format(eigvals.min(), eigvals.max()),
RuntimeWarning,
)
# eigvals might not be ordered, so we first sort them, then analogously
# sort the eigenvectors by the ordering of the eigenvalues too
idxs_descending = eigvals.argsort()[::-1]
eigvals = eigvals[idxs_descending]
eigvecs = eigvecs[:, idxs_descending]
# If we return only the coordinates that make sense (i.e., that have a
# corresponding positive eigenvalue), then Jackknifed Beta Diversity
# won't work as it expects all the OrdinationResults to have the same
# number of coordinates. In order to solve this issue, we return the
# coordinates that have a negative eigenvalue as 0
num_positive = (eigvals >= 0).sum()
eigvecs[:, num_positive:] = np.zeros(eigvecs[:, num_positive:].shape)
eigvals[num_positive:] = np.zeros(eigvals[num_positive:].shape)
if method == "fsvd":
# Since the dimension parameter, hereafter referred to as 'd',
# restricts the number of eigenvalues and eigenvectors that FSVD
# computes, we need to use an alternative method to compute the sum
# of all eigenvalues, used to compute the array of proportions
# explained. Otherwise, the proportions calculated will only be
# relative to d number of dimensions computed; whereas we want
# it to be relative to the entire dimensionality of the
# centered distance matrix.
    # An alternative method of calculating the sum of eigenvalues is by
# computing the trace of the centered distance matrix.
# See proof outlined here: https://goo.gl/VAYiXx
sum_eigenvalues = np.trace(matrix_data)
else:
# Calculate proportions the usual way
sum_eigenvalues = np.sum(eigvals)
proportion_explained = eigvals / sum_eigenvalues
    # In case eigh is used, eigh computes all eigenvectors and -values.
    # So if number_of_dimensions was specified, we need to manually
    # ensure that only the requested number of dimensions (number of
    # eigenvectors and eigenvalues, respectively) is returned.
eigvecs = eigvecs[:, :number_of_dimensions]
eigvals = eigvals[:number_of_dimensions]
proportion_explained = proportion_explained[:number_of_dimensions]
    # Scale eigenvectors to have length = sqrt(eigenvalue). This
    # works because np.linalg.eigh returns normalized
    # eigenvectors. Each row contains the coordinates of the
    # objects in the space of principal coordinates. Note that at
    # least one eigenvalue is zero because only n-1 axes are
    # needed to represent n points in a Euclidean space.
coordinates = eigvecs * np.sqrt(eigvals)
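    # Equivalently, coordinates = eigvecs @ np.diag(np.sqrt(eigvals));
    # the broadcasting above scales column j by sqrt(eigvals[j]).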
axis_labels = ["PC%d" % i for i in range(1, number_of_dimensions + 1)]
return OrdinationResults(
short_method_name="PCoA",
long_method_name=long_method_name,
eigvals=pd.Series(eigvals, index=axis_labels),
samples=pd.DataFrame(
coordinates, index=distance_matrix.ids, columns=axis_labels
),
proportion_explained=pd.Series(proportion_explained, index=axis_labels),
)


def _fsvd(centered_distance_matrix, number_of_dimensions=10):
    """Perform singular value decomposition.

    More specifically, eigendecomposition in this case, using the fast
    heuristic algorithm nicknamed "FSVD" (FastSVD), adapted and optimized
    from the algorithm described by Halko et al. (2011).

    Parameters
    ----------
    centered_distance_matrix : np.array
        Numpy matrix representing the distance matrix for which the
        eigenvectors and eigenvalues shall be computed.
    number_of_dimensions : int
        Number of dimensions to keep. Must be lower than or equal to the
        rank of the given distance_matrix.

    Returns
    -------
    np.array
        Array of eigenvalues, number_of_dimensions in total.
    np.array
        Matrix of eigenvectors, one column per requested dimension.

    Notes
    -----
    The algorithm is based on 'An Algorithm for the Principal
    Component Analysis of Large Data Sets' by N. Halko, P.G. Martinsson,
    Y. Shkolnisky, and M. Tygert.
    Original paper: https://arxiv.org/abs/1007.5510

    Ported from a MATLAB implementation described here:
    https://stats.stackexchange.com/a/11934/211065
    """
m, n = centered_distance_matrix.shape
    # Number of levels of the Krylov method to use.
    # For most applications, num_levels=1 or num_levels=2 is sufficient.
    num_levels = 1

    # Changes the power of the spectral norm, thus minimizing the error.
    use_power_method = False
# Note: a (conjugate) transpose is removed for performance, since we
# only expect square matrices.
if m != n:
raise ValueError("FSVD expects square distance matrix")
if number_of_dimensions > m or number_of_dimensions > n:
raise ValueError(
"FSVD: number_of_dimensions cannot be larger than"
" the dimensionality of the given distance matrix."
)
if number_of_dimensions < 0:
raise ValueError(
"Invalid operation: cannot reduce distance matrix "
"to negative dimensions using PCoA. Did you intend "
'to specify the default value "0", which sets '
"the number_of_dimensions equal to the "
"dimensionality of the given distance matrix?"
)
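
    # The "+ 2" below oversamples the target rank, a standard device in
    # randomized SVD (cf. Halko et al.) that improves the accuracy of the
    # approximated subspace at little extra cost.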
k = number_of_dimensions + 2
    # Form a real n x k matrix G whose entries are independent,
    # identically distributed Gaussian random variables of zero
    # mean and unit variance (the paper calls this dimension l).
    G = standard_normal(size=(n, k))
if use_power_method:
# use only the given exponent
H = dot(centered_distance_matrix, G)
for x in range(2, num_levels + 2):
# enhance decay of singular values
# note: distance_matrix is no longer transposed, saves work
# since we're expecting symmetric, square matrices anyway
# (Daniel McDonald's changes)
H = dot(centered_distance_matrix, dot(centered_distance_matrix, H))
else:
# compute the m x l matrices H^{(0)}, ..., H^{(i)}
# Note that this is done implicitly in each iteration below.
H = dot(centered_distance_matrix, G)
# to enhance performance
H = hstack((H, dot(centered_distance_matrix, dot(centered_distance_matrix, H))))
for x in range(3, num_levels + 2):
tmp = dot(centered_distance_matrix, dot(centered_distance_matrix, H))
H = hstack(
(H, dot(centered_distance_matrix, dot(centered_distance_matrix, tmp)))
)
    # Using the QR decomposition, form a real m * ((i+1)l) matrix Q
    # whose columns are orthonormal, s.t. there exists a real
    # ((i+1)l) * ((i+1)l) matrix R for which H = QR.
Q, R = qr(H)
# Compute the n * ((i+1)l) product matrix T = A^T Q
T = dot(centered_distance_matrix, Q) # step 3
# Form an SVD of T
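    # (numpy's svd returns (U, S, Vh); despite the names below, Vt holds
    # the left singular vectors of T, and W, once transposed, holds the
    # right singular vectors.)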
Vt, St, W = svd(T, full_matrices=False)
W = W.transpose()
# Compute the m * ((i+1)l) product matrix
Ut = dot(Q, W)
U_fsvd = Ut[:, :number_of_dimensions]
S = St[:number_of_dimensions]
# drop imaginary component, if we got one
# Note:
# In cogent, after computing eigenvalues/vectors, the imaginary part
# is dropped, if any. We know for a fact that the eigenvalues are
# real, so that's not necessary, but eigenvectors can in principle
# be complex (see for example
# http://math.stackexchange.com/a/47807/109129 for details)
eigenvalues = S.real
eigenvectors = U_fsvd.real
return eigenvalues, eigenvectors


def pcoa_biplot(ordination, y):
    """Compute the projection of descriptors into a PCoA matrix.

    This implementation is as described in Chapter 9 of Legendre & Legendre,
    Numerical Ecology, 3rd edition.

    Parameters
    ----------
    ordination : OrdinationResults
        The computed principal coordinates analysis of dimensions (n, c)
        where the matrix ``y`` will be projected onto.
    y : DataFrame
        Samples by features table of dimensions (n, m). These can be
        environmental features or abundance counts. This table should be
        normalized in cases of dimensionally heterogeneous physical
        variables.

    Returns
    -------
    OrdinationResults
        The modified input object that includes projected features onto the
        ordination space in the ``features`` attribute.
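
    Examples
    --------
    A minimal usage sketch (illustrative; the descriptor values are made
    up for demonstration):

    >>> import pandas as pd
    >>> from skbio.stats.distance import DistanceMatrix
    >>> from skbio.stats.ordination import pcoa, pcoa_biplot
    >>> dm = DistanceMatrix([[0.0, 1.0, 2.0],
    ...                      [1.0, 0.0, 1.5],
    ...                      [2.0, 1.5, 0.0]],
    ...                     ids=['A', 'B', 'C'])
    >>> y = pd.DataFrame({'pH': [6.8, 7.1, 7.4], 'depth': [1.0, 2.0, 4.0]},
    ...                  index=['A', 'B', 'C'])
    >>> results = pcoa_biplot(pcoa(dm, number_of_dimensions=2), y)
    >>> results.features.shape
    (2, 2)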
"""
    # acknowledge that most saved ordinations lack a name; however, if
    # they do have one, it should be PCoA
    if ordination.short_method_name != "" and ordination.short_method_name != "PCoA":
        raise ValueError(
            "This biplot computation can only be performed on a PCoA ordination."
        )

    if set(y.index) != set(ordination.samples.index):
        raise ValueError(
            "The eigenvectors and the descriptors must describe the same samples."
        )
eigvals = ordination.eigvals.values
coordinates = ordination.samples
N = coordinates.shape[0]
# align the descriptors and eigenvectors in a sample-wise fashion
y = y.reindex(coordinates.index)
# S_pc from equation 9.44
# Represents the covariance matrix between the features matrix and the
# column-centered eigenvectors of the pcoa.
spc = (1 / (N - 1)) * y.values.T.dot(scale(coordinates, ddof=1))
    # U_proj, from equation 9.55, is the matrix of descriptors to be
    # projected.
    #
    # Only take the power of non-zero values; otherwise this will raise a
    # divide-by-zero warning. There shouldn't be negative eigenvalues(?)
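    # In matrix form, this computes U_proj = sqrt(N - 1) * S_pc * Lambda^(-1/2),
    # where Lambda is the diagonal matrix of eigenvalues: each column of the
    # covariance matrix is rescaled by the inverse square root of its
    # eigenvalue.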
Uproj = np.sqrt(N - 1) * spc.dot(
np.diag(np.power(eigvals, -0.5, where=eigvals > 0))
)
ordination.features = pd.DataFrame(
data=Uproj, index=y.columns.copy(), columns=coordinates.columns.copy()
)
ordination.features.fillna(0.0, inplace=True)
return ordination