-
-
Notifications
You must be signed in to change notification settings - Fork 1k
/
PCA.h
228 lines (200 loc) · 8 KB
/
PCA.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
/*
* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Authors: Soeren Sonnenburg, Sergey Lisitsyn, Heiko Strathmann,
* Evgeniy Andreev, Marc Zimmermann, Viktor Gal, Evan Shelhamer,
* Bjoern Esser, Sanuj Sharma
*/
#ifndef PCA_H_
#define PCA_H_
#include <shogun/lib/config.h>
#include <shogun/preprocessor/DimensionReductionPreprocessor.h>
#include <shogun/features/Features.h>
#include <shogun/lib/common.h>
namespace shogun
{
/** Matrix decomposition method for PCA */
enum EPCAMethod
{
/** if N>D then EVD is chosen automatically else SVD is chosen
* (D-dimensions N-number of vectors)
*/
AUTO = 10,
/** SVD based PCA. Time complexity ~14dn^2 (d-dimensions n-number of vectors) */
SVD = 20,
/** Eigenvalue decomposition of covariance matrix.
* Time complexity ~10d^3 (d-dimensions n-number of vectors)
*/
EVD = 30
};
/** mode of pca */
enum EPCAMode
{
/** cut by threshold */
THRESHOLD,
/** variance explained */
VARIANCE_EXPLAINED,
/** keep fixed number of features */
FIXED_NUMBER
};
/** memory usage by PCA : In-place or through reallocation */
enum EPCAMemoryMode
{
/** The feature matrix replaced by new matrix with target dims.
* This requires memory for old matrix as well as new matrix
* at once for a short amount of time initially.
*/
MEM_REALLOCATE,
/** The feature matrix is modified in-place to generate the new matrix.
* Output matrix dimensions are changed to target dims, but actual matrix
* size remains same internally. Modifies initial data matrix
*/
MEM_IN_PLACE
};
/** @brief Preprocessor PCA performs principial component analysis on input
* feature vectors/matrices. When the init method in PCA is called with proper
* feature matrix X (with say N number of vectors and D feature dimension), a
* transformation matrix is computed and stored internally. This transformation
* matrix is then used to transform all D-dimensional feature vectors or feature
* matrices (with D feature dimensions) supplied via apply_to_feature_matrix or
* apply_to_feature_vector methods. This tranformation outputs the T-Dimensional
* approximation of all these input vectors and matrices (where T<=min(D,N)). The
* transformation matrix is essentially a DxT matrix, the columns of which correspond
* to the eigenvectors of the covariance matrix(XX') having top T eigenvalues.
*
* This class provides 3 method options to compute the transformation matrix :
* <em>EVD</em> : Eigen Value Decomposition of Covariance Matrix (\f$XX^T\f$)
* The covariance matrix \f$XX^T\f$ is first formed internally and then
* its eigenvectors and eigenvalues are computed.
* The time complexity of this method is \f$~10D^3\f$ and should be used when N > D.
*
* <em>SVD</em> : Singular Value Decomposition of feature matrix X
* The transpose of feature matrix, \f$X^T\f$, is decomposed using SVD.\f$X^T = UDV^T\f$
* The matrix V in this decomposition contains the required eigenvectors and
* the diagonal entries of the diagonal matrix D correspond to the non-negative
* eigenvalues. Eigenvalue, \f$e_i\f$, is derived from a diagonal element, \f$d_i\f$,
* using the formula \f$e_i = \frac{\sqrt{d_i}}{N-1}\f$.
* The time complexity of this method is \f$~14DN^2\f$ and should be used when N < D.
*
* <em>AUTO</em> : This mode automagically chooses one of the above modes for the user
* based on whether N > D (chooses EVD) or N < D (chooses SVD).
*
* This class provides 3 modes to determine the value of T :
*
* <em>FIXED_NUMBER</em> : T is supplied by user directly using set_target_dims method
*
* <em>VARIANCE_EXPLAINED</em> : The user supplies the fractional variance that he
* wants preserved in the target dimension T. From this supplied fractional variance (thresh),
* T is calculated as the smallest k such that the ratio of sum of largest k eigenvalues
* over total sum of all eigenvalues is greater than thresh.
*
* <em>THRESH</em> : The user supplies a threshold. All eigenvectors with corresponding eigenvalue
* greater than the supplied threshold are chosen.
*
* An option for whitening the transformation matrix is also given - do_whitening. Setting this
* option normalizes the eigenvectors (ie. the columns of transformation matrix) by dividing them
* with the square root of corresponding eigenvalues.
*
* Note that vectors/matrices don't have to have zero mean as it is substracted within the class.
*/
class CPCA: public CDimensionReductionPreprocessor
{
public:
/** standard constructor
*
* @param do_whitening normalize columns(eigenvectors) in transformation matrix
* @param mode mode of pca : FIXED_NUMBER/VARIANCE_EXPLAINED/THRESHOLD
* @param thresh threshold value for VARIANCE_EXPLAINED or THRESHOLD mode
* @param method Matrix decomposition method used : SVD/EVD/AUTO[default]
* @param mem_mode memory usage mode of PCA : MEM_REALLOCATE/MEM_IN_PLACE
*/
CPCA(bool do_whitening=false, EPCAMode mode=FIXED_NUMBER, float64_t thresh=1e-6,
EPCAMethod method=AUTO, EPCAMemoryMode mem_mode=MEM_REALLOCATE);
/** special constructor for FIXED_NUMBER mode
*
* @param method Matrix decomposition method used : SVD/EVD/AUTO[default]
* @param do_whitening normalize columns(eigenvectors) in transformation matrix
* @param mem memory usage mode of PCA : MEM_REALLOCATE/MEM_IN_PLACE
*/
CPCA(EPCAMethod method, bool do_whitening=false, EPCAMemoryMode mem=MEM_REALLOCATE);
/** destructor */
virtual ~CPCA();
virtual void fit(CFeatures* features);
/** cleanup */
virtual void cleanup();
/** apply preprocessor to feature matrix
* @param features features
* @return processed feature matrix
*/
virtual SGMatrix<float64_t> apply_to_feature_matrix(CFeatures* features);
/** apply preprocessor to feature vector
* @param vector feature vector
* @return processed feature vector
*/
virtual SGVector<float64_t> apply_to_feature_vector(SGVector<float64_t> vector);
/** get transformation matrix, i.e. eigenvectors (potentially scaled if
* do_whitening is true)
*/
SGMatrix<float64_t> get_transformation_matrix();
/** get eigenvalues of PCA
*/
SGVector<float64_t> get_eigenvalues();
/** get mean vector of original data
*/
SGVector<float64_t> get_mean();
/** @return object name */
virtual const char* get_name() const { return "PCA"; }
/** @return a type of preprocessor */
virtual EPreprocessorType get_type() const { return P_PCA; }
/** return the PCA memory mode being used */
EPCAMemoryMode get_memory_mode() const;
/** set PCA memory mode to be used
* @param e hoice between MEM_REALLOCATE and MEM_IN_PLACE
*/
void set_memory_mode(EPCAMemoryMode e);
/** set zero tolerance of eigenvalues during data whitening
* @param eigenvalue_zero_tolerance zero tolerance value
*/
void set_eigenvalue_zero_tolerance(float64_t eigenvalue_zero_tolerance=1e-15);
/** get zero tolerance of eigenvalues during data whitening
* @return zero tolerance value
*/
float64_t get_eigenvalue_zero_tolerance() const;
protected:
void init();
protected:
/** transformation matrix */
SGMatrix<float64_t> m_transformation_matrix;
/** num dim */
int32_t num_dim;
/** num old dim */
int32_t num_old_dim;
/** mean vector */
SGVector<float64_t> m_mean_vector;
/** eigenvalues vector */
SGVector<float64_t> m_eigenvalues_vector;
/** initialized */
bool m_initialized;
/** whitening */
bool m_whitening;
/** PCA mode */
EPCAMode m_mode;
/** thresh */
float64_t m_thresh;
/** PCA memory mode */
EPCAMemoryMode m_mem_mode;
/** PCA method */
EPCAMethod m_method;
/** eigenvalues within zero tolerance
* region are considered 0 while
* whitening to tackle numerical issues
*/
float64_t m_eigenvalue_zero_tolerance;
private:
/** Computes the transformation matrix using an eigenvalue decomposition. */
void init_with_evd(const SGMatrix<float64_t>& feature_matrix, int32_t max_dim_allowed);
/** Computes the transformation matrix using svd */
void init_with_svd(const SGMatrix<float64_t>& feature_matrix, int32_t max_dim_allowed);
};
}
#endif // PCA_H_