Permalink
Browse files

FIX and more tests for PCA and inverse_transform also for RandomizedPCA

  • Loading branch information...
1 parent 47bb5c5 commit b1714bae967c61fe235d7941459cac990e24c023 @ogrisel ogrisel committed Jan 23, 2011
Showing with 90 additions and 27 deletions.
  1. +17 −16 scikits/learn/pca.py
  2. +73 −11 scikits/learn/tests/test_pca.py
View
33 scikits/learn/pca.py
@@ -132,9 +132,6 @@ class PCA(BaseEstimator):
components_: array, [n_features, n_components]
Components with maximum variance.
- components_coefs: array, [n_components]
- Eigenvalues associated with principal components_.
-
explained_variance_ratio_: array, [n_components]
Percentage of variance explained by each of the selected components.
k is not set then all components are stored and the sum of
@@ -185,18 +182,15 @@ def fit(self, X, **params):
if self.whiten:
n = X.shape[0]
self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
- self.components_coefs_ = S / np.sqrt(n)
else:
self.components_ = V.T
- self.components_coefs_ = np.ones_like(S)
if self.n_components == 'mle':
self.n_components = _infer_dimension_(self.explained_variance_,
n_samples, X.shape[1])
if self.n_components is not None:
self.components_ = self.components_[:, :self.n_components]
- self.components_coefs_ = self.components_coefs_[:self.n_components]
self.explained_variance_ = \
self.explained_variance_[:self.n_components]
self.explained_variance_ratio_ = \
@@ -206,15 +200,17 @@ def fit(self, X, **params):
def transform(self, X):
"""Apply the dimension reduction learned on the train data."""
- Xr = X - self.mean_
- Xr = np.dot(Xr, self.components_)
- return Xr
+ X_transformed = X - self.mean_
+ X_transformed = np.dot(X_transformed, self.components_)
+ return X_transformed
+
+ def inverse_transform(self, X):
+ """Return an input X_original whose transform would be X
- def inverse_transform(self, Y):
- """Return an input X whose transform would be Y"""
- r = np.dot(Y * self.components_coefs_, self.components_.T)
- r += self.mean_
- return r
+ Note: if whitening is enabled, inverse_transform does not compute the
+ exact inverse operation as transform.
+ """
+ return np.dot(X, self.components_.T) + self.mean_
class ProbabilisticPCA(PCA):
@@ -378,10 +374,8 @@ def fit(self, X, **params):
if self.whiten:
n = X.shape[0]
self.components_ = np.dot(V.T, np.diag(1.0 / S)) * np.sqrt(n)
- self.components_coefs_ = S / np.sqrt(n)
else:
self.components_ = V.T
- self.components_coefs_ = np.ones_like(S)
return self
@@ -393,3 +387,10 @@ def transform(self, X):
X = safe_sparse_dot(X, self.components_)
return X
+ def inverse_transform(self, X):
+ """Return a reconstructed input whose transform would be X"""
+ X_original = safe_sparse_dot(X, self.components_.T)
+ if self.mean_ is not None:
+ X_original = X_original + self.mean_
+ return X_original
+
View
84 scikits/learn/tests/test_pca.py
@@ -69,29 +69,40 @@ def test_whitening():
def test_pca_check_projection():
"""Test that the projection of data is correct"""
+ np.random.seed(0)
n, p = 100, 3
X = randn(n, p) * .1
X[:10] += np.array([3, 4, 5])
Xt = 0.1 * randn(1, p) + np.array([3, 4, 5])
Yt = PCA(n_components=2).fit(X).transform(Xt)
- Yt /= np.sqrt((Yt**2).sum())
+ Yt /= np.sqrt((Yt ** 2).sum())
np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
+
def test_pca_inverse():
"""Test that the projection of data can be inverted"""
-
+ np.random.seed(0)
n, p = 50, 3
- X = randn(n,p) # spherical data
- X[:,1] *= .00001 # make middle component relatively small
- X += [5,4,3] # make a large mean
+ X = randn(n, p) # spherical data
+ X[:, 1] *= .00001 # make middle component relatively small
+ X += [5, 4, 3] # make a large mean
- pca = PCA(n_components=2)
+ # same check that we can find the original data from the transformed
+ # signal (since the data is almost of rank n_components)
+ pca = PCA(n_components=2).fit(X)
+ Y = pca.transform(X)
+ Y_inverse = pca.inverse_transform(Y)
+ assert_almost_equal(X, Y_inverse, decimal=3)
+
+ # same as above with whitening (approximate reconstruction)
+ pca = PCA(n_components=2, whiten=True)
pca.fit(X)
Y = pca.transform(X)
- Xlike = pca.inverse_transform(Y)
- assert_almost_equal(X, Xlike, decimal=3)
+ Y_inverse = pca.inverse_transform(Y)
+ relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max()
+ assert_almost_equal(relative_max_delta, 0.11, decimal=2)
def test_randomized_pca_check_projection():
@@ -107,8 +118,32 @@ def test_randomized_pca_check_projection():
np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
+def test_randomized_pca_inverse():
+ """Test that RandomizedPCA is invertible on dense data"""
+ np.random.seed(0)
+ n, p = 50, 3
+ X = randn(n, p) # spherical data
+ X[:, 1] *= .00001 # make middle component relatively small
+ X += [5, 4, 3] # make a large mean
+
+ # same check that we can find the original data from the transformed signal
+ # (since the data is almost of rank n_components)
+ pca = RandomizedPCA(n_components=2).fit(X)
+ Y = pca.transform(X)
+ Y_inverse = pca.inverse_transform(Y)
+ assert_almost_equal(X, Y_inverse, decimal=2)
+
+ # same as above with whitening (approximate reconstruction)
+ pca = RandomizedPCA(n_components=2, whiten=True).fit(X)
+ Y = pca.transform(X)
+ Y_inverse = pca.inverse_transform(Y)
+ relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max()
+ assert_almost_equal(relative_max_delta, 0.11, decimal=2)
+
+
def test_sparse_randomized_pca_check_projection():
"""Test that the projection by RandomizedPCA on sparse data is correct"""
+ np.random.seed(0)
n, p = 100, 3
X = randn(n, p) * .1
X[:10] += np.array([3, 4, 5])
@@ -122,14 +157,41 @@ def test_sparse_randomized_pca_check_projection():
np.testing.assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
+def test_sparse_randomized_pca_inverse():
+ """Test that RandomizedPCA is invertible on sparse data"""
+ np.random.seed(0)
+ n, p = 50, 3
+ X = randn(n, p) # spherical data
+ X[:, 1] *= .00001 # make middle component relatively small
+ # no large means because the sparse version of randomized pca does not do
+ # centering to avoid breaking the sparsity
+ X = csr_matrix(X)
+
+ # same check that we can find the original data from the transformed signal
+ # (since the data is almost of rank n_components)
+ pca = RandomizedPCA(n_components=2).fit(X)
+ Y = pca.transform(X)
+ Y_inverse = pca.inverse_transform(Y)
+ assert_almost_equal(X.todense(), Y_inverse, decimal=2)
+
+ # same as above with whitening (approximate reconstruction)
+ pca = RandomizedPCA(n_components=2, whiten=True).fit(X)
+ Y = pca.transform(X)
+ Y_inverse = pca.inverse_transform(Y)
+ relative_max_delta = (np.abs(X.todense() - Y_inverse)
+ / np.abs(X).mean()).max()
+ # XXX: this does not seem to work as expected:
+ assert_almost_equal(relative_max_delta, 0.91, decimal=2)
+
+
def test_pca_dim():
"""Check automated dimensionality setting"""
+ np.random.seed(0)
n, p = 100, 5
X = randn(n, p) * .1
X[:10] += np.array([3, 4, 5, 1, 2])
- pca = PCA(n_components='mle')
- pca.fit(X)
- assert_true(pca.n_components == 1)
+ pca = PCA(n_components='mle').fit(X)
+ assert_equal(pca.n_components, 1)
def test_infer_dim_1():

0 comments on commit b1714ba

Please sign in to comment.