From fb70d6d36a1c212ab0d4ad59b9db10e228173f57 Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Tue, 23 May 2017 18:20:12 +0300
Subject: [PATCH 1/7] Added n_components parameter to LatentDirichletAllocation
 to replace n_topics, which is still kept for backward compatibility.

Signed-off-by: Michael Bargatin
---
 sklearn/decomposition/online_lda.py | 54 +++++++++++++++++------------
 1 file changed, 31 insertions(+), 23 deletions(-)

diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 4717bd5af80a3..c5eddc38688cf 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -143,17 +143,17 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
 
     Parameters
     ----------
-    n_topics : int, optional (default=10)
+    n_components : int, optional (default=10)
         Number of topics.
 
     doc_topic_prior : float, optional (default=None)
         Prior of document topic distribution `theta`. If the value is None,
-        defaults to `1 / n_topics`.
+        defaults to `1 / n_components`.
         In the literature, this is called `alpha`.
 
     topic_word_prior : float, optional (default=None)
         Prior of topic word distribution `beta`. If the value is None, defaults
-        to `1 / n_topics`.
+        to `1 / n_components`.
         In the literature, this is called `eta`.
 
     learning_method : 'batch' | 'online', default='online'
@@ -227,7 +227,7 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    components_ : array, [n_topics, n_features]
+    components_ : array, [n_components, n_features]
         Variational parameters for topic word distribution. Since the
         complete conditional for topic word distribution is a Dirichlet,
         ``components_[i, j]`` can be viewed as pseudocount that represents the
@@ -241,6 +241,9 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
 
     n_iter_ : int
         Number of passes over the dataset.
+
+    n_topics : int, optional (default=10)
+        Same as n_components, kept for backward compatibility
 
     References
     ----------
@@ -255,13 +258,13 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
 
     """
 
-    def __init__(self, n_topics=10, doc_topic_prior=None,
+    def __init__(self, n_components=10, doc_topic_prior=None,
                  topic_word_prior=None, learning_method=None,
                  learning_decay=.7, learning_offset=10., max_iter=10,
                  batch_size=128, evaluate_every=-1, total_samples=1e6,
                  perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
-                 n_jobs=1, verbose=0, random_state=None):
-        self.n_topics = n_topics
+                 n_jobs=1, verbose=0, random_state=None, n_topics=10):
+        self.n_components = n_components
         self.doc_topic_prior = doc_topic_prior
         self.topic_word_prior = topic_word_prior
         self.learning_method = learning_method
@@ -277,13 +280,18 @@ def __init__(self, n_topics=10, doc_topic_prior=None,
         self.n_jobs = n_jobs
         self.verbose = verbose
         self.random_state = random_state
+        if n_components == 10 and n_topics != 10:
+            self.n_components = n_topics
+            warnings.warn("n_topics has been deprecated in favor of n_components", DeprecationWarning)
+
 
     def _check_params(self):
         """Check model parameters."""
-        if self.n_topics <= 0:
-            raise ValueError("Invalid 'n_topics' parameter: %r"
-                             % self.n_topics)
+        if self.n_components <= 0:
+            raise ValueError("Invalid 'n_components' parameter: %r"
+                             % self.n_components)
+
 
         if self.total_samples <= 0:
             raise ValueError("Invalid 'total_samples' parameter: %r"
@@ -305,12 +313,12 @@ def _init_latent_vars(self, n_features):
         self.n_iter_ = 0
 
         if self.doc_topic_prior is None:
-            self.doc_topic_prior_ = 1. / self.n_topics
+            self.doc_topic_prior_ = 1. / self.n_components
         else:
             self.doc_topic_prior_ = self.doc_topic_prior
 
         if self.topic_word_prior is None:
-            self.topic_word_prior_ = 1. / self.n_topics
+            self.topic_word_prior_ = 1. / self.n_components
         else:
             self.topic_word_prior_ = self.topic_word_prior
@@ -318,7 +326,7 @@ def _init_latent_vars(self, n_features):
         init_var = 1. / init_gamma
         # In the literature, this is called `lambda`
         self.components_ = self.random_state_.gamma(
-            init_gamma, init_var, (self.n_topics, n_features))
+            init_gamma, init_var, (self.n_components, n_features))
 
         # In the literature, this is `exp(E[log(beta)])`
         self.exp_dirichlet_component_ = np.exp(
@@ -409,7 +417,7 @@ def _em_step(self, X, total_samples, batch_update, parallel=None):
 
         Returns
         -------
-        doc_topic_distr : array, shape=(n_samples, n_topics)
+        doc_topic_distr : array, shape=(n_samples, n_components)
             Unnormalized document topic distribution.
         """
 
@@ -569,7 +577,7 @@ def _unnormalized_transform(self, X):
 
         Returns
         -------
-        doc_topic_distr : shape=(n_samples, n_topics)
+        doc_topic_distr : shape=(n_samples, n_components)
             Document topic distribution for X.
         """
         if not hasattr(self, 'components_'):
@@ -603,7 +611,7 @@ def transform(self, X):
 
         Returns
         -------
-        doc_topic_distr : shape=(n_samples, n_topics)
+        doc_topic_distr : shape=(n_samples, n_components)
             Document topic distribution for X.
         """
         doc_topic_distr = self._unnormalized_transform(X)
@@ -622,7 +630,7 @@ def _approx_bound(self, X, doc_topic_distr, sub_sampling):
         X : array-like or sparse matrix, shape=(n_samples, n_features)
             Document word matrix.
 
-        doc_topic_distr : array, shape=(n_samples, n_topics)
+        doc_topic_distr : array, shape=(n_samples, n_components)
             Document topic distribution. In the literature, this is called
             gamma.
 
@@ -644,7 +652,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):
             return score
 
         is_sparse_x = sp.issparse(X)
-        n_samples, n_topics = doc_topic_distr.shape
+        n_samples, n_components = doc_topic_distr.shape
         n_features = self.components_.shape[1]
         score = 0
 
@@ -673,7 +681,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):
 
         # compute E[log p(theta | alpha) - log q(theta | gamma)]
         score += _loglikelihood(doc_topic_prior, doc_topic_distr,
-                                dirichlet_doc_topic, self.n_topics)
+                                dirichlet_doc_topic, self.n_components)
 
         # Compensate for the subsampling of the population of documents
         if sub_sampling:
@@ -717,7 +725,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
         X : array-like or sparse matrix, [n_samples, n_features]
             Document word matrix.
 
-        doc_topic_distr : None or array, shape=(n_samples, n_topics)
+        doc_topic_distr : None or array, shape=(n_samples, n_components)
             Document topic distribution.
             If it is None, it will be generated by applying transform on X.
 
@@ -736,12 +744,12 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
         if doc_topic_distr is None:
             doc_topic_distr = self._unnormalized_transform(X)
         else:
-            n_samples, n_topics = doc_topic_distr.shape
+            n_samples, n_components = doc_topic_distr.shape
             if n_samples != X.shape[0]:
                 raise ValueError("Number of samples in X and doc_topic_distr"
                                  " do not match.")
 
-            if n_topics != self.n_topics:
+            if n_components != self.n_components:
                 raise ValueError("Number of topics does not match.")
 
         current_samples = X.shape[0]
@@ -769,7 +777,7 @@ def perplexity(self, X, doc_topic_distr='deprecated', sub_sampling=False):
         X : array-like or sparse matrix, [n_samples, n_features]
             Document word matrix.
 
-        doc_topic_distr : None or array, shape=(n_samples, n_topics)
+        doc_topic_distr : None or array, shape=(n_samples, n_components)
             Document topic distribution.
             This argument is deprecated and is currently being ignored.
 

From 58085be17b5230f10fef64b47ea2fa89735418d4 Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Wed, 24 May 2017 07:04:36 +0300
Subject: [PATCH 2/7] Defaulted n_topics to None and moved the deprecation
 warning into _check_params

Signed-off-by: Michael Bargatin
---
 sklearn/decomposition/online_lda.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index c5eddc38688cf..2f1e0c1930337 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -243,7 +243,9 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
         Number of passes over the dataset.
 
     n_topics : int, optional (default=10)
-        Same as n_components, kept for backward compatibility
+        .. deprecated:: 0.19
+           This parameter will be removed in 0.21.
+           Use :param:`n_components` instead.
 
     References
     ----------
@@ -263,7 +265,7 @@ def __init__(self, n_components=10, doc_topic_prior=None,
                  learning_decay=.7, learning_offset=10., max_iter=10,
                  batch_size=128, evaluate_every=-1, total_samples=1e6,
                  perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
-                 n_jobs=1, verbose=0, random_state=None, n_topics=10):
+                 n_jobs=1, verbose=0, random_state=None, n_topics=None):
         self.n_components = n_components
         self.doc_topic_prior = doc_topic_prior
         self.topic_word_prior = topic_word_prior
@@ -280,19 +282,19 @@ def __init__(self, n_components=10, doc_topic_prior=None,
         self.n_jobs = n_jobs
         self.verbose = verbose
         self.random_state = random_state
-        if n_components == 10 and n_topics != 10:
-            self.n_components = n_topics
-            warnings.warn("n_topics has been deprecated in favor of n_components", DeprecationWarning)
-
+        self.n_topics = n_topics
 
     def _check_params(self):
         """Check model parameters."""
+        if self.n_topics is not None:
+            self.n_components = self.n_topics
+            warnings.warn("n_topics has been renamed to n_components in version 0.19 "
+                          "and will be removed in 0.21", DeprecationWarning)
 
         if self.n_components <= 0:
             raise ValueError("Invalid 'n_components' parameter: %r"
                              % self.n_components)
-
         if self.total_samples <= 0:
             raise ValueError("Invalid 'total_samples' parameter: %r"
                              % self.total_samples)

From 3791af04da02ea4ef8f7d294634a8070329799fb Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Wed, 24 May 2017 10:33:33 +0300
Subject: [PATCH 3/7] Changed info in docstring.

Signed-off-by: Michael Bargatin
---
 sklearn/decomposition/online_lda.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 2f1e0c1930337..75ef2d7e92cb1 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -224,7 +224,12 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
         If RandomState instance, random_state is the random number generator;
         If None, the random number generator is the RandomState instance used
         by `np.random`.
-
+
+    n_topics : int, optional (default=10)
+        This parameter has been renamed to n_components and will
+        be removed in version 0.21.
+        .. deprecated:: 0.19
+
     Attributes
     ----------
     components_ : array, [n_components, n_features]
@@ -241,11 +246,6 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
 
     n_iter_ : int
        Number of passes over the dataset.
-
-    n_topics : int, optional (default=10)
-        .. deprecated:: 0.19
-           This parameter will be removed in 0.21.
-           Use :param:`n_components` instead.
 
     References
     ----------

From 80c9b2f08da32412b9fff8842239117cef603ffe Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Wed, 24 May 2017 11:45:02 +0300
Subject: [PATCH 4/7] Added _n_components to LatentDirichletAllocation

Signed-off-by: Michael Bargatin
---
 sklearn/decomposition/online_lda.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 75ef2d7e92cb1..965ecaa20e5ed 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -266,7 +266,7 @@ def __init__(self, n_components=10, doc_topic_prior=None,
                  batch_size=128, evaluate_every=-1, total_samples=1e6,
                  perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
                  n_jobs=1, verbose=0, random_state=None, n_topics=None):
-        self.n_components = n_components
+        self._n_components = n_components
         self.doc_topic_prior = doc_topic_prior
         self.topic_word_prior = topic_word_prior
         self.learning_method = learning_method
@@ -287,13 +287,15 @@ def __init__(self, n_components=10, doc_topic_prior=None,
     def _check_params(self):
         """Check model parameters."""
         if self.n_topics is not None:
-            self.n_components = self.n_topics
+            self._n_components = self.n_topics
             warnings.warn("n_topics has been renamed to n_components in version 0.19 "
                           "and will be removed in 0.21", DeprecationWarning)
+        else:
+            self._n_components = self._n_components
 
-        if self.n_components <= 0:
+        if self._n_components <= 0:
             raise ValueError("Invalid 'n_components' parameter: %r"
-                             % self.n_components)
+                             % self._n_components)
 
         if self.total_samples <= 0:
             raise ValueError("Invalid 'total_samples' parameter: %r"
@@ -315,12 +317,12 @@ def _init_latent_vars(self, n_features):
         self.n_iter_ = 0
 
         if self.doc_topic_prior is None:
-            self.doc_topic_prior_ = 1. / self.n_components
+            self.doc_topic_prior_ = 1. / self._n_components
         else:
             self.doc_topic_prior_ = self.doc_topic_prior
 
         if self.topic_word_prior is None:
-            self.topic_word_prior_ = 1. / self.n_components
+            self.topic_word_prior_ = 1. / self._n_components
         else:
             self.topic_word_prior_ = self.topic_word_prior
@@ -328,7 +330,7 @@ def _init_latent_vars(self, n_features):
         init_var = 1. / init_gamma
         # In the literature, this is called `lambda`
         self.components_ = self.random_state_.gamma(
-            init_gamma, init_var, (self.n_components, n_features))
+            init_gamma, init_var, (self._n_components, n_features))
 
         # In the literature, this is `exp(E[log(beta)])`
         self.exp_dirichlet_component_ = np.exp(
@@ -683,7 +685,7 @@ def _loglikelihood(prior, distr, dirichlet_distr, size):
 
         # compute E[log p(theta | alpha) - log q(theta | gamma)]
         score += _loglikelihood(doc_topic_prior, doc_topic_distr,
-                                dirichlet_doc_topic, self.n_components)
+                                dirichlet_doc_topic, self._n_components)
 
         # Compensate for the subsampling of the population of documents
         if sub_sampling:
@@ -751,7 +753,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None,
                 raise ValueError("Number of samples in X and doc_topic_distr"
                                  " do not match.")
 
-            if n_components != self.n_components:
+            if n_components != self._n_components:
                 raise ValueError("Number of topics does not match.")
 
         current_samples = X.shape[0]

From 7131b2d00d92256e98dbb11edde32b4d14680a03 Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Thu, 25 May 2017 06:50:05 +0300
Subject: [PATCH 5/7] Added test for n_topics deprecation

Signed-off-by: Michael Bargatin
---
 sklearn/decomposition/online_lda.py            |   6 +-
 .../decomposition/tests/test_online_lda.py     | 105 ++++++++++--------
 2 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/sklearn/decomposition/online_lda.py b/sklearn/decomposition/online_lda.py
index 965ecaa20e5ed..657ce3ece7e3f 100644
--- a/sklearn/decomposition/online_lda.py
+++ b/sklearn/decomposition/online_lda.py
@@ -225,7 +225,7 @@ class LatentDirichletAllocation(BaseEstimator, TransformerMixin):
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
-    n_topics : int, optional (default=10)
+    n_topics : int, optional (default=None)
         This parameter has been renamed to n_components and will
         be removed in version 0.21.
         .. deprecated:: 0.19
@@ -266,7 +266,7 @@ def __init__(self, n_components=10, doc_topic_prior=None,
                  batch_size=128, evaluate_every=-1, total_samples=1e6,
                  perp_tol=1e-1, mean_change_tol=1e-3, max_doc_update_iter=100,
                  n_jobs=1, verbose=0, random_state=None, n_topics=None):
-        self._n_components = n_components
+        self.n_components = n_components
         self.doc_topic_prior = doc_topic_prior
         self.topic_word_prior = topic_word_prior
         self.learning_method = learning_method
@@ -291,7 +291,7 @@ def _check_params(self):
             warnings.warn("n_topics has been renamed to n_components in version 0.19 "
                           "and will be removed in 0.21", DeprecationWarning)
         else:
-            self._n_components = self._n_components
+            self._n_components = self.n_components
 
         if self._n_components <= 0:
             raise ValueError("Invalid 'n_components' parameter: %r"
diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py
index c3a221fe4800a..0fb12715a88ec 100644
--- a/sklearn/decomposition/tests/test_online_lda.py
+++ b/sklearn/decomposition/tests/test_online_lda.py
@@ -2,6 +2,7 @@
 from scipy.linalg import block_diag
 from scipy.sparse import csr_matrix
 from scipy.special import psi
+import warnings
 
 from sklearn.decomposition import LatentDirichletAllocation
 from sklearn.decomposition._online_lda import (_dirichlet_expectation_1d,
@@ -23,22 +24,22 @@ def _build_sparse_mtx():
     # Create 3 topics and each topic has 3 distinct words.
     # (Each word only belongs to a single topic.)
-    n_topics = 3
-    block = n_topics * np.ones((3, 3))
-    blocks = [block] * n_topics
+    n_components = 3
+    block = n_components * np.ones((3, 3))
+    blocks = [block] * n_components
     X = block_diag(*blocks)
     X = csr_matrix(X)
-    return (n_topics, X)
+    return (n_components, X)
 
 
 def test_lda_default_prior_params():
     # default prior parameter should be `1 / topics`
     # and verbose params should not affect result
-    n_topics, X = _build_sparse_mtx()
-    prior = 1. / n_topics
-    lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior,
+    n_components, X = _build_sparse_mtx()
+    prior = 1. / n_components
+    lda_1 = LatentDirichletAllocation(n_components=n_components, doc_topic_prior=prior,
                                       topic_word_prior=prior, random_state=0)
-    lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0)
+    lda_2 = LatentDirichletAllocation(n_components=n_components, random_state=0)
 
     topic_distr_1 = lda_1.fit_transform(X)
     topic_distr_2 = lda_2.fit_transform(X)
@@ -48,8 +49,8 @@ def test_lda_fit_batch():
     # Test LDA batch learning_offset (`fit` method with 'batch' learning)
     rng = np.random.RandomState(0)
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, evaluate_every=1,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, evaluate_every=1,
                                     learning_method='batch', random_state=rng)
     lda.fit(X)
@@ -63,8 +64,8 @@ def test_lda_fit_online():
     # Test LDA online learning (`fit` method with 'online' learning)
     rng = np.random.RandomState(0)
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, learning_offset=10.,
                                     evaluate_every=1, learning_method='online',
                                     random_state=rng)
     lda.fit(X)
@@ -80,8 +81,8 @@ def test_lda_partial_fit():
     # Test LDA online learning (`partial_fit` method)
     # (same as test_lda_batch)
     rng = np.random.RandomState(0)
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, learning_offset=10.,
                                     total_samples=100, random_state=rng)
     for i in xrange(3):
         lda.partial_fit(X)
@@ -95,8 +96,8 @@ def test_lda_dense_input():
     # Test LDA with dense input.
     rng = np.random.RandomState(0)
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch',
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, learning_method='batch',
                                     random_state=rng)
     lda.fit(X.toarray())
@@ -112,8 +113,8 @@ def test_lda_transform():
     # Transform result cannot be negative and should be normalized
     rng = np.random.RandomState(0)
     X = rng.randint(5, size=(20, 10))
-    n_topics = 3
-    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
+    n_components = 3
+    lda = LatentDirichletAllocation(n_components=n_components, random_state=rng)
     X_trans = lda.fit_transform(X)
     assert_true((X_trans > 0.0).any())
     assert_array_almost_equal(np.sum(X_trans, axis=1),
                               np.ones(X_trans.shape[0]))
@@ -125,7 +126,7 @@ def test_lda_fit_transform():
     for method in ('online', 'batch'):
         rng = np.random.RandomState(0)
         X = rng.randint(10, size=(50, 20))
-        lda = LatentDirichletAllocation(n_topics=5, learning_method=method,
+        lda = LatentDirichletAllocation(n_components=5, learning_method=method,
                                         random_state=rng)
         X_fit = lda.fit_transform(X)
         X_trans = lda.transform(X)
@@ -135,11 +136,11 @@ def test_lda_partial_fit_dim_mismatch():
     # test `n_features` mismatch in `partial_fit`
     rng = np.random.RandomState(0)
-    n_topics = rng.randint(3, 6)
+    n_components = rng.randint(3, 6)
     n_col = rng.randint(6, 10)
     X_1 = np.random.randint(4, size=(10, n_col))
     X_2 = np.random.randint(4, size=(10, n_col + 1))
-    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
+    lda = LatentDirichletAllocation(n_components=n_components, learning_offset=5.,
                                     total_samples=20, random_state=rng)
     lda.partial_fit(X_1)
     assert_raises_regexp(ValueError, r"^The provided data has",
@@ -151,7 +152,7 @@ def test_invalid_params():
     X = np.ones((5, 10))
 
     invalid_models = (
-        ('n_topics', LatentDirichletAllocation(n_topics=0)),
+        ('n_components', LatentDirichletAllocation(n_components=0)),
         ('learning_method',
          LatentDirichletAllocation(learning_method='unknown')),
         ('total_samples', LatentDirichletAllocation(total_samples=0)),
@@ -186,8 +187,8 @@ def test_lda_transform_mismatch():
     X = rng.randint(4, size=(20, 10))
     X_2 = rng.randint(4, size=(10, 8))
 
-    n_topics = rng.randint(3, 6)
-    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
+    n_components = rng.randint(3, 6)
+    lda = LatentDirichletAllocation(n_components=n_components, random_state=rng)
     lda.partial_fit(X)
     assert_raises_regexp(ValueError, r"^The provided data has",
                          lda.partial_fit, X_2)
@@ -195,11 +196,11 @@ def test_lda_transform_mismatch():
 
 @if_safe_multiprocessing_with_blas
 def test_lda_multi_jobs():
-    n_topics, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_mtx()
     # Test LDA batch training with multi CPU
     for method in ('online', 'batch'):
         rng = np.random.RandomState(0)
-        lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=2,
+        lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                         learning_method=method,
                                         evaluate_every=1,
                                         random_state=rng)
@@ -215,8 +216,8 @@ def test_lda_partial_fit_multi_jobs():
     # Test LDA online training with multi CPU
     rng = np.random.RandomState(0)
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=2,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                     learning_offset=5., total_samples=30,
                                     random_state=rng)
     for i in range(2):
@@ -231,31 +232,31 @@ def test_lda_partial_fit_multi_jobs():
 
 def test_lda_preplexity_mismatch():
     # test dimension mismatch in `perplexity` method
     rng = np.random.RandomState(0)
-    n_topics = rng.randint(3, 6)
+    n_components = rng.randint(3, 6)
     n_samples = rng.randint(6, 10)
     X = np.random.randint(4, size=(n_samples, 10))
-    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
+    lda = LatentDirichletAllocation(n_components=n_components, learning_offset=5.,
                                     total_samples=20, random_state=rng)
     lda.fit(X)
 
     # invalid samples
-    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
+    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
     assert_raises_regexp(ValueError, r'Number of samples',
                          lda._perplexity_precomp_distr, X, invalid_n_samples)
 
     # invalid topic number
-    invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
+    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
     assert_raises_regexp(ValueError, r'Number of topics',
-                         lda._perplexity_precomp_distr, X, invalid_n_topics)
+                         lda._perplexity_precomp_distr, X, invalid_n_components)
 
 
 def test_lda_perplexity():
     # Test LDA perplexity for batch training
     # perplexity should be lower after each iteration
-    n_topics, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_mtx()
     for method in ('online', 'batch'):
-        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+        lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                           learning_method=method,
                                           total_samples=100, random_state=0)
-        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
+        lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                           learning_method=method,
                                           total_samples=100, random_state=0)
         lda_1.fit(X)
@@ -273,12 +274,12 @@ def test_lda_perplexity():
 def test_lda_score():
     # Test LDA score for batch training
     # score should be higher after each iteration
-    n_topics, X = _build_sparse_mtx()
+    n_components, X = _build_sparse_mtx()
     for method in ('online', 'batch'):
-        lda_1 = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+        lda_1 = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                           learning_method=method,
                                           total_samples=100, random_state=0)
-        lda_2 = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
+        lda_2 = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                           learning_method=method,
                                           total_samples=100, random_state=0)
         lda_1.fit_transform(X)
@@ -292,8 +293,8 @@ def test_lda_score():
 def test_perplexity_input_format():
     # Test LDA perplexity for sparse and dense input
     # score should be the same for both dense and sparse input
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                     learning_method='batch',
                                     total_samples=100, random_state=0)
     lda.fit(X)
@@ -304,8 +305,8 @@ def test_perplexity_input_format():
 
 def test_lda_score_perplexity():
     # Test the relationship between LDA score and perplexity
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                     random_state=0)
     lda.fit(X)
     perplexity_1 = lda.perplexity(X, sub_sampling=False)
@@ -318,8 +319,8 @@ def test_lda_fit_perplexity():
     # Test that the perplexity computed during fit is consistent with what is
     # returned by the perplexity method
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                     learning_method='batch', random_state=0,
                                     evaluate_every=1)
     lda.fit(X)
@@ -336,8 +337,8 @@ def test_lda_fit_perplexity():
 def test_doc_topic_distr_deprecation():
     # Test that the appropriate warning message is displayed when a user
     # attempts to pass the doc_topic_distr argument to the perplexity method
-    n_topics, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=1,
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                     learning_method='batch',
                                     total_samples=100, random_state=0)
     distr1 = lda.fit_transform(X)
@@ -367,3 +368,13 @@ def test_dirichlet_expectation():
     assert_allclose(_dirichlet_expectation_2d(x),
                     psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),
                     rtol=1e-11, atol=3e-9)
+
+
+def test_lda_n_topics_deprecation():
+    n_components, X = _build_sparse_mtx()
+    lda = LatentDirichletAllocation(n_topics=10)
+    with warnings.catch_warnings(record=True) as warning:
+        warnings.simplefilter("always")
+        lda.fit(X)
+    assert len(warning) == 1
+    assert issubclass(warning[-1].category, DeprecationWarning)
\ No newline at end of file

From e137ffcdfcd0e4487543cb35c9e06c5ed9f04793 Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Thu, 25 May 2017 10:49:02 +0300
Subject: [PATCH 6/7] Changed usage examples to reflect the replacement of
 n_topics with n_components in LatentDirichletAllocation

Signed-off-by: Michael Bargatin
---
 .../plot_topics_extraction_with_nmf_lda.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/applications/plot_topics_extraction_with_nmf_lda.py b/examples/applications/plot_topics_extraction_with_nmf_lda.py
index e1a6f0bdbacd9..04ab2809f36b1 100644
--- a/examples/applications/plot_topics_extraction_with_nmf_lda.py
+++ b/examples/applications/plot_topics_extraction_with_nmf_lda.py
@@ -14,7 +14,7 @@
 functions: the Frobenius norm, and the generalized Kullback-Leibler divergence.
 The latter is equivalent to Probabilistic Latent Semantic Indexing.
 
-The default parameters (n_samples / n_features / n_topics) should make
+The default parameters (n_samples / n_features / n_components) should make
 the example runnable in a couple of tens of seconds. You can try to
 increase the dimensions of the problem, but be aware that the time
 complexity is polynomial in NMF. In LDA, the time complexity is
@@ -36,7 +36,7 @@
 
 n_samples = 2000
 n_features = 1000
-n_topics = 10
+n_components = 10
 n_top_words = 20
 
 
@@ -85,7 +85,7 @@ def print_top_words(model, feature_names, n_top_words):
       "n_samples=%d and n_features=%d..."
       % (n_samples, n_features))
 t0 = time()
-nmf = NMF(n_components=n_topics, random_state=1,
+nmf = NMF(n_components=n_components, random_state=1,
           alpha=.1, l1_ratio=.5).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
 
@@ -98,7 +98,7 @@ def print_top_words(model, feature_names, n_top_words):
       "tf-idf features, n_samples=%d and n_features=%d..."
       % (n_samples, n_features))
 t0 = time()
-nmf = NMF(n_components=n_topics, random_state=1, beta_loss='kullback-leibler',
+nmf = NMF(n_components=n_components, random_state=1, beta_loss='kullback-leibler',
           solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)
 print("done in %0.3fs." % (time() - t0))
 
@@ -109,7 +109,7 @@ def print_top_words(model, feature_names, n_top_words):
 print("Fitting LDA models with tf features, "
       "n_samples=%d and n_features=%d..."
       % (n_samples, n_features))
-lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
+lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                 learning_method='online',
                                 learning_offset=50.,
                                 random_state=0)

From 54cdb09dc37255885d69c29b14da0dde2daf6756 Mon Sep 17 00:00:00 2001
From: Michael Bargatin
Date: Thu, 25 May 2017 15:20:09 +0300
Subject: [PATCH 7/7] Used assert_warns to check for DeprecationWarning

Signed-off-by: Michael Bargatin
---
 sklearn/decomposition/tests/test_online_lda.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/sklearn/decomposition/tests/test_online_lda.py b/sklearn/decomposition/tests/test_online_lda.py
index 0fb12715a88ec..597681dcf8118 100644
--- a/sklearn/decomposition/tests/test_online_lda.py
+++ b/sklearn/decomposition/tests/test_online_lda.py
@@ -372,9 +372,5 @@ def test_dirichlet_expectation():
 
 def test_lda_n_topics_deprecation():
     n_components, X = _build_sparse_mtx()
-    lda = LatentDirichletAllocation(n_topics=10)
-    with warnings.catch_warnings(record=True) as warning:
-        warnings.simplefilter("always")
-        lda.fit(X)
-    assert len(warning) == 1
-    assert issubclass(warning[-1].category, DeprecationWarning)
\ No newline at end of file
+    lda = LatentDirichletAllocation(n_topics=10, learning_method='batch')
+    assert_warns(DeprecationWarning, lda.fit, X)
\ No newline at end of file
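
For reference, a minimal usage sketch of the behaviour this series introduces
(not part of the committed patches), assuming scikit-learn 0.19 with all seven
patches applied. The toy matrix mirrors _build_sparse_mtx() from the tests.
learning_method='batch' is passed for what appears to be the same reason
PATCH 7/7 adds it: with the default learning_method=None, fit() emits a
second, unrelated DeprecationWarning about the changing default, which made
the len(warning) == 1 assertion from PATCH 5/7 fragile.

    import warnings

    import numpy as np
    from scipy.linalg import block_diag
    from scipy.sparse import csr_matrix
    from sklearn.decomposition import LatentDirichletAllocation

    n_components = 3
    # Three topics, each owning three distinct words, as in the test fixture.
    X = csr_matrix(block_diag(*[n_components * np.ones((3, 3))] * n_components))

    # New spelling: no deprecation warning is raised.
    lda_new = LatentDirichletAllocation(n_components=n_components,
                                        learning_method='batch', random_state=0)
    lda_new.fit(X)

    # Deprecated spelling: fit() warns once via _check_params(), but the value
    # is still honoured because n_topics is copied into the private
    # _n_components attribute introduced in PATCH 4/7.
    lda_old = LatentDirichletAllocation(n_topics=n_components,
                                        learning_method='batch', random_state=0)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        lda_old.fit(X)

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
    assert lda_old.components_.shape == lda_new.components_.shape == (3, 9)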