offer alternative show_topic[s] for LSA and LDA models, which can ret…

…urn arrays instead of logging formatted strings
strongh · Aug 26, 2011 · 5b10b6d · 5b10b6d
1 parent 9bef5b0
commit 5b10b6d
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 13 deletions.
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -540,8 +540,10 @@ def bound(self, corpus, gamma=None):
 
         return score
 
-
     def print_topics(self, topics=10, topn=10):
+        self.show_topics(topics, topn, True)
+
+    def show_topics(self, topics=10, topn=10, log=False, formatted=True):
         """
         Print the `topN` most probable words for (randomly selected) `topics`
         number of topics. Set `topics=-1` to print all topics.
@@ -554,16 +556,26 @@ def print_topics(self, topics=10, topn=10):
             # print all topics if `topics` is negative
             topics = self.num_topics
         topics = min(topics, self.num_topics)
+        shown  = []
         for i in xrange(topics):
-            logger.info("topic #%i: %s" % (i, self.print_topic(i, topn=topn)))
-
+            if formatted:
+                topic = self.print_topic(i, topn=topn)
+            else:
+                topic = self.show_topic(i, topn=topn)
+            shown.append(topic)
+            if log:
+                logger.info("topic #%i: %s" % (i, topic))
+        return shown
 
-    def print_topic(self, topicid, topn=10):
+    def show_topic(self, topicid, topn=10):
         topic = self.expElogbeta[topicid]
         topic = topic / topic.sum() # normalize to probability dist
         bestn = numpy.argsort(topic)[::-1][:topn]
-        beststr = ['%.3f*%s' % (topic[id], self.id2word[id]) for id in bestn]
-        return ' + '.join(beststr)
+        beststr = [(topic[id], self.id2word[id]) for id in bestn]
+        return beststr
+
+    def print_topic(self, topicid, topn=10):
+        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, topn)])
 
 
     def __getitem__(self, bow, eps=0.01):

diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
@@ -425,7 +425,7 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
         return result
 
 
-    def print_topic(self, topicno, topn=10):
+    def show_topic(self, topicno, topn=10):
         """
         Return a specified topic (=left singular vector), 0 <= `topicno` < `self.num_topics`,
         as string.
@@ -445,16 +445,28 @@ def print_topic(self, topicno, topn=10):
         c = numpy.asarray(self.projection.u.T[topicno, :]).flatten()
         norm = numpy.sqrt(numpy.sum(numpy.dot(c, c)))
         most = numpy.abs(c).argsort()[::-1][:topn]
-        return ' + '.join(['%.3f*"%s"' % (1.0 * c[val] / norm, self.id2word[val]) for val in most])
-
+        return [(1.0 * c[val] / norm, self.id2word[val]) for val in most]
 
-    def print_topics(self, num_topics=5, num_words=10):
+    def print_topic(self, topicno, topn=10):
+        return ' + '.join(['%.3f*"%s"' % v for v in self.show_topic(topicno, topn)])
+
+    def show_topics(self, num_topics=5, num_words=10, log=False, formatted=True):
+        shown = []
         for i in xrange(min(num_topics, self.num_topics)):
             if i < len(self.projection.s):
-                logger.info("topic #%i(%.3f): %s" %
-                            (i, self.projection.s[i],
-                             self.print_topic(i, topn=num_words)))
+                if formatted:
+                    topic = self.print_topic(i, topn=num_words)
+                else:
+                    topic = self.show_topic(i, topn=num_words)
+                shown.append(topic)
+                if log:
+                    logger.info("topic #%i(%.3f): %s" %
+                                (i, self.projection.s[i],
+                                 topic))
+        return shown
 
+    def print_topics(self, num_topics=5, num_words=10):  # log topics via show_topics
+        self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
 
     def print_debug(self, num_topics=5, num_words=10):
         """