Skip to content

Commit

Permalink
offer alternative show_topic[s] for LSA and LDA models, which can ret…
Browse files Browse the repository at this point in the history
…urn arrays instead of logging formatted strings
  • Loading branch information
strongh committed Aug 26, 2011
1 parent 9bef5b0 commit 5b10b6d
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
24 changes: 18 additions & 6 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,8 +540,10 @@ def bound(self, corpus, gamma=None):

return score


def print_topics(self, topics=10, topn=10):
    """
    Log `topics` topics, each described by its `topn` words.

    Convenience wrapper: delegates to `show_topics` with logging switched
    on. The list that `show_topics` returns is discarded, so this method
    returns None.
    """
    self.show_topics(topics, topn, log=True)

def show_topics(self, topics=10, topn=10, log=False, formatted=True):
"""
Print the `topN` most probable words for (randomly selected) `topics`
number of topics. Set `topics=-1` to print all topics.
Expand All @@ -554,16 +556,26 @@ def print_topics(self, topics=10, topn=10):
# print all topics if `topics` is negative
topics = self.num_topics
topics = min(topics, self.num_topics)
shown = []
for i in xrange(topics):
logger.info("topic #%i: %s" % (i, self.print_topic(i, topn=topn)))

if formatted:
topic = self.print_topic(i, topn=topn)
else:
topic = self.show_topic(i, topn=topn)
shown.append(topic)
if log:
logger.info("topic #%i: %s" % (i, topic))
return shown

def print_topic(self, topicid, topn=10):
def show_topic(self, topicid, topn=10):
topic = self.expElogbeta[topicid]
topic = topic / topic.sum() # normalize to probability dist
bestn = numpy.argsort(topic)[::-1][:topn]
beststr = ['%.3f*%s' % (topic[id], self.id2word[id]) for id in bestn]
return ' + '.join(beststr)
beststr = [(topic[id], self.id2word[id]) for id in bestn]
return beststr

def print_topic(self, topicid, topn=10):
    """
    Return topic `topicid` rendered as a single string.

    Each (weight, word) pair obtained from `show_topic` is formatted as
    `weight*word` with the weight shown to three decimals, and the terms
    are joined with ' + '. An empty result from `show_topic` gives ''.
    """
    terms = ('%.3f*%s' % pair for pair in self.show_topic(topicid, topn))
    return ' + '.join(terms)


def __getitem__(self, bow, eps=0.01):
Expand Down
26 changes: 19 additions & 7 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ def __getitem__(self, bow, scaled=False, chunksize=512):
return result


def print_topic(self, topicno, topn=10):
def show_topic(self, topicno, topn=10):
"""
Return a specified topic (=left singular vector), 0 <= `topicno` < `self.num_topics`,
as string.
Expand All @@ -445,16 +445,28 @@ def print_topic(self, topicno, topn=10):
c = numpy.asarray(self.projection.u.T[topicno, :]).flatten()
norm = numpy.sqrt(numpy.sum(numpy.dot(c, c)))
most = numpy.abs(c).argsort()[::-1][:topn]
return ' + '.join(['%.3f*"%s"' % (1.0 * c[val] / norm, self.id2word[val]) for val in most])

return [(1.0 * c[val] / norm, self.id2word[val]) for val in most]

def print_topics(self, num_topics=5, num_words=10):
def print_topic(self, topicno, topn=10):
    """
    Return topic `topicno` rendered as a single string.

    Each (weight, word) pair obtained from `show_topic` is formatted as
    `weight*"word"` with the weight shown to three decimals, and the terms
    are joined with ' + '. An empty result from `show_topic` gives ''.
    """
    terms = ('%.3f*"%s"' % pair for pair in self.show_topic(topicno, topn))
    return ' + '.join(terms)

def show_topics(self, num_topics=5, num_words=10, log=False, formatted=True):
shown = []
for i in xrange(min(num_topics, self.num_topics)):
if i < len(self.projection.s):
logger.info("topic #%i(%.3f): %s" %
(i, self.projection.s[i],
self.print_topic(i, topn=num_words)))
if formatted:
topic = self.print_topic(i, topn=num_words)
else:
topic = self.show_topic(i, topn=num_words)
shown.append(topic)
if log:
logger.info("topic #%i(%.3f): %s" %
(i, self.projection.s[i],
topic))
return shown

def print_topics(self, num_topics=5, num_words=10):
    """
    Log the first `num_topics` topics, each with `num_words` terms.

    Convenience wrapper: delegates to `show_topics` with `log=True`. The
    list that `show_topics` returns is discarded, so this method returns
    None.
    """
    # Forward the caller's values; passing the literals 5 and 10 here would
    # make both parameters of this method dead.
    self.show_topics(num_topics=num_topics, num_words=num_words, log=True)

def print_debug(self, num_topics=5, num_words=10):
"""
Expand Down

0 comments on commit 5b10b6d

Please sign in to comment.