In [25]:
from ipstartup import *
import gensim
# Raise both topic-model libraries to WARNING so their lower-level log records are suppressed.
for _logger_name in ("gensim", "guidedlda"):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

[root:INFO]:starting (cellevents.py:36, time=Mar-10 13:20)


time: 39 ms


# guidedLDA version

In [6]:
import numpy as np
import guidedlda

def print_top_words(topic_word, vocab, n_top_words):
    """Print the ``n_top_words`` highest-weighted words of every topic.

    topic_word  -- 2-D array with one row of per-word weights for each topic
    vocab       -- sequence of words; position i names column i of ``topic_word``
    n_top_words -- number of words to list per topic
    """
    vocab_arr = np.array(vocab)  # converted once instead of once per topic
    for i, topic_dist in enumerate(topic_word):
        # argsort is ascending, so read the last n_top_words entries backwards.
        topic_words = vocab_arr[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))


X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)
word2id = dict((v, idx) for idx, v in enumerate(vocab))

print(X.shape)

print(X.sum())
# Normal LDA without seeding
model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)
model.fit(X)

topic_word = model.topic_word_
n_top_words = 8
print_top_words(topic_word, vocab, n_top_words)


# Guided LDA with seed topics.
# One list of seed words per topic; the list position is the topic id the words are tied to.
seed_topic_list = [['game', 'team', 'win', 'player', 'season', 'second', 'victory'],
                   ['percent', 'company', 'market', 'price', 'sell', 'business', 'stock', 'share'],
                   ['music', 'write', 'art', 'book', 'world', 'film'],
                   ['political', 'government', 'leader', 'official', 'state', 'country', 'american', 'case', 'law', 
                    'police', 'charge', 'officer', 'kill', 'arrest', 'lawyer']]

# Same hyper-parameters and seed as the unguided run, so only the seeding differs.
model = guidedlda.GuidedLDA(n_topics=5, n_iter=100, random_state=7, refresh=20)

# Map word id -> seeded topic id; a KeyError here means a seed word is not in the vocabulary.
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id

model.fit(X, seed_topics=seed_topics, seed_confidence=0.15)

n_top_words = 10
topic_word = model.topic_word_
print_top_words(topic_word, vocab, n_top_words)

[root:INFO]:starting (cellevents.py:36, time=Mar-10 10:38)


(8447, 3012)
1221626




Topic 0: company percent market business plan pay price increase
Topic 1: game play team win player season second start
Topic 2: life child write man school woman father family
Topic 3: place open small house music turn large play
Topic 4: official state government political states issue leader case
Topic 0: game play team win season player second point start victory
Topic 1: company percent market price business sell pay plan executive buy
Topic 2: play life man music place turn book woman write thing
Topic 3: official government state political leader states issue member case country
Topic 4: school child city family problem student life program group state
time: 2min 42s


In [43]:
"""
unguided: business, sport, family?, place?, politics
seeds:    sport, business, culture, politics
guided:   sport, business, culture, politics, education

===> unguided has two poor/mixed topics
     guided finds the 4 seeded topics in sequence; plus clear 5th topic
"""
""

[root:INFO]:starting (cellevents.py:36, time=Mar-10 16:25)


''

time: 7.99 ms


# gensim version

In [99]:
def showtopics(model):
    """Print one line per topic: the topic index followed by its top words.

    model -- trained gensim LDA model; only ``num_topics``, ``id2word`` and
             ``get_topic_terms`` are used.

    Words are looked up in the model's own ``id2word`` mapping rather than a
    notebook-level global, so the function works as soon as it is defined and
    always uses the vocabulary the model was built with.
    """
    for i in range(model.num_topics):
        # Each term is an (id, weight) pair; only the id is needed for display.
        print(i, " ".join(model.id2word[word_id] for word_id, _ in model.get_topic_terms(i)))
        
def set_priors(eta, topic, words, p=.8, boost=1000):
    """Boost the prior weight of ``words`` for one topic, in place.

    eta   -- topic x word prior matrix (row = topic, column = word id); modified in place
    topic -- row index of the topic to seed
    words -- seed words; each must be a key of the notebook-level ``word2id``
    p     -- unused; kept only so existing calls stay valid
    boost -- factor each seeded entry ``eta[topic, word]`` is multiplied by

    The entries are scaled, not set to a probability, and nothing is
    renormalised. Calling this twice on the same ``eta`` applies the boost
    twice, so rebuild ``eta`` before re-seeding.

    Raises KeyError if a word is missing from ``word2id``.
    """
    word_indexes = [word2id[w] for w in words]
    eta[topic, word_indexes] *= boost

[root:INFO]:starting (cellevents.py:36, time=Mar-10 17:45)


time: 19 ms


In [34]:
# Convert each row of the doc/term matrix X into gensim's bag-of-words form:
# a list of [word_id, count] pairs covering only the non-zero entries.
x2 = []
for doc_counts in X:
    term_ids = np.nonzero(doc_counts)[0]
    x2.append(np.column_stack((term_ids, doc_counts[term_ids])).tolist())

[root:INFO]:starting (cellevents.py:36, time=Mar-10 13:23)


time: 2.66 s


In [45]:
# unguided: gensim LDA without a custom eta, used as the baseline
n = 5  # number of topics; also sizes the eta matrix of the guided run
id2word = dict(enumerate(vocab))
m = gensim.models.ldamodel.LdaModel(
    corpus=x2,
    num_topics=n,
    id2word=id2word,
    passes=15,
    random_state=7,
)
showtopics(m)

[root:INFO]:starting (cellevents.py:36, time=Mar-10 16:27)


0 company percent market business price cost sell pay increase buy
1 school charge police case yesterday lawyer law man receive father
2 life thing man place old turn play woman live write
3 official state government political leader issue country states support force
4 game play team win player season second start point lose
time: 3min 12s


In [100]:
# guided
# Start from a uniform topic x word prior, then boost every seed word in the row of its topic.
# NOTE(review): the base value 1/(V*n) is tiny, so even the boosted entries stay small
# (about 0.0664 in the rounded eta dump) -- possibly why seeding barely moves the topics; confirm.
eta = np.full((n, len(id2word)), 1/(len(id2word)*n))
# Reuse the seed lists from the guidedLDA experiment so both libraries get identical seeds.
for seed_topic, seed_words in enumerate(seed_topic_list):
    set_priors(eta, seed_topic, seed_words)
m = gensim.models.ldamodel.LdaModel(x2, num_topics=n, id2word=id2word, eta=eta, passes=15, random_state=7)
showtopics(m)

[root:INFO]:starting (cellevents.py:36, time=Mar-10 17:46)


0 company percent market business price cost sell pay increase buy
1 school charge police case yesterday lawyer law man father receive
2 life thing place man old turn play woman live write
3 official state government political leader issue country states support force
4 game play team win player season second start point lose
time: 3min 11s


In [44]:
"""
gensim unguided: business, law/school/father?, life/people?, politics, sport
seeds:           sport, business, culture, politics    
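gensim guided:   no change in topics (only the order of a few words within topics 1 and 2 differs)
    
===> gensim unguided is similar to unguided guidedLDA: business/politics/sport are clear, plus two unclear categories
     gensim guided produces an essentially identical result, i.e. the boosted eta had almost no effect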
"""
""

[root:INFO]:starting (cellevents.py:36, time=Mar-10 16:26)


''

time: 13 ms


In [105]:
# Sanity check of the seeded prior: rounded eta followed by the first three and last two vocabulary words.
(np.round(eta, 4),) + tuple(id2word[i] for i in (0, 1, 2, len(id2word) - 2, len(id2word) - 1))

[root:INFO]:starting (cellevents.py:36, time=Mar-10 17:56)


(array([[ 0.0001,  0.0001,  0.0001, ...,  0.0001,  0.0001,  0.0001],
        [ 0.0664,  0.0664,  0.0001, ...,  0.0001,  0.0001,  0.0001],
        [ 0.0001,  0.0001,  0.0001, ...,  0.0001,  0.0001,  0.0001],
        [ 0.0001,  0.0001,  0.0664, ...,  0.0001,  0.0001,  0.0001],
        [ 0.0001,  0.0001,  0.0001, ...,  0.0001,  0.0001,  0.0001]]),
 'company',
 'percent',
 'state',
 'harder',
 'lucrative')

time: 11 ms
