In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [32]:
corpus = [
    "movies are a great way to pass your time",
    "but there are other ways to pass ones time",
    "i also like games like cricket and football",
    "cricket is very famous in india and australia",
    "america is also known for hollywood movies"
]

In [4]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [33]:
tfidf = tfidf_vectorizer.fit_transform(corpus)

In [34]:
tfidf_vectorizer.get_feature_names()

['america',
 'australia',
 'cricket',
 'famous',
 'football',
 'games',
 'great',
 'hollywood',
 'india',
 'known',
 'like',
 'movies',
 'ones',
 'pass',
 'time',
 'way',
 'ways']

In [35]:
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(corpus)

In [36]:
tf_vectorizer.get_feature_names()

['america',
 'australia',
 'cricket',
 'famous',
 'football',
 'games',
 'great',
 'hollywood',
 'india',
 'known',
 'like',
 'movies',
 'ones',
 'pass',
 'time',
 'way',
 'ways']

In [37]:
nmf = NMF().fit(tfidf)

In [38]:
features = tf_vectorizer.get_feature_names()

In [49]:
for topic_idx, topic in enumerate(nmf.components_):
    topic_name = "Topic#" + str(topic_idx)
    indexes = topic.argsort()[::-1][:4]
    words = " ".join(features[index] for index in indexes)
    mapping = topic_name + " (" + words + ")"
    print(mapping)

Topic#0 (cricket india australia famous)
Topic#1 (ones ways pass time)
Topic#2 (movies way great pass)
Topic#3 (like football cricket games)
Topic#4 (america known hollywood movies)
Topic#5 (movies hollywood known america)
Topic#6 (great way time pass)
Topic#7 (ways time ones pass)
Topic#8 (india australia famous cricket)
Topic#9 (famous australia india cricket)
Topic#10 (famous india australia cricket)
Topic#11 (cricket like football games)
Topic#12 (like football games cricket)
Topic#13 (america known hollywood movies)
Topic#14 (cricket games football like)
Topic#15 (cricket like football games)
Topic#16 (australia cricket india famous)


In [44]:
lda = LatentDirichletAllocation()
lda.fit(tfidf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [50]:
for topic_idx, topic in enumerate(lda.components_):
    topic_name = "Topic#" + str(topic_idx)
    indexes = topic.argsort()[::-1][:4]
    words = " ".join(features[index] for index in indexes)
    mapping = topic_name + " (" + words + ")"
    print(mapping)

Topic#0 (ones ways pass time)
Topic#1 (america hollywood known movies)
Topic#2 (like football games cricket)
Topic#3 (hollywood known india ones)
Topic#4 (great way movies pass)
Topic#5 (australia ways way time)
Topic#6 (australia famous india cricket)
Topic#7 (ways football australia famous)
Topic#8 (cricket hollywood india america)
Topic#9 (india cricket famous australia)
