In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.decomposition import NMF

# Toy corpus: eight short sentences, four about cats/kittens and four about
# Google products, used below to check that two topics can be recovered.
documents = [
    "This little kitty came to play when I was eating at a restaurant.",
    "Merley has the best squooshy kitten belly.",
    "Google Translate app is incredible.",
    "If you open 100 tab in google you get a smiley face.",
    "Best cat photo I've ever taken.",
    "Climbing ninja cat.",
    "Impressed with google map feedback.",
    "Key promoter extension for Google Chrome.",
]

# Build the TF-IDF document-term matrix, dropping scikit-learn's built-in
# English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf = vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2. Its replacement returns
# an ndarray, so convert to a plain list to print the vocabulary as before.
print(vectorizer.get_feature_names_out().tolist())

['100', 'app', 'belly', 'best', 'came', 'cat', 'chrome', 'climbing', 'eating', 'extension', 'face', 'feedback', 'google', 'impressed', 'incredible', 'key', 'kitten', 'kitty', 'little', 'map', 'merley', 'ninja', 'open', 'photo', 'play', 'promoter', 'restaurant', 'smiley', 'squooshy', 'tab', 'taken', 'translate', 've']


In [2]:
# Number of latent topics to extract from the corpus.
n_topics = 2

# Configure the factorization first, then fit it on the TF-IDF matrix.
# random_state is fixed so the initialization (and hence the topics) is repeatable.
nmf = NMF(n_components=n_topics, random_state=1)
nmf = nmf.fit(tfidf)  # fit() returns the estimator itself

In [4]:
# Fit the NMF model on the TF-IDF matrix (again -- it was already fit when it
# was created) and keep the transformed data: one row per document, one column
# per topic.
W = nmf.fit_transform(tfidf)
W

array([[0.        , 0.        ],
       [0.        , 0.45217213],
       [0.55735742, 0.        ],
       [0.49414046, 0.        ],
       [0.        , 0.74849032],
       [0.        , 0.5964714 ],
       [0.55735742, 0.        ],
       [0.52368298, 0.        ]])

In [6]:
# Topic-term matrix of the fitted model: one row per topic, one column per
# vocabulary term.
H = nmf.components_
H.shape

(2, 33)

In [9]:
import numpy as np

# tfidf - np.dot(W, H)

In [10]:
# How many of the highest-weighted terms to list for each topic.
n_top_words = 10
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(nmf.components_):
    print("Topic #%d:" % topic_idx)
    # argsort() is ascending, so slice backwards from the end to get the
    # indices of the largest weights first.
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    print(" ".join(feature_names[i] for i in top_indices))

Topic #0:
google feedback map app impressed incredible translate key extension chrome
Topic #1:
cat best climbing ninja ve photo taken belly merley kitten


## LDA

In [23]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, an iterable of per-topic weight
        arrays that support ``argsort()``.
    feature_names : sequence
        Term for each column index of ``components_``.
    no_top_words : int
        Number of terms to print per topic.

    For each topic a ``Topic <index>:`` header line is printed, followed by one
    line with the selected terms separated by single spaces, strongest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort() orders ascending; reverse it and keep the leading entries
        # so the largest weights come first.
        strongest = weights.argsort()[::-1][:no_top_words]
        terms = [feature_names[position] for position in strongest]
        print("Topic %d:" % (topic_idx))
        print(" ".join(terms))

In [44]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# no_features = 100
# NMF is fit on TF-IDF weighted features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA is a probabilistic model over word counts, so it is fit on raw term
# counts rather than TF-IDF weights.
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()

no_topics = 2

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1).fit(tfidf)
# Run LDA
# `n_topics` was renamed to `n_components` (the old keyword no longer exists in
# current scikit-learn). random_state is fixed so the topics are reproducible
# across runs, matching what is done for NMF above.
lda = LatentDirichletAllocation(n_components=no_topics, random_state=1).fit(tf)

no_top_words = 10
print('Topic Modelling with NMF:')
display_topics(nmf, tfidf_feature_names, no_top_words)
print(' ---- ')
print('Topic Modelling with LDA')
display_topics(lda, tf_feature_names, no_top_words)

Topic Modelling with NMF:
Topic 0:
google feedback map app impressed incredible translate key extension chrome
Topic 1:
cat best climbing ninja ve photo taken belly merley kitten
 ---- 
Topic Modelling with LDA
Topic 0:
google smiley translate restaurant tab promoter eating face feedback kitty
Topic 1:
cat best taken merley belly kitten squooshy ve ninja climbing


