In [1]:
from nltk.corpus import brown

data = []

for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)

In [2]:
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)

500


In [3]:
print(data[0])



In [4]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# Improved Lemmatization by POS

# Limited the maximum features to 2000 as 1000 seems to be very small

In [6]:
# Improve Lemmatization by POS

from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, tokenize

wnl = WordNetLemmatizer()
data_lemmatized = []

def lem(doc):
    lemmas = []
    for word, tag in tokenize(doc):
        wntag = tag[0].lower()
        wntag = wnltag if wnltag in ['a','v','n','r'] else None
        lemma = wnl.lemmatize(word, wntag) if wntag else word
        lemmas.append(lemma)
    return ' '.join(lemmas)

for doc in data:
    data_lemmatized.append(doc)

    
NO_DOCUMENTS = len(data_lemmatized)
print(NO_DOCUMENTS)
print(data_lemmatized[0])

500


In [7]:
# Change the vectorizer to TFIDF for NMF
NUM_TOPICS = 10
tfidf_vectorizer = TfidfVectorizer(min_df= 10, max_df= 0.85, max_features= 2000, 
                                   lowercase= True , stop_words= 'english', token_pattern= '[a-zA-Z\-][a-zA-Z\-]{2,}')

In [8]:
tfidf_vectorizedData = tfidf_vectorizer.fit_transform(data_lemmatized)

In [9]:
tfidf_vectorizedData.shape

(500, 2000)

In [10]:
nmf = NMF(n_components= NUM_TOPICS, random_state=1,
          beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
          l1_ratio=.5)

In [11]:
nmf_Z = nmf.fit_transform(tfidf_vectorizedData)

In [12]:
print('\nDoc to Topic -> nmf_Z  - ', nmf_Z.shape )


Doc to Topic -> nmf_Z  -  (500, 10)


In [13]:
print('\n nmf_Z[0].argsort() - ' ,  nmf_Z[0].argsort()[::-1])


 nmf_Z[0].argsort() -  [2 1 0 4 3 9 8 7 6 5]


In [14]:
print('\n nmf_Z[0][nmf_Z[0].argsort()[::-1][0]] - ' , nmf_Z[0][nmf_Z[0].argsort()[::-1][0]])


 nmf_Z[0][nmf_Z[0].argsort()[::-1][0]] -  0.16240963363207295


In [15]:
print('\nnmf_Z[0] - ' ,  nmf_Z[0])


nmf_Z[0] -  [0.03823825 0.07861169 0.16240963 0.00655323 0.01225824 0.
 0.         0.         0.         0.        ]


In [16]:
# % Topic distribution of '0' th document

sum = 0
for i in range(10):
    print(nmf_Z[0][i] * 100)
    sum = sum + nmf_Z[0][i] * 100
    
print('sum - ',sum)

3.8238254897076676
7.861168942795979
16.240963363207296
0.6553230133084409
1.2258239951198175
0.0
0.0
0.0
0.0
0.0
sum -  29.8071048041392


In [17]:
print('\n Topic to word Matrix - nmf.components_.shape - ', nmf.components_.shape)


 Topic to word Matrix - nmf.components_.shape -  (10, 2000)


In [18]:
print('\n nmf.components_ \n' , nmf.components_)


 nmf.components_ 
 [[2.60876693e-02 2.33892059e-01 1.88616546e-04 ... 3.02404606e-02
  1.58648063e-01 2.10835039e-02]
 [3.52959848e-02 9.40702873e-02 5.44144596e-03 ... 4.63752937e-02
  0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 4.29910228e-02 ... 9.56466302e-02
  0.00000000e+00 0.00000000e+00]
 ...
 [0.00000000e+00 0.00000000e+00 2.15028641e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [1.71738843e-08 1.33788169e-05 8.34460247e-04 ... 0.00000000e+00
  1.13037930e-02 1.06300939e-02]
 [2.52904155e-02 2.05734432e-02 0.00000000e+00 ... 5.60692858e-05
  0.00000000e+00 0.00000000e+00]]


In [19]:
print('\nlen(tfidf_vectorizer.get_feature_names()) \n', len(tfidf_vectorizer.get_feature_names()))
print('\n tfidf_vectorizer.get_feature_names()[0:1000] \n' ,tfidf_vectorizer.get_feature_names()[0:1000])


len(tfidf_vectorizer.get_feature_names()) 
 2000

 tfidf_vectorizer.get_feature_names()[0:1000] 
 ['ability', 'able', 'abroad', 'absence', 'academic', 'accept', 'acceptance', 'accepted', 'according', 'account', 'achieve', 'achieved', 'achievement', 'act', 'acting', 'action', 'actions', 'active', 'activities', 'activity', 'actual', 'actually', 'add', 'added', 'addition', 'additional', 'address', 'adequate', 'administration', 'administrative', 'advance', 'advanced', 'advantage', 'advertising', 'advice', 'affairs', 'afraid', 'afternoon', 'age', 'agencies', 'agency', 'ages', 'ago', 'agree', 'agreed', 'agreement', 'ahead', 'aid', 'air', 'aircraft', 'alexander', 'alfred', 'alive', 'allow', 'allowed', 'america', 'american', 'americans', 'analysis', 'ancient', 'angeles', 'anger', 'angle', 'animal', 'animals', 'announced', 'annual', 'answer', 'answered', 'apart', 'apartment', 'apparent', 'apparently', 'appeal', 'appear', 'appearance', 'appeared', 'appears', 'application', 'applied', 'apply', '

In [20]:
for idx, topic in enumerate(nmf.components_[1]):
    print(idx, topic)
    


0 0.035295984832822806
1 0.09407028729771277
2 0.005441445960902056
3 0.007524144455133099
4 0.01282519607827925
5 0.05734241744797724
6 0.036080605784074064
7 0.039870637625853314
8 0.07715426782922666
9 0.05117444741091025
10 0.06214872104547797
11 0.05307220377724205
12 0.027303707940980585
13 0.08062082459211839
14 0.00024937977956815193
15 0.0992483963495393
16 0.07601743342856584
17 0.034095105542248295
18 0.031037846406040834
19 0.03649024428334281
20 0.03941158746536883
21 0.07555564309624242
22 0.029292456222146795
23 0.007787237394740569
24 0.0421306440697624
25 0.030715857745844292
26 1.8614306895622297e-07
27 0.043685569528310715
28 0.013512349797087316
29 0.034940704787976784
30 0.006805980573673294
31 0.025285623057443606
32 0.014422810708406073
33 0.008463484743086525
34 0.0
35 0.05227573975180822
36 0.0
37 0.0
38 0.06095092548365579
39 0.012278354889076307
40 0.012564716632866096
41 0.017877045689677632
42 0.029728589977376776
43 0.012980000014814133
44 6.13787369094710

In [21]:
#Finding Domniant word in topic 1
print(nmf.components_[1].argmax())

1993


In [22]:
print(nmf.components_[1][nmf.components_[1].argmax()])

0.25794568356492764


In [23]:
print('\n tfidf_vectorizer.get_feature_names()[nmf.components_[1].argmax()] \n' ,tfidf_vectorizer.get_feature_names()[nmf.components_[1].argmax()])


 tfidf_vectorizer.get_feature_names()[nmf.components_[1].argmax()] 
 years


In [24]:
# To print the topics and it s respective top 30 words 

top_n = 30

def print_topics (model, vectorizer, top_n ):   

    for idx , topic in enumerate(model.components_) :
        print('\nTopic ',idx)
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1: -1]])

print('\nNMF')
print_topics(nmf, tfidf_vectorizer , top_n)


NMF

Topic  0
[('said', 0.3670118896800908), ('come', 0.366219212923729), ('man', 0.3430002326187558), ('did', 0.3402945215059188), ('way', 0.33787658416673394), ('things', 0.3257208968683075), ('make', 0.32034265233610454), ('like', 0.31832435370333856), ('went', 0.3107545752619084), ('say', 0.30544772155125044), ('think', 0.30038421795964987), ('thought', 0.28073591059175035), ('know', 0.2768624430586715), ('told', 0.27643955022782046), ('took', 0.27190083626879147), ('turned', 0.2692866686614532), ('just', 0.2684466758232747), ('came', 0.26034271037761847), ('little', 0.25712857494088714), ('right', 0.25635218401108734), ('thing', 0.25067213137050387), ('let', 0.2481972090708356), ('good', 0.24777521463631288), ('looked', 0.24583531032187877), ('look', 0.23899613354596397), ('able', 0.23389205896807222), ('face', 0.22977632487278038), ('long', 0.22631831365368332), ('sure', 0.22620526030114801), ('day', 0.22134970314206348)]

Topic  1
[('years', 0.25794568356492764), ('general', 0.

In [32]:
n_topwords = 30
def print_topwords(model,vectorizer, n_topwords):
    for topic_index, topic in enumerate(model.components_):
        message = 'Topic# %d '  %topic_index
        message += " ".join([vectorizer.get_feature_names()[i] for i in topic.argsort()[:-n_topwords -1:-1]])
        print(message)
        print('\n')   
    
print_topwords(nmf,tfidf_vectorizer, 30)

Topic# 0 said come man did way things make like went say think thought know told took turned just came little right thing let good looked look able face long sure day


Topic# 1 years general great does important fact use group given individual areas new war world example sense need american purpose social work taken states result life ways problem nature means state


Topic# 2 business government state home year national states called expected president wanted members new john secretary washington service administration house city district money day month committee county paid office months plans


Topic# 3 various use work usually water similar high available used make values using state period number table set method production unit possible designed single large range year necessary worked rate value


Topic# 4 young years work written university school brought home mrs family stage youth girl wrote member age art week successful fine brown studied child treatment excellent father 