In [81]:
import os

import numpy as np

from elasticsearch import Elasticsearch
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler


# Root directory holding the cached datasets. The default is the original
# author's location; set the SCIKIT_LEARN_DATA environment variable to run the
# notebook on another machine. os.path.join(..., '') guarantees the trailing
# separator that later string concatenations (e.g. DATA_HOME + 'Modified-DUC2001/')
# rely on.
DATA_HOME = os.path.join(os.environ.get('SCIKIT_LEARN_DATA', '/Users/sahil/scikit_learn_data/'), '')
# Elasticsearch client created with the library's default connection settings.
es = Elasticsearch()

In [82]:
def print_top_words(model, feature_names, n_top_words):
    """Collect the highest-weighted words of every topic in a fitted model.

    Despite its name this function prints nothing.

    Args:
        model: object exposing `components_`, iterated row by row (one row
            per topic); each row must support `argsort()` and indexing.
        feature_names: sequence mapping a column index to its word.
        n_top_words: number of words to keep per topic.

    Returns:
        A list of `(topic_idx, pairs)` tuples, where `pairs` is a list of
        `(word, weight)` ordered by descending weight, with each weight
        rounded to 3 decimals.
    """
    topics = []
    for topic_idx, weights in enumerate(model.components_):
        # Ascending argsort reversed -> column indices by descending weight.
        ranked = weights.argsort()[::-1][:n_top_words]
        top_pairs = []
        for word_idx in ranked:
            top_pairs.append((feature_names[word_idx], round(weights[word_idx], 3)))
        topics.append((topic_idx, top_pairs))
    return topics

In [83]:
def _report_topics(label, model, feature_names, n_top_words):
    """Print the `n_top_words` highest-weighted words of each topic of `model`."""
    topics = print_top_words(model, feature_names, n_top_words)
    print(f'Top {n_top_words} words for each of the {len(topics)} topics in the {label} model:')
    for topic_idx, words in topics:
        print(f'Topic {topic_idx}: {words}')
    print('-'*15)


def compare_topic_extractions(n_components, data, feature_names, n_top_words=20):
    """Fit NMF and LDA on the same matrix and print each model's topics.

    Args:
        n_components: number of topics to extract with both models.
        data: document-term matrix passed unchanged to both estimators.
        feature_names: sequence mapping a column index of `data` to its word.
        n_top_words: how many words to print per topic.

    Returns:
        The fitted `(nmf, lda)` estimators.
    """
    print('Running NMF..')
    # NOTE(review): `alpha` was deprecated in later scikit-learn releases in
    # favour of `alpha_W`/`alpha_H` -- confirm against the installed version.
    nmf = NMF(n_components=n_components, alpha=0.1, random_state=123, verbose=True)
    # Only the fitted model is needed, so the transformed matrix is not computed.
    nmf.fit(data)
    print('..Done')
    print()

    _report_topics('NMF', nmf, feature_names, n_top_words)

    print('Running LDA..')
    lda = LatentDirichletAllocation(n_components=n_components, learning_method='online', random_state=123, verbose=True)
    lda.fit(data)
    print('..Done')
    print()

    _report_topics('LDA', lda, feature_names, n_top_words)

    return nmf, lda
    

# 20 News Groups

In [78]:
# Load the 20 Newsgroups posts with headers, footers and quoted replies
# removed, using DATA_HOME as the dataset directory.
ngdata = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'), data_home=DATA_HOME)
# TF-IDF features: drop English stop words, terms present in more than 95% of
# the documents, and terms present in fewer than 2 documents.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
ngvectors = tfidf_vectorizer.fit_transform(ngdata.data)
# Vocabulary terms; used below to translate column indices back into words.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
ng_features = tfidf_vectorizer.get_feature_names()

### K = 10

In [84]:
# Compare NMF and LDA on the 20 Newsgroups TF-IDF matrix with 10 topics.
k = 10
ng_nmf_10, ng_lda_10 = compare_topic_extractions(k, ngvectors, ng_features)

Running NMF..
violation: 1.0
violation: 0.4094377772033683
violation: 0.21634813822831137
violation: 0.13511344253961044
violation: 0.09341469840196132
violation: 0.06800666537353346
violation: 0.05172048876525522
violation: 0.040649425974258656
violation: 0.0331324022002571
violation: 0.027831004700740954
violation: 0.02395558543936068
violation: 0.020974604333010884
violation: 0.01864641329703627
violation: 0.016778582701835063
violation: 0.015237331253051677
violation: 0.013941670454273173
violation: 0.01284804916261883
violation: 0.011913257269055268
violation: 0.01110097125810941
violation: 0.010379177342917928
violation: 0.009741092970612403
violation: 0.00915863803889246
violation: 0.00862165399836605
violation: 0.00813903159029183
violation: 0.0077013202191733845
violation: 0.007296001005007087
violation: 0.006929464679279133
violation: 0.006586689886452545
violation: 0.006269670558154345
violation: 0.005976852895896169
violation: 0.005707140715130923
violation: 0.0054575598371

iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
..Done

20 topics in LDA model:
Topic 0: [('8051', 3.231), ('iig', 1.753), ('lazarus', 1.549), ('itoh', 1.33), ('486dx33', 1.237), ('hijaak', 1.077), ('reconsidered', 1.02), ('greene', 0.964), ('4778', 0.909), ('app_context', 0.898), ('okidata', 0.82), ('sandiego', 0.755), ('nettles', 0.737), ('graig', 0.737), ('rigidly', 0.681), ('vesterman', 0.596), ('grfwk61t', 0.567), ('3c503', 0.482), ('sdsu', 0.379), ('2406', 0.1)]
Topic 1: [('moa', 2.905), ('hga', 2.271), ('smileys', 1.983), ('bake', 1.862), ('timmons', 1.684), ('2a42dubinski', 1.189), ('rathole', 1.071), ('nanaimo', 0.966), ('dubinski', 0.941), ('brag', 0.916), ('waldbronn', 0.835), ('whirrr', 0.796), ('1993e', 0.783), ('callsign', 0.686), ('swindon', 0.657), ('autodoubler', 0.605), ('toutatis', 0.528), ('wad', 0.517), ('stac', 0.503), ('knowles', 0.487)]
Topic 2: [('thanks', 101.446), ('use', 83.423), ('like', 82.941), ('know', 81.829), ('w

### K = 20

In [85]:
# Compare NMF and LDA on the 20 Newsgroups TF-IDF matrix with 20 topics.
k = 20
ng_nmf_20, ng_lda_20 = compare_topic_extractions(k, ngvectors, ng_features)

Running NMF..
violation: 1.0
violation: 0.3619048565505237
violation: 0.2256736763188727
violation: 0.1663961912735973
violation: 0.1248422378161335
violation: 0.09777907168857393
violation: 0.08137143405045415
violation: 0.07103411795321271
violation: 0.06336835329914628
violation: 0.0554550294602707
violation: 0.046941801406694406
violation: 0.038767501224416946
violation: 0.03163273252894002
violation: 0.025844679580130602
violation: 0.02140093341854373
violation: 0.018008081396564665
violation: 0.015437315330661268
violation: 0.013417965958221176
violation: 0.01180753934819019
violation: 0.010476853650845965
violation: 0.009392219820886337
violation: 0.008493768080296537
violation: 0.007733283897069457
violation: 0.00708151179257426
violation: 0.006511971058231166
violation: 0.006011019237399154
violation: 0.005564342367455659
violation: 0.00516228297480649
violation: 0.004796645103460938
violation: 0.004466388386619623
violation: 0.004160364559372561
violation: 0.00388040208925118

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
..Done

20 topics in LDA model:
Topic 0: [('60ns', 1.419), ('vcc', 1.252), ('832', 1.121), ('rdd', 1.081), ('misfire', 0.988), ('arbor', 0.977), ('6682', 0.86), ('4778', 0.859), ('safeties', 0.798), ('2300', 0.593), ('3959', 0.56), ('4173', 0.56), ('hal9k', 0.56), ('winqwk', 0.56), ('95d', 0.56), ('kmail', 0.56), ('vesterman', 0.546), ('486dx3', 0.535), ('663', 0.532), ('shafting', 0.531)]
Topic 1: [('geb', 12.759), ('shameful', 12.662), ('cadre', 12.543), ('pitt', 12.467), ('chastity', 12.301), ('n3jxp', 12.301), ('dsl', 12.297), ('intellect', 12.156), ('skepticism', 12.127), ('surrender', 12.121), ('gordon', 11.827), ('banks', 11.715), ('soon', 8.528), ('edu', 4.254), ('chunks', 1.835), ('lotsa', 1

### K = 50

In [86]:
# Compare NMF and LDA on the 20 Newsgroups TF-IDF matrix with 50 topics.
k = 50
ng_nmf_50, ng_lda_50 = compare_topic_extractions(k, ngvectors, ng_features)

Running NMF..
violation: 1.0
violation: 0.3031011073095886
violation: 0.21211944173626482
violation: 0.14414992411579605
violation: 0.10324358748040958
violation: 0.07870773664693631
violation: 0.061471162037501276
violation: 0.048224529683839897
violation: 0.038293036645742864
violation: 0.03177850513419756
violation: 0.02756307255484464
violation: 0.02480244749038958
violation: 0.02299886691310039
violation: 0.0214406830734551
violation: 0.019892964699205845
violation: 0.018255831639582082
violation: 0.016579518515820877
violation: 0.014904652659671013
violation: 0.013338936933478068
violation: 0.012036424307706803
violation: 0.010993390384804647
violation: 0.010075866260823187
violation: 0.009229067561483097
violation: 0.008454596908585297
violation: 0.007840610798384589
violation: 0.007336330175066147
violation: 0.0069074443337469255
violation: 0.006549293408236972
violation: 0.006255400432843792
violation: 0.006031125639365653
violation: 0.005877979512069319
violation: 0.005754263

Topic 21: [('just', 1.872), ('wondering', 0.188), ('oh', 0.13), ('wanted', 0.119), ('ll', 0.097), ('thought', 0.097), ('yeah', 0.094), ('work', 0.09), ('group', 0.08), ('little', 0.08), ('way', 0.079), ('mean', 0.077), ('kidding', 0.071), ('maybe', 0.067), ('listen', 0.066), ('real', 0.062), ('curious', 0.062), ('say', 0.059), ('right', 0.059), ('tell', 0.059)]
Topic 22: [('looking', 1.519), ('information', 0.509), ('appreciated', 0.232), ('hello', 0.21), ('email', 0.147), ('help', 0.143), ('greatly', 0.125), ('book', 0.107), ('hi', 0.105), ('thank', 0.101), ('buy', 0.094), ('advice', 0.085), ('algorithm', 0.071), ('viewer', 0.068), ('used', 0.065), ('spend', 0.064), ('andreas', 0.064), ('anybody', 0.062), ('subject', 0.06), ('polygon', 0.059)]
Topic 23: [('don', 1.37), ('know', 1.312), ('want', 0.255), ('let', 0.198), ('going', 0.111), ('sure', 0.11), ('oh', 0.099), ('really', 0.087), ('doesn', 0.084), ('right', 0.079), ('sorry', 0.075), ('maybe', 0.073), ('tell', 0.065), ('read', 0.0

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
..Done

20 topics in LDA model:
Topic 0: [('deficits', 0.926), ('thingy', 0.655), ('nilsson', 0.58), ('reinforcement', 0.485), ('mockery', 0.262), ('brewers', 0.02), ('umm', 0.02), ('balanced', 0.02), ('sound', 0.02), ('bullpen', 0.02), ('convert', 0.02), ('economists', 0.02), ('austin', 0.02), ('orosco', 0.02), ('yesterday', 0.02), ('game', 0.02), ('total', 0.02), ('ouch', 0.02), ('unbalanced', 0.02), ('lloyd', 0.02)]
Topic 1: [('sphere', 5.715), ('shrink', 2.768), ('buggy', 2.686), ('kk', 2.678), ('tt', 2.373), ('mc', 2.201), ('emulator', 2.114), ('lb', 2.106), ('sq', 2.031), ('cx', 1.864), ('ww', 1.75), ('immaculate', 1.727), ('70ns', 1.699), ('u2', 1.623), ('cn', 1.615), ('sy', 1.583), ('1m', 1.5

## Update ES with the top words of each topic and of each document

In [133]:
def ng_topic_tuples(topics, n_top_words=5, id_prefix='20ng'):
    """Yield Elasticsearch bulk `create` actions describing each topic.

    For every topic two dicts are yielded in order: the action header
    (index `topic`, type `document`, id `<id_prefix>-<topic_idx>`) followed by
    the document source.

    Args:
        topics: iterable of `(topic_idx, [(word, weight), ...])` pairs, as
            returned by `print_top_words`.
        n_top_words: number of leading words to store per topic.
        id_prefix: prefix of the generated document ids; the default keeps the
            original `20ng-<topic_idx>` ids.

    Yields:
        dict: alternating action headers and sources. Each source holds
        `top_words` (comma-separated words) and `top_words_probab`
        (comma-separated weights rounded to 2 decimals).

    Raises:
        AssertionError: when a topic has fewer than `n_top_words` words; raised
            lazily, when the generator reaches that topic.
    """
    for topic_idx, words in topics:
        assert len(words) >= n_top_words, '`n_top_words` cannot be greater than `words-per-topic` for any topic'
        top = words[:n_top_words]
        yield {
            'create': {
                '_index': 'topic',
                '_type': 'document',
                '_id': f'{id_prefix}-{topic_idx}'
            }
        }

        yield {
            'top_words': ', '.join(w for w, _ in top),
            'top_words_probab': ', '.join(str(round(p, 2)) for _, p in top)
        }

# Upload the top 5 words of each topic of the 20-topic LDA model through the
# bulk API, keeping the response so it can be checked for errors.
es_topic_res = []
es_topic_res.append(es.bulk(body=ng_topic_tuples(print_top_words(ng_lda_20, ng_features, n_top_words=5))))

In [134]:
# True when any recorded bulk response has a truthy `errors` field.
np.array([val['errors'] for val in es_topic_res]).any()

False

In [175]:
# For every 20 Newsgroups document: fit a single-component LDA on that
# document's TF-IDF row, keep its five highest-weighted terms, and write them
# to the `doc_topics` field of the matching Elasticsearch document.
# NOTE(review): this fits one model and issues one update request per
# document; `step` is not used by this loop.
es_ng_result = []
step = 1000
ng_doc_topics = []
for i in range(len(ngdata.data)):
    _lda = LatentDirichletAllocation(n_components=1, learning_method='online', random_state=123)
    _lda.fit_transform(ngvectors[i, :])

    _words = print_top_words(_lda, ng_features, n_top_words=5)
    ng_doc_topics.append(_words)

    # With n_components=1 there is a single (topic_idx, pairs) entry.
    _body = ', '.join(str(w) + ':' + str(round(p, 2)) for w, p in _words[0][1][:5])
    es_ng_result.append(
        es.update(
            index='20ng',
            doc_type='document',
            id=i,
            body={
                'doc': {'doc_topics': _body},
                'detect_noop': False,
            },
        )
    )

# def ng_update_tuples(from_index, to_index, document_topics, n_top_words=5):
#     for i in range(from_index, to_index):
#         _words = document_topics[i][0][1]
#         assert len(_words) >= n_top_words, '`n_top_words` cannot be greater than `words-per-topic` for any topic'
#         yield {
#             'index': {
#                 '_index': '20ng', 
#                 '_type': 'document',
#                 '_id': i
#             }
#         }
        
#         yield {
#             'doc_topics': ', '.join([str(w)+':'+str(round(p, 2)) for w, p in _words[:n_top_words]])
#         }

# es_ng_result = []
# step = 1000
# doc_topics = []
# for i in range(len(ngdata.data)):
#     _lda = LatentDirichletAllocation(n_components=1, learning_method='online', random_state=123)
#     _lda.fit_transform(ngvectors[i,:])
    
#     doc_topics.append(print_top_words(_lda, ng_features, n_top_words=5))
    
#     if (i>0 and i%step == 0) or i==11313:
#         print(f'Updating ES for range {max(0, i-1000)}-{min(i, len(ngdata.data))}')
#         es_ng_result.append(es.bulk(body=ng_update_tuples(max(0, i-1000), min(i, len(ngdata.data)), doc_topics)))
# #         print(list(ng_update_tuples(max(0, i-1000), min(i, len(ngdata.data)), doc_topics)))
    

  perword_bound = bound / word_cnt


In [177]:
# True only when every update response reports result == 'updated'.
np.array([val['result'] == 'updated' for val in es_ng_result]).all()

True

# DUC

In [118]:
import os

# Directory holding the DUC articles; built by plain concatenation, so
# DATA_HOME must end with a path separator.
DUC_DIR = DATA_HOME+'Modified-DUC2001/'

def extract_text(data):
    """Return the article body enclosed by TEXT markers, stripped.

    The opening marker is `[TEXT]`, falling back to `<TEXT>`; the closing
    marker is `[/TEXT]`, falling back to `</TEXT>`. A missing opening marker
    means "start of string" and a missing closing marker means "end of string".
    """
    open_at = -1
    for tag in ('[TEXT]', '<TEXT>'):
        open_at = data.find(tag)
        if open_at != -1:
            break

    close_at = -1
    for tag in ('[/TEXT]', '</TEXT>'):
        close_at = data.find(tag)
        if close_at != -1:
            break

    # Both opening markers are 6 characters long.
    begin = 0 if open_at == -1 else open_at + 6
    finish = len(data) if close_at == -1 else close_at
    return data[begin:finish].strip()

def extract_summary(data):
    """Remove every `Abstract:` and `Introduction:` label and strip the result."""
    cleaned = data
    for label in ('Abstract:', 'Introduction:'):
        cleaned = cleaned.replace(label, '')
    return cleaned.strip()

# Read every article file directly under DUC_DIR (skipping `.DS_Store` and
# sub-directories) and pair it with its summary from
# `Summaries/<lower-cased filename>.txt` when that file exists; otherwise the
# summary stays empty.
# NOTE(review): os.listdir returns entries in arbitrary order, and a later cell
# uses the position in `duc_data` as the Elasticsearch document id -- confirm
# the `duc` index was populated with the same ordering.
duc_data = []
for filename in os.listdir(DUC_DIR):
    article_path = os.path.join(DUC_DIR, filename)
    if filename == '.DS_Store' or os.path.isdir(article_path):
        continue

    # Undecodable bytes are dropped rather than raising.
    with open(article_path, 'r', encoding='utf-8', errors='ignore') as file:
        body = extract_text(file.read())

    summary = ''
    summary_filename = os.path.join(DUC_DIR, 'Summaries', filename.lower() + '.txt')
    if os.path.isfile(summary_filename):
        with open(summary_filename, 'r', encoding='utf-8', errors='ignore') as summary_file:
            summary = extract_summary(summary_file.read())

    duc_data.append({
        'summary': summary,
        'body': body
    })

In [119]:
# TF-IDF features for the DUC article bodies, with the same vectorizer
# settings as the 20 Newsgroups section: English stop words removed, terms in
# more than 95% of the documents or in fewer than 2 documents ignored.
duc_tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
ducvectors = duc_tfidf_vectorizer.fit_transform([d['body'] for d in duc_data])
# Vocabulary terms; used below to translate column indices back into words.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
duc_features = duc_tfidf_vectorizer.get_feature_names()

### K = 10

In [120]:
# Compare NMF and LDA on the DUC TF-IDF matrix with 10 topics.
k=10
duc_nmf_10, duc_lda_10 = compare_topic_extractions(k, ducvectors, duc_features)

Running NMF..
violation: 1.0
violation: 0.4941614913256327
violation: 0.25805979654305555
violation: 0.14954393122993132
violation: 0.10433380733970742
violation: 0.07896788683458769
violation: 0.06475328436426735
violation: 0.055532194271144
violation: 0.04982353074493449
violation: 0.046123997898401874
violation: 0.04315600059741012
violation: 0.04117977479073703
violation: 0.039707535474938625
violation: 0.036985140235904025
violation: 0.03371914436137422
violation: 0.030861199680811024
violation: 0.0275869537712388
violation: 0.023491688956102482
violation: 0.01965357326047011
violation: 0.016183175302314626
violation: 0.013207708758568117
violation: 0.010791895038564465
violation: 0.008897277649550223
violation: 0.00739221861904283
violation: 0.006213812756544141
violation: 0.005283056885926733
violation: 0.004561043805732744
violation: 0.004008394538768784
violation: 0.003573357365421207
violation: 0.003230098382776396
violation: 0.0029502205669115975
violation: 0.002729684659955

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
..Done

20 topics in LDA model:
Topic 0: [('said', 15.36), ('mr', 10.157), ('police', 7.916), ('thomas', 7.001), ('welfare', 6.835), ('people', 6.599), ('says', 6.417), ('year', 6.11), ('term', 5.894), ('slovenia', 5.319), ('limits', 5.151), ('new', 5.025), ('state', 4.804), ('congress', 4.643), ('house', 4.588), ('world', 4.543), ('johnson', 4.374), ('bank', 4.332), ('years', 4.326), ('nafta', 4.168)]
Topic 1: [('eclipse', 6.084), ('sun', 1.061), ('hawaii', 0.814), ('moon', 0.725), ('glasses', 0.711), ('solar', 0.664), ('baja', 0.583), ('viewing', 0.579), ('telescope', 0.561), ('astronomers', 0.51), ('telescopes', 0.484), ('fog', 0.48), ('sunny', 0.48), ('partial', 0.456), ('film', 0.453), ('tablet'

### K = 20

In [121]:
# Compare NMF and LDA on the DUC TF-IDF matrix with 20 topics.
k=20
duc_nmf_20, duc_lda_20 = compare_topic_extractions(k, ducvectors, duc_features)

Running NMF..
violation: 1.0
violation: 0.42341421159125886
violation: 0.23073346826320204
violation: 0.13286677925536422
violation: 0.09923722186601575
violation: 0.06524721543482238
violation: 0.036792812341251306
violation: 0.023586276012997887
violation: 0.01731693378150083
violation: 0.013999612252392695
violation: 0.011908056696296198
violation: 0.010407379638394065
violation: 0.009295601061915052
violation: 0.008446005518956379
violation: 0.007779872648708974
violation: 0.007267234686043197
violation: 0.006878807066669006
violation: 0.006587844956484121
violation: 0.00626255517880427
violation: 0.006142092048451576
violation: 0.006163991591088787
violation: 0.006264912865205653
violation: 0.0063898386530388505
violation: 0.006622587041287505
violation: 0.006984719920232909
violation: 0.007658337117436398
violation: 0.00850569141401067
violation: 0.009467397152685787
violation: 0.01042662100729127
violation: 0.011203667487836636
violation: 0.011526880724959812
violation: 0.011039

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
..Done

20 topics in LDA model:
Topic 0: [('crash', 0.136), ('shear', 0.128), ('said', 0.125), ('park', 0.123), ('fires', 0.12), ('yellowstone', 0.119), ('nafta', 0.115), ('wind', 0.112), ('mr', 0.108), ('air', 0.108), ('travel', 0.104), ('assassination', 0.104), ('flight', 0.103), ('forest', 0.103), ('united', 0.102), ('aircraft', 0.102), ('limits', 0.101), ('87', 0.1), ('jet', 0.099), ('goldwin', 0.099)]
Topic 1: [('confirming', 0.089), ('amendment', 0.088), ('approach', 0.087), ('robbed', 0.087), ('eclipse', 0.087), ('poverty', 0.086), ('influence', 0.086), ('wildlife', 0.086), ('numerous', 0.086), ('gradually', 0.086), ('allergy', 0.086), ('diamond', 0.086), ('estate', 0.085), ('feet', 0.085), ('

### K = 50

In [122]:
# Compare NMF and LDA on the DUC TF-IDF matrix with 50 topics.
k=50
duc_nmf_50, duc_lda_50 = compare_topic_extractions(k, ducvectors, duc_features)

Running NMF..
violation: 1.0
violation: 0.2793603025927677
violation: 0.14351476424102855
violation: 0.0860852283551121
violation: 0.06400954506254049
violation: 0.04912046171054808
violation: 0.03964633217412565
violation: 0.03425868282112444
violation: 0.03188166867787595
violation: 0.029623490363236256
violation: 0.024825284962492363
violation: 0.02005803640776928
violation: 0.01686200846195875
violation: 0.014537010681485798
violation: 0.012593031399499502
violation: 0.010851614858214783
violation: 0.009292053972959907
violation: 0.007862781475185593
violation: 0.006594378843023554
violation: 0.005556497461043975
violation: 0.0047553676834030814
violation: 0.004100450386134929
violation: 0.003604229864008492
violation: 0.0032133810421462994
violation: 0.002889571548599148
violation: 0.002617027224096635
violation: 0.0023865954378855052
violation: 0.002189279983201222
violation: 0.0020163754470704748
violation: 0.0018655434182196409
violation: 0.0017313112497283512
violation: 0.0016

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10
..Done

20 topics in LDA model:
Topic 0: [('thomas', 4.048), ('court', 1.136), ('supreme', 0.846), ('nomination', 0.78), ('clarence', 0.759), ('yale', 0.587), ('nominee', 0.547), ('affirmative', 0.537), ('discrimination', 0.527), ('grandfather', 0.499), ('judiciary', 0.492), ('danforth', 0.481), ('confirmation', 0.477), ('grandparents', 0.472), ('circuit', 0.456), ('appeals', 0.437), ('views', 0.436), ('senate', 0.429), ('republican', 0.418), ('judge', 0.411)]
Topic 1: [('exxon', 0.074), ('oil', 0.066), ('amendment', 0.065), ('diamond', 0.062), ('cleanup', 0.06), ('confirming', 0.06), ('said', 0.059), ('wildlife', 0.059), ('beers', 0.059), ('robbed', 0.058), ('eclipse', 0.057), ('feet', 0.057), ('val

## Update ES with the top words of each topic and of each document

In [131]:
def duc_topic_tuples(topics, n_top_words=5, id_prefix='duc'):
    """Yield Elasticsearch bulk `create` actions describing each topic.

    For every topic two dicts are yielded in order: the action header
    (index `topic`, type `document`, id `<id_prefix>-<topic_idx>`) followed by
    the document source.

    Args:
        topics: iterable of `(topic_idx, [(word, weight), ...])` pairs, as
            returned by `print_top_words`.
        n_top_words: number of leading words to store per topic.
        id_prefix: prefix of the generated document ids; the default keeps the
            original `duc-<topic_idx>` ids.

    Yields:
        dict: alternating action headers and sources. Each source holds
        `top_words` (comma-separated words) and `top_words_probab`
        (comma-separated weights rounded to 2 decimals).

    Raises:
        AssertionError: when a topic has fewer than `n_top_words` words; raised
            lazily, when the generator reaches that topic.
    """
    for topic_idx, words in topics:
        assert len(words) >= n_top_words, '`n_top_words` cannot be greater than `words-per-topic` for any topic'
        top = words[:n_top_words]
        yield {
            'create': {
                '_index': 'topic',
                '_type': 'document',
                '_id': f'{id_prefix}-{topic_idx}'
            }
        }

        yield {
            'top_words': ', '.join(w for w, _ in top),
            'top_words_probab': ', '.join(str(round(p, 2)) for _, p in top)
        }

# Upload the top 5 words of each topic of the 20-topic DUC LDA model through
# the bulk API, keeping the response so it can be checked for errors.
es_duc_topic_res = []
es_duc_topic_res.append(es.bulk(body=duc_topic_tuples(print_top_words(duc_lda_20, duc_features, n_top_words=5))))

In [132]:
# True when any recorded bulk response has a truthy `errors` field.
np.array([val['errors'] for val in es_duc_topic_res]).any()

False

In [170]:
# For every DUC document: fit a single-component LDA on that document's TF-IDF
# row, keep its five highest-weighted terms, and write them to the
# `doc_topics` field of the matching Elasticsearch document.
# NOTE(review): this fits one model and issues one update request per
# document; `step` is not used by this loop.
es_duc_result = []
step = 1000
duc_doc_topics = []
for i in range(len(duc_data)):
    _lda = LatentDirichletAllocation(n_components=1, learning_method='online', random_state=123)
    _lda.fit_transform(ducvectors[i, :])

    _words = print_top_words(_lda, duc_features, n_top_words=5)
    duc_doc_topics.append(_words)

    # With n_components=1 there is a single (topic_idx, pairs) entry.
    _body = ', '.join(str(w) + ':' + str(round(p, 2)) for w, p in _words[0][1][:5])
    es_duc_result.append(
        es.update(
            index='duc',
            doc_type='document',
            id=i,
            body={
                'doc': {'doc_topics': _body},
                'detect_noop': False,
            },
        )
    )

In [174]:
# True only when every update response reports result == 'updated'.
np.array([val['result'] == 'updated' for val in es_duc_result]).all()

True