In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from elasticsearch import Elasticsearch
import os
from time import time
import re
from bs4 import BeautifulSoup
import numpy as np

# Indexing Newsgroups in elasticsearch

In [186]:
es = Elasticsearch(['localhost'],
    http_auth=('elastic', 'elastic'),
    scheme="http",
    port=9200,)

In [190]:
ng = fetch_20newsgroups(subset='all',remove=('headers'))
ng_X = ng.data
ng_y = ng.target

In [191]:
ngdocs={}
for i in range(len(ng_X)):
    ngdocs[i]=ng_X[i].replace('\n','')

In [192]:
for i in ngdocs:
    '''es.index(index="newsgroups-index", doc_type='newsgroups', id=i, body={
    'doc_id': i,
    'doc_text': ngdocs[i]})'''
    es.update(index="newsgroups-index", doc_type='newsgroups', id=i, 
              body={"doc": {'doc_id': i,'doc_text': ngdocs[i]}})

In [166]:
es.indices.refresh(index="newsgroups-index")
res = es.search(index="newsgroups-index", body={"query": {"match_all": {}}})
print("Got %d Hits" % res['hits']['total'])

Got 18846 Hits


# Indexing DUC dataset in elasticsearch

In [164]:
ducpath="/Users/sasankauppu/Desktop/Data Mining CS6220/DataMining/DUC2001/"
ducdocs={}
ducsum={}
for f in os.listdir(ducpath+"raw_data"):
    if f!=".DS_Store":
        ducdocs[f.lower()]=BeautifulSoup(open(ducpath+'raw_data/'+f,'r').read(), "lxml").find("text").text.replace('\n','')
    
for f in os.listdir(ducpath+"Summaries"):
    if f!=".DS_Store":
        ducsum[f[:-4].lower()]=open(ducpath+'Summaries/'+f,'r').read()

In [165]:
for i in ducdocs:
    if i not in ducsum:
        es.index(index="duc-index", doc_type='ducdocs', id=i, body={
        'doc_id': i,
        'doc_text': ducdocs[i],
        'gold_summary':""})
    else:
        es.index(index="duc-index", doc_type='ducdocs', id=i, body={
        'doc_id': i,
        'doc_text': ducdocs[i],
        'gold_summary':ducsum[i]})

In [167]:
es.indices.refresh(index="duc-index")
res = es.search(index="duc-index", body={"query": {"match_all": {}}})
print("Got %d Hits" % res['hits']['total'])

Got 308 Hits


# NMF for Newsgroups

In [48]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print message

ng = fetch_20newsgroups(subset='all',remove=('headers','footers'))
#ng_X=([re.sub("[^a-zA-Z ]",""," ".join(k.replace('\n','').lower() for k in i.split())) for i in ng.data])


# Use tf-idf features for NMF.
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=3,max_features=10000,stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(ng_X)


# Use tf (raw term count) features for LDA.
tf_vectorizer = CountVectorizer(max_df=0.5, min_df=3,max_features=10000,stop_words='english')
tf = tf_vectorizer.fit_transform(ng_X)

In [50]:
nmf = NMF(n_components=50, random_state=0, beta_loss='kullback-leibler', 
          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)

print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 20)

Topics in NMF model:
Topic #0: think way thing dont just things make say really know thats probably time point people article want youre like right
Topic #1: windows using use window thanks program run work software dos running want set unix xr server file sun user help
Topic #2: year game team play games season win players years baseball teams won player article league hes night watching fan runs
Topic #3: drive hard disk scsi controller drives mb floppy ide computer external internal machine work hd problems meg card tape ram
Topic #4: god world gods believe jesus man bible faith sin life hell say lord heaven says does love christ words christians
Topic #5: thanks email know looking hi advance interested info help information im send reply wondering mail hello post thank appreciated does
Topic #6: key chip keys clipper encryption phone security use secure nsa technology secret government escrow scheme technical systems voice used public
Topic #7: war israel state world jews policy pe

In [51]:
nmf = NMF(n_components=20, random_state=0, beta_loss='kullback-leibler', 
          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)

print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 20)

Topics in NMF model:
Topic #0: think way just article say make like time dont know right really people thats good things thing sure want point
Topic #1: use windows using software program thanks work run problem window need version set help running file like dos know want
Topic #2: year team game play season win games players article teams years think baseball player hockey league won time series played
Topic #3: drive hard disk scsi mac works controller mb drives power problems problem new floppy just internal work ide computer external
Topic #4: god jesus christian word bible christians world man christ believe gods faith life love sin church say christianity paul lord
Topic #5: thanks know email looking like im does need help hi information info wondering advance interested send used good post want
Topic #6: key use number phone chip keys clipper encryption data security government technology secure message systems public voice nsa secret order
Topic #7: world war people state years

In [52]:
nmf = NMF(n_components=10, random_state=0, beta_loss='kullback-leibler', 
          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf)

print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 20)

Topics in NMF model:
Topic #0: article think just way dont time people like say make want right use things know sure thing really thats im
Topic #1: use windows using thanks program software need know work like run help does file want version window problem used im
Topic #2: year article team game think years just play time like games season win players im good teams night baseball mike
Topic #3: drive hard works problem power disk scsi just new mb drives controller work make mac problems switch floppy computer time
Topic #4: god wrote know jesus does says say word world christian believe true bible man christians paul read christ gods faith
Topic #5: thanks know like email im looking does want need used good post information info wondering interested help id car ive
Topic #6: key number use phone chip government public keys clipper encryption information data security message read systems david law technology secure
Topic #7: people government state war world rights did said killed is

# LDA for Newsgroups

In [53]:
lda = LatentDirichletAllocation(n_components=50, max_iter=50, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf)

print("\nTopics in LDA model:")
print_top_words(lda, tf_vectorizer.get_feature_names(), 20)


Topics in LDA model:
Topic #0: article just year game think like good time dont im better got did years know really going hit players didnt
Topic #1: water energy power high battery heat low air use used effect temperature hot cold cause light nuclear weight ground actually
Topic #2: price email sale offer new sell condition shipping interested asking cd original best sold used excellent tape old selling reply
Topic #3: vs black st white green art cover appears new app copies silver duo edition annual miller wolverine comics hulk man
Topic #4: mac apple version bit machines simms color lc quadra fpu hardware ram centris machine macs does upgrade macintosh software bbs
Topic #5: key encryption chip keys clipper government use security privacy algorithm des phone nsa secure technology used escrow law data message
Topic #6: sound robert jim channel craig ra richard douglas receiver article interference channels nixon tempest holland weiss apryumaacnscolostateedu context operational bye
T

In [55]:
lda = LatentDirichletAllocation(n_components=20, max_iter=50, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf)

print("\nTopics in LDA model:")
print_top_words(lda, tf_vectorizer.get_feature_names(), 20)


Topics in LDA model:
Topic #0: article just dont like think know im good time people really way going want make right ive say better sure
Topic #1: power db water ground energy battery heat light use nuclear supply air temperature current unit cold hot used high low
Topic #2: game team games year play players season hockey league win baseball player teams period nhl fans vs st won runs
Topic #3: maxaxaxaxaxaxaxaxaxaxaxaxaxaxax printer st print art fpu rider appears new cover vs laser duo copies printing wolverine hp comics printers hulk
Topic #4: people government right state law rights article public states control laws country weapons citizens crime does society case legal use
Topic #5: window mouse application widget motif manager set keyboard server using use xterm problem display openwindows shell button open widgets xr
Topic #6: president mr clinton think stephanopoulos ms money going house said tax know administration health jobs did white congress private care
Topic #7: armeni

In [54]:
lda = LatentDirichletAllocation(n_components=10, max_iter=50, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf)

print("\nTopics in LDA model:")
print_top_words(lda, tf_vectorizer.get_feature_names(), 20)


Topics in LDA model:
Topic #0: game team article year games think good like just play players dont season time hockey better baseball know did im
Topic #1: people government said article israel right gun did president state mr jews rights war law children police fbi killed just
Topic #2: maxaxaxaxaxaxaxaxaxaxaxaxaxaxax god jesus church bible christ christian christians man sin lord men new gods homosexual father love st faith paul
Topic #3: windows use drive software know does card image bit like thanks dos disk using im scsi problem need work graphics
Topic #4: people dont article think just know does say like believe way im good make point question god time things right
Topic #5: file window use program list email server files code send application available ftp set subject using mail line information address
Topic #6: article just dont like know time think people im years going money good make work use health does way problem
Topic #7: armenian armenians turkish university new apri

## Storing LDA topics and updating elastic search index for NG

In [89]:
topic_components=lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]

for topic_idx, topic in enumerate(topic_components):  
    top_words=[]
    for i in topic.argsort()[:-11:-1]:
        top_words.append({"word":tf_vectorizer.get_feature_names()[i],"probability":round(topic[i],4)})

    es.index(index="newsgroups-topic-index", doc_type='ngtopics', id=topic_idx, 
             body={'topic_id': topic_idx,'top_words': top_words})

In [112]:
ldatf = lda.transform(tf)
for idx, dt in enumerate(ldatf):
    doc_topics=[]
    for i in dt.argsort()[:-6:-1]:
        doc_topics.append({"topic":i,"probability":round(dt[i],4)})
        
    es.update(index="newsgroups-index", doc_type='newsgroups', id=idx, 
              body={"doc": {'doc_id': idx,"doc_topics": doc_topics}})

# NMF for DUC

In [56]:
duc_X = ducdocs.values()
duc_X = [BeautifulSoup(i, "lxml").text for i in duc_X]

# Use tf-idf features for NMF.
tfidf_vectorizer_duc = TfidfVectorizer(max_df=0.5, min_df=3,stop_words='english')
tfidf_duc = tfidf_vectorizer_duc.fit_transform(duc_X)


# Use tf (raw term count) features for LDA.
tf_vectorizer_duc = CountVectorizer(max_df=0.5, min_df=3,stop_words='english')
tf_duc = tf_vectorizer_duc.fit_transform(duc_X)

In [57]:
nmf = NMF(n_components=50, random_state=0, beta_loss='kullback-leibler', 
          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf_duc)

print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer_duc.get_feature_names(), 20)

Topics in NMF model:
Topic #0: state news say make times long states 10 united million president going government says 000 national way public right just
Topic #1: associated area column captain animals clean areas april alaska bay allow avoid affected tanker did argo cleanup reef spill sound
Topic #2: national miami season mph tropical mississippi winds nov storm hurricane storms coast forecasters sheets water nr texas reach predicted center
Topic #3: press 11 section earth sea thursday rush usa road seeing eclipse seeking regarded san solar seen puts nature set final
Topic #4: 100 1972 79 canadian ap abuse ben 1stld medal 27 track gold 200 athletes accepted olympics 88 johnson seoul 1986
Topic #5: national nr service major fires spokeswoman officials firefighters monday million press ap nearly near officer weather forest resources park just
Topic #6: london like times given mad government page scientists western imposed spongiform transmitted ministry meet countries disease ft having

In [58]:
nmf = NMF(n_components=20, random_state=0, beta_loss='kullback-leibler', 
          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf_duc)

print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer_duc.get_feature_names(), 20)

Topics in NMF model:
Topic #0: state states say washington times just says united long work president news 000 report don likely page public government like
Topic #1: associated area column captain april animals clean areas bay did alaska allow affected avoid tanker workers argo site cleanup sound
Topic #2: national storms storm south recorded miami mph reported season west mississippi texas water winds hurricane predictions nov professor nr weather
Topic #3: 11 usa san 10 press earth sj 1991 thursday road section mercury sj1 west nature box sjmn91 rush sea passed
Topic #4: 100 1972 79 desk ap canadian abuse column home ben 27 athletes 1stld medal coach track gold 1989 200 accepted
Topic #5: national nr major near officer service ap firefighters fires trees monday spokeswoman means grafs weather acres moisture burned worst southern
Topic #6: page london times financial period like countries european mad industries ft western spongiform increase official mr disease scientists likely gov

In [59]:
nmf = NMF(n_components=10, random_state=0, beta_loss='kullback-leibler', 
          solver='mu', max_iter=1000, alpha=.1, l1_ratio=.5).fit(tfidf_duc)

print("Topics in NMF model:")
print_top_words(nmf, tfidf_vectorizer_duc.get_feature_names(), 20)

Topics in NMF model:
Topic #0: state states page united times says washington say work house president just public news power past government way make support
Topic #1: associated area column oil april captain tanker animals areas clean million alaska bay affected did ship allow william avoid company
Topic #2: national south storms reported winds recorded storm west miami season water texas nov coast mph press north nr mississippi predictions
Topic #3: 10 press san 11 passed shadow thursday writer usa seen road good earth long minutes won sj1 fourth turn 20
Topic #4: 100 home column 1972 ap desk 1989 27 commission 79 finish sports abuse edition gold took away canadian world angeles
Topic #5: nr near officials national ap press major pm spokesman miles writethru officer crash monday military writer service plane grafs ld
Topic #6: london page financial high mr like times countries million government lack mad world official 16 european increase crop half ft
Topic #7: report police govern

# LDA for DUC

In [60]:
lda_duc = LatentDirichletAllocation(n_components=50, max_iter=5, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf_duc)

print("\nTopics in LDA model:")
print_top_words(lda_duc, tf_vectorizer_duc.get_feature_names(), 20)


Topics in LDA model:
Topic #0: caribbean path nafta boston shining basin region guzman 000 market country mr proposals mexico trade forest olympic government countries exports
Topic #1: forest fires service burned state acres marathon tunnel national tuesday set just amendment limits federal beach night park court mark
Topic #2: police pain says rescue hatch bank officers drought quake los angeles protesters village 50 say earthquake anti states abortion house
Topic #3: mr police bank world government says state path shining term president congress states thomas house limits countries welfare right political
Topic #4: cattle disease bse beef britain british agriculture humans cow mad feed germany ministry animal government ban transmitted offal banned jakob
Topic #5: bank debt world countries mr loans development developing treasury dollars billion lending thomas says hurricane africa poverty latin preston clarence
Topic #6: air force flight enquirer concerned police accidents taylor 

In [90]:
lda_duc = LatentDirichletAllocation(n_components=20, max_iter=5, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf_duc)

print("\nTopics in LDA model:")
print_top_words(lda_duc, tf_vectorizer_duc.get_feature_names(), 20)


Topics in LDA model:
Topic #0: boston caribbean london eclipse diamond tunnel link government beers path says ap total mr 000 market nafta record park early
Topic #1: forest fires acres firefighters national park yellowstone burned 000 service brush california blaze acre contained trees burning areas crews officials
Topic #2: earthquake quake earthquakes magnitude richter scale fault damage probability area prediction allen recorded geological quakes survey madrid reading measuring francisco
Topic #3: mr state says 000 oil 10 world eclipse government states bank city officials path news national million president house term
Topic #4: disease tuberculosis bse cases cattle aids health tb cjd cow infected sheep mad agriculture scientists epidemic feed british animals ministry
Topic #5: bank world countries mr debt dollars hurricane thomas says eclipse poverty development aliens government census developing market billion clarence mexico
Topic #6: exxon air valdez oil alaska taylor millio

In [61]:
lda_duc = LatentDirichletAllocation(n_components=10, max_iter=5, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf_duc)

print("\nTopics in LDA model:")
print_top_words(lda_duc, tf_vectorizer_duc.get_feature_names(), 20)


Topics in LDA model:
Topic #0: diamond beers diamonds market cso cartel south world botswana sales rough africa london says production african mines dollars prices dealers
Topic #1: forest fires firefighters tunnel acres national park 000 service officials french yellowstone california water burned british areas miles brush trees
Topic #2: earthquake quake earthquakes magnitude richter scale damage area fault probability allen pain recorded prediction survey geological quakes major reading madrid
Topic #3: mr police state world says eclipse government bank states 10 city path 000 president house news term shining congress officials
Topic #4: disease tuberculosis bse cases health cattle aids tb cjd cow infected agriculture sheep farmers mad scientists epidemic british feed dr
Topic #5: bank world debt eclipse countries mr thomas says sun dollars aliens diamond mexico development market moon census billion government africa
Topic #6: air exxon valdez oil police million eclipse french al

## Storing LDA topics and updating elastic search index for DUC

In [168]:
topic_components_duc=lda_duc.components_ / lda_duc.components_.sum(axis=1)[:, np.newaxis]

for topic_idx, topic in enumerate(topic_components_duc):  
    top_words=[]
    for i in topic.argsort()[:-11:-1]:
        top_words.append({"word":tf_vectorizer_duc.get_feature_names()[i],"probability":round(topic[i],4)})
    
    es.index(index="duc-topic-index", doc_type='ductopics', id=topic_idx, 
             body={'topic_id': topic_idx,'top_words': top_words})

In [169]:
ldaductf = lda_duc.transform(tf_duc)
for idx, dt in enumerate(ldaductf):
    doc_topics=[]
    for i in dt.argsort()[:-6:-1]:
        doc_topics.append({"topic":i,"probability":round(dt[i],4)})
    
    es.update(index="duc-index", doc_type='ducdocs', id=ducdocs.keys()[idx], 
              body={"doc": {'doc_id': ducdocs.keys()[idx],"doc_topics": doc_topics}})

In [174]:
from nltk.tokenize import sent_tokenize

In [183]:
lda_duc.transform(tfidf_vectorizer_duc.transform([sent_tokenize(duc_X[0])[8]]))

array([[0.00944115, 0.00944115, 0.00944115, 0.00944115, 0.00944115,
        0.00944115, 0.00944115, 0.00944115, 0.00944115, 0.00944115,
        0.82061817, 0.00944115, 0.00944115, 0.00944115, 0.00944115,
        0.00944115, 0.00944115, 0.00944115, 0.00944115, 0.00944115]])