In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from elasticsearch import Elasticsearch
import os
from time import time
import re
from bs4 import BeautifulSoup
import numpy as np
import nltk
import math
from nltk.tokenize import sent_tokenize
import nltk
from nltk.corpus import stopwords
from pyrouge import Rouge155
stopset=set(stopwords.words('english'))

# Indexing Newsgroups in elasticsearch

In [2]:
es = Elasticsearch(['localhost'],
    http_auth=('elastic', 'elastic'),
    scheme="http",
    port=9200,)

In [3]:
ng = fetch_20newsgroups(subset='all',remove=('headers'))
ng_X = ng.data

ngdocs={}
for i in range(len(ng_X)):
    ngdocs[i]=ng_X[i].replace('\n','')

In [97]:
ng = fetch_20newsgroups(subset='all',remove=('headers','footers'))
#ng_X=([re.sub("[^a-zA-Z ]",""," ".join(k.replace('\n','').lower() for k in i.split())) for i in ng.data])
ng_X=ng.data

# Use tf-idf features for NMF.
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, min_df=3,max_features=10000,stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(ng_X)


# Use tf (raw term count) features for LDA.
tf_vectorizer = CountVectorizer(max_df=0.5, min_df=3,max_features=10000,stop_words='english')
tf = tf_vectorizer.fit_transform(ng_X)

lda = LatentDirichletAllocation(n_components=20, max_iter=50, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf)

In [99]:
es.indices.refresh(index="newsgroups-index")

res = es.search(index="newsgroups-index", body={"query": {"match_all": {}}})

In [None]:
klscores=[]
ldascores=[]

for i in range(len(ng_X)):

    origdoc = " ".join([j for j in ng_X[i].split()])

    klsummary = " ".join(kl_summarization(sent_tokenize(origdoc),len(sent_tokenize(origdoc))*0.5))
    ldasummary = " ".join(topic_summarization(sent_tokenize(origdoc),len(sent_tokenize(origdoc))*0.5))
    
    es.update(index="newsgroups-index", doc_type='newsgroups', id=i, 
      body={"doc": {'doc_id': i,'LDA_Summary': ldasummary, 'KL_Summary':klsummary}})
    

# Indexing DUC dataset in elasticsearch

In [51]:
ducpath="/Users/sasankauppu/Desktop/Data Mining CS6220/DataMining/DUC2001/"
ducdocs={}
ducsum={}
for f in os.listdir(ducpath+"raw_data"):
    if f!=".DS_Store":
        ducdocs[f.lower()]=BeautifulSoup(open(ducpath+'raw_data/'+f,'r').read(), "lxml").find("text").text.replace('\n','')
    
for f in os.listdir(ducpath+"Summaries"):
    if f!=".DS_Store":
        ducsum[f[:-4].lower()]=open(ducpath+'Summaries/'+f,'r').read().replace('\n','')

In [37]:
duc_X = ducdocs.values()

# Use tf-idf features for NMF.
tfidf_vectorizer_duc = TfidfVectorizer(max_df=0.5, min_df=3,stop_words='english')
tfidf_duc = tfidf_vectorizer_duc.fit_transform(duc_X)

# Use tf (raw term count) features for LDA.
tf_vectorizer_duc = CountVectorizer(max_df=0.5, min_df=3,stop_words='english')
tf_duc = tf_vectorizer_duc.fit_transform(duc_X)

lda_duc = LatentDirichletAllocation(n_components=20, max_iter=50, 
      learning_method='online', learning_offset=50., random_state=0).fit(tf_duc)

In [None]:
klscores=[]
ldascores=[]

for i in ducdocs:

    origdoc = " ".join([j for j in ducdocs[i].split()])

    klsummary = " ".join(kl_summarization(sent_tokenize(origdoc),len(sent_tokenize(origdoc))*0.5))
    ldasummary = " ".join(topic_summarization(sent_tokenize(origdoc),len(sent_tokenize(origdoc))*0.5))
    
    if(i in ducsum):
        originalsum = " ".join([j for j in ducsum[i].split()])

        klscores.append(rouge_n(sent_tokenize(klsummary),sent_tokenize(originalsum)))
        ldascores.append(rouge_n(sent_tokenize(ldasummary),sent_tokenize(originalsum))

    es.update(index="duc-index", doc_type='ducdocs', id=i, 
      body={"doc": {'doc_id': i,'LDA_Summary': ldasummary, 'KL_Summary':klsummary}})


In [80]:
kltot=[sum(x) for x in zip(*klscores)]
ldatot=[sum(x) for x in zip(*ldascores)]

print "KL Precision: ",kltot[0]/len(klscores),"Recall: ",kltot[1]/len(klscores),"FScore: ",kltot[2]/len(klscores)
print "LDA Precision: ",ldatot[0]/len(ldascores),"Recall: ",ldatot[1]/len(ldascores),"FScore: ",ldatot[2]/len(ldascores)

KL Precision:  0.967942879516 Recall:  0.398614247811 FScore:  0.558958968113
LDA Precision:  0.970624265337 Recall:  0.502696877352 FScore:  0.653761659023


In [8]:
def merged_freq(wlist1, wlist2):
    wc1 = compute_word_freq(wlist1)
    wc2 = compute_word_freq(wlist2)
    merged = wc1.copy()

    for k in wc2:
        if k in merged: 
            merged[k] += wc2[k]
        else:
            merged[k] = wc2[k]

    for k in merged:
        merged[k] /= float(len(wlist1) + len(wlist2))

    return merged
    
def compute_word_freq(wlist1):
    word_freq = {}
    for w in wlist1:
        word_freq[w] = word_freq.get(w, 0) + 1
    return word_freq


In [27]:
def kl_divergence(summary_freq, doc_freq):
    sum_val = 0
    for w in summary_freq:
        frequency = doc_freq.get(w)
        if frequency:
            sum_val += frequency * math.log(frequency / summary_freq[w])
    return sum_val


def kl_summarization(sentences,summary_length):
    word_freq = [w.lower() for s in sentences for w in s.split()]
    wcount = len(word_freq)
    word_freq = compute_word_freq(word_freq)
    word_freq = dict((w, float(f) / wcount) for w, f in word_freq.items())
    
    klsummary = {}
    summary = []
    vocab = [[w.lower() for w in s.split()] for s in sentences]
    origsentences = sentences[:]

    while len(sentences) > 0 and len(klsummary)<=summary_length:
        kls = []
        summarysplit = [w for s in summary for w in s.split()]

        for s in vocab:
            joint_freq = merged_freq(s, summarysplit)
            kls.append(kl_divergence(joint_freq, word_freq))

        ind = kls.index(min(kls))
        top_sentence = sentences.pop(ind)
        del vocab[ind]
        summary.append(top_sentence)

        klsummary[top_sentence] =  origsentences.index(top_sentence)
    

    return sorted(klsummary, key=klsummary.get)

In [98]:
def kl_divergence_lists(summary_freq, doc_freq):
    sum_val = 0
    for w in range(len(summary_freq)):
        frequency = doc_freq[w]
        sum_val += frequency * math.log(frequency / summary_freq[w])
    return sum_val

def topic_summarization(sentences,summary_length):
    word_freq = lda.transform(tfidf_vectorizer.transform(["".join(sentences)]))
    
    klsummary = {}
    summary = []
    vocab = [s for s in sentences]
    origsentences = sentences[:]

    while len(sentences) > 0 and len(klsummary)<=summary_length:
        kls = []

        for s in vocab:
            joint_freq = lda.transform(tfidf_vectorizer.transform(["".join(summary)+s]))
            
            kls.append(kl_divergence_lists(joint_freq[0], word_freq[0]))
            
        ind = kls.index(min(kls))
        top_sentence = sentences.pop(ind)
        del vocab[ind]
        summary.append(top_sentence)

        klsummary[top_sentence] =  origsentences.index(top_sentence)
    

    return sorted(klsummary, key=klsummary.get)

In [76]:

from nltk.tokenize import word_tokenize


def generate_ngrams(n, text):
    ngram_set = set()
    text_length = len(text)
    max_index_ngram_start = text_length - n
    for i in range(max_index_ngram_start + 1):
        ngram_set.add(tuple(text[i:i + n]))
    return ngram_set



def ngrams(n, sentences):
    words = set()
    for sentence in sentences:
        sentence = " ".join([w for w in sentence.split() if not w in stopset])
        words.update(generate_ngrams(n, sentence.split()))
        
    return words


def rouge_n(evaluated_sentences, reference_sentences, n=2):
    
    evaluated_ngrams = ngrams(n, evaluated_sentences)
    reference_ngrams = ngrams(n, reference_sentences)
    
    overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams)

    precision = float(len(overlapping_ngrams)) / len(evaluated_ngrams)
    recall = float(len(overlapping_ngrams)) / len(reference_ngrams)
    fscore = (2*precision*recall)/(precision+recall)
    
    return (precision,recall,fscore)