In [110]:
import string, re, os, math, json

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from rouge import Rouge

In [111]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [112]:
import tensorflow as tf
import tensorflow_hub as hub
elmo = hub.load("../../tfhub/elmo3")

# Preprocessing functions

In [113]:
#opens the files
def open_file(filename):
    file = open(filename, encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [114]:
#splits the text from the files into stories and highlights (reference summaries)
def split_text(text):
    stories = list()
    index = text.find('@highlight')
    doc, highlights = text[:index], text[index:].split('@highlight')
    stories.append({'story':doc,'highlights':highlights})

    return stories

In [115]:
#tokenizes the text to create sentence tokens, word tokens, and removes stop words
def tokenize(text):
    #we want to keep the processed and unprocessed text
    processed = list()
    unprocessed = list()
    #tokenize sentences
    sentences = sent_tokenize(text)
    for sentence in sentences:
        new_sent = list()
        #tokenize the words in the sentences
        unprocessed.append(word_tokenize(sentence))
        #remove punctuation
        tokens = word_tokenize(sentence.lower().translate(str.maketrans("","",string.punctuation)))
        #remove stop words from the sentences
        filtered_sent = [word for word in tokens if word not in stopwords.words('english')]
        for w in filtered_sent:
            new_sent.append(w)
            #new_sent.append(ps.stem(w))
            #print(w, " : ", ps.stem(w))
        processed.append(new_sent)

    #return the processed text, the original text, and the tokenized sentences in the text
    return processed, unprocessed, sentences

# Word frequency algorithm

This algorithm is used to extract topic words from the articles. It accepts a text and returns a list of the topic words in descending order.

In [116]:
#extract topic words using word frequency
def word_frequency(text):
    vocabulary = {}
    total_word_length = 0
    for x in text:
        for y in x:
            total_word_length += 1
            if y in vocabulary:
                vocabulary[y] += 1
            else:
                vocabulary[y] = 1
    #add only the top 10% of words to the list
    highest = [(vocabulary[key], key) for key in vocabulary]
    highest.sort()
    highest.reverse()
    total = len(highest)
    top = total * 0.1
    topic = list()
    for x in range(int(top)):
        topic.append(highest[x])
        #print(highest[x])

    return topic

# Centroid and sentence embedding functions

In [117]:
#makes centroid embeddings from word2vec vectors
def make_vec(words):
    embedding = [1]
    for x in range(len(words)):
        if words[x][1] in wv:
            embedding += wv[words[x][1]]
    return embedding

#makes the sentence vectors from word2vec embeddings
def sent_vec(sentences):
    embedding = list()
    #for each sentence
    for x in range(len(sentences)):
        temp = [1]
        #if the word is in word2vec, add to the embedding
        for word in sentences[x]:
            if word in wv:
                temp += wv[word]
        embedding.append(temp)
    return embedding

#make the centroid vectors from elmo embeddings
def make_elmo_centroid(words):
    temp = ""
    for x in range(len(words)):
        temp += words[x][1] + " "
    embedding = elmo.signatures["default"](tf.constant([temp]))["elmo"]
    return embedding

#make the sentence vectors from elmo embeddings
def make_elmo_vec(words):
    embedding = elmo.signatures["default"](tf.constant(words))["elmo"]
    return embedding

# Cosine similarity functions

In [118]:
#get the cosine similarity of each sentence in a document and the centroid for each document
def cos_sim(centroid,corpus):
    cos_sim_sents = []
    #for each doucment
    for x in range(len(corpus)):
        #arrays should be np
        centroid_vec = np.array([centroid[x]])
        sentences = []
        #for each sentence in the document
        for y in range(len(corpus[x])):
            if len(corpus[x][y]) != 1:
                sentence = np.array([corpus[x][y]])
                #calculate the cosine similarity of the sentence and the centroid
                sentences.append((cosine_similarity(centroid_vec,sentence).tolist(),y))
            #sort sentences from high to low
            sentences.sort(reverse=True)
        cos_sim_sents.append(sentences)

    return cos_sim_sents

def elmo_cos_sim(centroid,corpus):
    cos_sim_sents = []
    for x in range(len(corpus)):
        cur_centroid = 0
        for y in range(len(centroid[x][0])):
            cur_centroid = np.add(cur_centroid, centroid[x][0][y])
        cur_centroid = np.array([cur_centroid])
        sentences = []
        for y in range(len(corpus[x])):
            cur_sent = 0
            
            for z in range(len(corpus[x][y])):
                cur_sent = np.add(cur_sent, corpus[x][y][z])
            cur_sent = np.array([cur_sent])
            
            sentences.append((cosine_similarity(cur_centroid,cur_sent).tolist(),y))
            sentences.sort(reverse=True)
        cos_sim_sents.append(sentences)
    return cos_sim_sents

# Prepare dataset

In [119]:
directory = '../data/'
files = os.listdir(directory)
stories = []
for file in files:
    filename = directory + '/' + file
    text = open_file(filename)
    stories.append(split_text(text))
    
#these hold the preprocessed and unprocessed summaries
corpus_p = list()
corpus_u = list()
temp_stories = list()
corpus_sentences = list()

#for each document, preprocess the document by tokenizing the sentences and words and removing stop words
for x in range(len(stories)):
    temp_processed, temp_unprocessed, sentences = tokenize(stories[x][0]['story'])
    corpus_p.append(temp_processed)
    corpus_u.append(temp_unprocessed)
    corpus_sentences.append(sentences)

# Topic word extraction

Topic words for each article in the corpus are extracted

In [120]:
#run the word frequency algorithm
wf_topic = list()
#get topic words for each document
for x in range(len(corpus_p)):
    wf_topic.append(word_frequency(corpus_p[x]))


# ELMo centroid and sentence embeddings
1. create the centroid embeddings for each article using the topic words
2. create the sentence embeddings for each article
3. calculate the cosine similarity between the centroid embeddings and each sentence embedding in each article

In [121]:
#create the centroid embeddings for each article
elmo_centroid = list()
for x in range(len(wf_topic)):
    elmo_centroid.append(make_elmo_centroid(wf_topic[x]))
    
#create vectors for every sentence in each article
elmo_sentences = list()
for x in range(len(corpus_sentences)):
    elmo_sentences.append(make_elmo_vec(corpus_sentences[x]))
    
#calculate the cosine similarity between the centroids and the sentences
elmo_cosine = elmo_cos_sim(elmo_centroid,elmo_sentences)

# Word2Vec centroid and sentence embeddings
1. create the centroid embeddings for each article using the topic words
2. create the sentence embeddings for each article
3. calculate the cosine similarity between the centroid embeddings and each sentence embedding in each article

In [122]:
#create the centroid embeddings for each article
wv_centroid = list()
for x in range(len(wf_topic)):
    wv_centroid.append(make_vec(wf_topic[x]))
    
#create vectors for every sentence in each article
wv_sentences = list()
for x in range(len(corpus_p)):
    wv_sentences.append(sent_vec(corpus_p[x]))
    
#calculate the cosine similarity between the setneces and the centroids
wv_cosine = cos_sim(wv_centroid,wv_sentences)

# Create summaries and write them to summaries.json 

In [123]:
sum_dict = {}
for x in range(len(stories)):
    tempdict = {}

    #this tells us how many sentences to extract from each document, it needs to be the same number of
    #sentences in the highlights because we are using ROUGE to compare them.
    cur = len(stories[x][0]['highlights'])
    
    tempstr = ""
    #add the reference summary to the json file
    for y in range(1,cur):
        tempstr += stories[x][0]['highlights'][y].strip('\n') + ". "
    #store the reference summary in the temporary dictionary
    tempdict['reference'] = tempstr
    
    #add the word2vec summary to the json file
    tempstr = ""
    for y in range(cur-1):
        #the summary is created by using the index of the highest scoring sentences
        #to get the correct sentences from the unprocessed article
        tempstr += TreebankWordDetokenizer().detokenize(corpus_u[x][wv_cosine[x][y][1]]) + " "
    #store the word2vec summary in the temporary dictionary
    tempdict['word2vec'] = tempstr
    
    
    #add the elmo summary to the json file
    tempstr = ""
    for y in range(cur-1):
        #the summary is created by using the index of the highest scoring sentences
        #to get the correct sentences from the unprocessed article
        tempstr += TreebankWordDetokenizer().detokenize(corpus_u[x][elmo_cosine[x][y][1]]) + " "
    #store the elmo summary in the temporary dictionary
    tempdict['elmo'] = tempstr
    
    #store the temporary dictionary in the summary dictionary which will contain all summaries
    sum_dict[str(x)] = tempdict
    
#write the summary dictionary to the json file
with open("summaries.json", "w") as outfile:
    json.dump(sum_dict, outfile,indent=4)

In [124]:
#this code reopens the json file and stores the contents in arrays
#it is redundant to do this, but oh well
with open('summaries.json') as json_file:
    data = json.load(json_file)
    
references = []
wv_summaries = []
elmo_summaries = []
for x in data:
    references.append(data[x]['reference'].translate(str.maketrans("","",string.punctuation)))
    wv_summaries.append(data[x]['word2vec'].translate(str.maketrans("","",string.punctuation)))
    elmo_summaries.append(data[x]['elmo'].translate(str.maketrans("","",string.punctuation)))

# Calculate the average ROUGE scores for the Word2Vec and ELMo models

In [125]:
#make a rouge class object
rouge = Rouge()

In [126]:
#these will hold the rouge scores for the word2vec and elmo models
wv_rouge1 = [0,0,0]
wv_rouge2 = [0,0,0]
wv_rougeL = [0,0,0]
elmo_rouge1 = [0,0,0]
elmo_rouge2 = [0,0,0]
elmo_rougeL = [0,0,0]

#sum the rouge scores for word2vec and elmo summaries
for x in range(len(references)):
    #rouge scores for word2vec
    scores = rouge.get_scores(references[x],wv_summaries[x])
    wv_rouge1[0] += scores[0]['rouge-1']['r']
    wv_rouge1[1] += scores[0]['rouge-1']['p']
    wv_rouge1[2] += scores[0]['rouge-1']['f']

    wv_rouge2[0] += scores[0]['rouge-2']['r']
    wv_rouge2[1] += scores[0]['rouge-2']['p']
    wv_rouge2[2] += scores[0]['rouge-2']['f']

    wv_rougeL[0] += scores[0]['rouge-l']['r']
    wv_rougeL[1] += scores[0]['rouge-l']['p']
    wv_rougeL[2] += scores[0]['rouge-l']['f']
    
    #rouge scores for elmo
    scores = rouge.get_scores(references[x],elmo_summaries[x])
    elmo_rouge1[0] += scores[0]['rouge-1']['r']
    elmo_rouge1[1] += scores[0]['rouge-1']['p']
    elmo_rouge1[2] += scores[0]['rouge-1']['f']

    elmo_rouge2[0] += scores[0]['rouge-2']['r']
    elmo_rouge2[1] += scores[0]['rouge-2']['p']
    elmo_rouge2[2] += scores[0]['rouge-2']['f']

    elmo_rougeL[0] += scores[0]['rouge-l']['r']
    elmo_rougeL[1] += scores[0]['rouge-l']['p']
    elmo_rougeL[2] += scores[0]['rouge-l']['f']

In [127]:
#calculate the average of the rouge scores for word2vec and elmo
count = len(references)
for x in range(3):
    wv_rouge1[x] = wv_rouge1[x]/count
    wv_rouge2[x] = wv_rouge2[x]/count
    wv_rougeL[x] = wv_rougeL[x]/count

    elmo_rouge1[x] = elmo_rouge1[x]/count
    elmo_rouge2[x] = elmo_rouge2[x]/count
    elmo_rougeL[x] = elmo_rougeL[x]/count

In [128]:
print('\t\t\trecall\t\t     precision\t\t  f1')
print('ROUGE-1 w2v:\t', wv_rouge1)
print('ROUGE-1 elmo:\t', elmo_rouge1)
print()
print('ROUGE-2 w2v:\t',wv_rouge2)
print('ROUGE-2 elmo:\t', elmo_rouge2)
print()
print('ROUGE-L w2v:\t',wv_rougeL)
print('ROUGE-L elmo:\t',elmo_rougeL)

			recall		     precision		  f1
ROUGE-1 w2v:	 [0.12645142914827953, 0.30833395268177877, 0.17748443126713617]
ROUGE-1 elmo:	 [0.14357858136674712, 0.3283351913786697, 0.19896955416670864]

ROUGE-2 w2v:	 [0.017095959595959595, 0.05228571428571429, 0.02543093416917817]
ROUGE-2 elmo:	 [0.017872055104239545, 0.053000000000000005, 0.026640326531552093]

ROUGE-L w2v:	 [0.06810666695509215, 0.1671869193608324, 0.09584485512889533]
ROUGE-L elmo:	 [0.07769214415083574, 0.18131797349188652, 0.10835475056384565]
