In [1]:
import json
import numpy as np
import networkx as nx
import os
import time
from datetime import timedelta
import copy
import gensim
from rouge_score import rouge_scorer
import math

# Classic Textrank

In [2]:
test_dir = os.listdir("../liputan6_dataset/test") #fill with the location of test directory from liputan6 dataset
N = len(test_dir)
idf_dict=dict()

for file in test_dir:
    word_list = set()
    filename = "../liputan6_dataset/test/"+file #fill with the location of test directory from liputan6 dataset
    data = json.load(open(filename))
    article = data['clean_article']
    #get list of word
    for sentence in article:
        word_list.update(sentence)
    
    #check if word is exists in document
    for word in word_list:
        word = word.lower()
        if word not in idf_dict.keys():
            idf_dict[word] = 1
        else:
            idf_dict[word] += 1
for word, val in idf_dict.items():
    idf_dict[word] = math.log(N / float(val))

#creating bag of word dictionary
bow_dict= copy.copy(idf_dict)
bow_length = len(bow_dict.keys())
index = 0
for key in bow_dict.keys():
    bow_dict[key] = index
    index+=1

In [4]:
start_time = time.time()
print('Testing Textrank...')
test_dir = os.listdir("../liputan6_dataset/test") #fill with the location of test directory from liputan6 dataset
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=False)
f1_rouge_1 = 0
f1_rouge_2 = 0
f1_rouge_l = 0
file_counter = 0
right_answers = 0

#file iteration
for file in test_dir:
    filename = "../liputan6_dataset/test/"+file #fill with the location of test directory from liputan6 dataset
    data = json.load(open(filename))
    gold_labels = data["extractive_summary"]
    sentence_vector = []
    article = data['clean_article']
    gold_labels_words=[]
    real_sentences = []
    file_counter+=1

    #add sentences from article
    for i in range(len(article)):
        #removing news format in first sentence
        if i == 0:
            if ":" in article[i]:
                sentence = article[i][(article[i].index(":")+1):]
            else:
                sentence = article[i]                
        #removing news format in between first and last sentence
        elif i == len(article)-1:
            if "(" in article[i]:
                sentence = article[i][:-(article[i][::-1].index("("))-1]
            elif article[i].count(".") > 1:
                sentence = article[i][:-(article[i][::-1][1:].index("."))-1]
            else:
                sentence = article[i]
        #removing news format in last sentence
        else:
            sentence = article[i]
        real_sentences.append(" ".join(sentence[:-1]))


    #add sentences from gold label    
    for i in range(len(gold_labels)):
        gold_labels_words.append(real_sentences[gold_labels[i]])

    #produce sentence vectors
    for i in range(len(article)):
        sum_word_vector = [0] * bow_length
        if i == 0:
            if ":" in article[i]:
                sentence = article[i][(article[i].index(":")+1):]
            else:
                sentence = article[i]
        elif i == len(article)-1:
            if "(" in article[i]:
                sentence = article[i][:-(article[i][::-1].index("("))-1]
            elif article[i].count(".") > 1:
                sentence = article[i][:-(article[i][::-1][1:].index("."))-1]
            else:
                sentence = article[i]
        else:
            sentence = article[i]
        if len(sentence) != 0:
            for word in sentence:
                if word!= "":
                    word = word.lower()
                    sum_word_vector[bow_dict[word]]+=1
        sentence_vector.append(sum_word_vector)
    sentence_vector = np.array(sentence_vector)

    #similarity Function
    sim_mat = np.zeros([len(sentence_vector),len(sentence_vector)])
    
    for i in range(len(article)):
        for j in range(len(article)):
            if i != j:
                for token in article[i]:
                    if token.lower() in " ".join(article[j]).lower():
                        sim_mat[i][j] += 1
                sim_mat[i][j] = sim_mat[i][j]/(math.log(len(article[i])) + math.log(len(article[j])))

    #ranking function (pagerank algortithm)
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank_numpy(nx_graph)

    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(real_sentences)), reverse=True)
    
    #extract top len(gold_labels) sentences as the summary
    sentence_rank = []
    summary = []
    for i in range(len(gold_labels)):
        sentence_rank.append(real_sentences.index(ranked_sentences[i][1]))


    sentence_rank.sort()
    for sentence in sentence_rank:
        if sentence in gold_labels:
            right_answers+=1
        summary.append(real_sentences[sentence])

    #count Rouge Score
    temp_f1_rouge_1 = 0.0
    temp_f1_rouge_2 = 0.0
    temp_f1_rouge_l = 0.0
    
    rouge_res=[]
    
    for i in range(len(gold_labels)):
        rouge_res.append(scorer.score(gold_labels_words[i],summary[i]))
        temp_f1_rouge_1 += rouge_res[i]["rouge1"].fmeasure
        temp_f1_rouge_2 += rouge_res[i]["rouge2"].fmeasure
        temp_f1_rouge_l += rouge_res[i]["rougeL"].fmeasure
        
    f1_rouge_1 += temp_f1_rouge_1/len(gold_labels)
    f1_rouge_2 += temp_f1_rouge_2/len(gold_labels)
    f1_rouge_l += temp_f1_rouge_l/len(gold_labels)

#showing evaluation result
f1_rouge_1 = f1_rouge_1/file_counter
f1_rouge_2 = f1_rouge_2/file_counter
f1_rouge_l = f1_rouge_l/file_counter

finish_time = time.time()

print("Result:")
print("F1 ROUGE-1:",str(f1_rouge_1))
print("F1 ROUGE-2:",str(f1_rouge_2))
print("F1 ROUGE-L:",str(f1_rouge_l))
print("Correct Answers:",str(right_answers))
print("Tested File",str(file_counter))
print('Finished. Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))

Testing Textrank...


  scores = nx.pagerank_numpy(nx_graph)
NetworkX version 3.0.
  M = google_matrix(


Result:
F1 ROUGE-1: 0.25390141171565805
F1 ROUGE-2: 0.1668602071434809
F1 ROUGE-L: 0.23598330164946166
Correct Answers: 4555
Tested File 10972
Finished. Elapsed time: 0:07:23.044066
