In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import os
import time
from datetime import timedelta
from rouge_score import rouge_scorer
import math

# BERT_LARGE Weighted

In [3]:
from transformers import FeatureExtractionPipeline

# Both the tokenizer and the encoder are loaded from the same checkpoint id.
_CHECKPOINT = "indobenchmark/indobert-large-p2"

tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)

# Feature-extraction pipeline built from the model/tokenizer pair above;
# the evaluation cell calls it as nlp(text).
nlp = FeatureExtractionPipeline(model=model, tokenizer=tokenizer)

# Make IDF Dictionary

In [None]:
# Build an inverse-document-frequency table over BERT token ids.
# For every token id t: bert_idf_dict[t] = log(N / df(t)), where N is the
# number of files in the test directory and df(t) is the number of those
# files whose article contains t at least once.
test_path = "../liputan6_dataset/test" #fill with the location of test directory from liputan6 dataset
test_dir = os.listdir(test_path)
N = len(test_dir)

bert_idf_dict = dict()
for file in test_dir:
    # Context manager so the handle is closed after each document instead of
    # being left for the garbage collector.
    with open(os.path.join(test_path, file), encoding="utf-8") as fp:
        data = json.load(fp)
    article = data['clean_article']

    # Distinct token ids of this document (a set, so each id counts once).
    token_list = set()
    for sentence in article:
        sentence = " ".join(sentence).lower()
        # encode() returns the ids as a plain list (special tokens included),
        # which is all the former tensor -> numpy round-trip was used for.
        token_list.update(tokenizer.encode(sentence))

    # Document frequency: +1 for every id that occurs in this document.
    for token in token_list:
        bert_idf_dict[token] = bert_idf_dict.get(token, 0) + 1

# Turn the document frequencies into IDF values in place (keys unchanged).
for token, val in bert_idf_dict.items():
    bert_idf_dict[token] = math.log(N / float(val))

In [None]:
def strip_news_format(article, i):
    """Return the word list of sentence ``i`` with the news formatting removed.

    ``article`` is a list of sentences, each sentence a list of word strings.

    * First sentence (``i == 0``): everything up to and including the first
      ":" word is dropped.
    * Last sentence: if it contains "(", everything from the last "(" onwards
      is dropped; otherwise, if it contains more than one ".", it is cut after
      the last "." that precedes the final word.
    * Any other sentence is returned unchanged.
    """
    words = article[i]
    if i == 0:
        if ":" in words:
            return words[words.index(":") + 1:]
        return words
    if i == len(article) - 1:
        if "(" in words:
            return words[:-(words[::-1].index("(")) - 1]
        if words.count(".") > 1:
            return words[:-(words[::-1][1:].index(".")) - 1]
        return words
    return words


start_time = time.time()
print('Testing BERT_LARGE Model...')
test_dir = os.listdir("../liputan6_dataset/test") #fill with the location of test directory from liputan6 dataset
scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=False)
f1_rouge_1 = 0
f1_rouge_2 = 0
f1_rouge_l = 0
file_counter = 0
right_answers = 0
vector_size = 1024

# nx.pagerank_numpy was removed in NetworkX 3.0; keep using it where it still
# exists and fall back to nx.pagerank otherwise.
pagerank = getattr(nx, "pagerank_numpy", nx.pagerank)

#file iteration
for file in test_dir:
    filename = "../liputan6_dataset/test/"+file #fill with the location of test directory from liputan6 dataset
    with open(filename, encoding="utf-8") as fp:
        data = json.load(fp)
    gold_labels = data["extractive_summary"]
    article = data['clean_article']

    # A document without gold sentences cannot be scored (the per-document
    # average would divide by zero), so it is skipped and not counted.
    if len(gold_labels) == 0:
        continue
    file_counter += 1

    # Clean every sentence once; the three stages below all reuse this list.
    cleaned_sentences = [strip_news_format(article, i) for i in range(len(article))]

    # Text form of each sentence; the final word of every sentence is dropped.
    real_sentences = [" ".join(words[:-1]) for words in cleaned_sentences]

    #add sentences from gold label
    gold_labels_words = [real_sentences[label] for label in gold_labels]

    #make TF dictionary of article
    bert_tf_dict = dict()
    M = 0
    document_token_list = []
    for words in cleaned_sentences:
        if len(words) != 0:
            # [1:-1] strips the first and last id that encode() adds.
            token_list = tokenizer.encode(" ".join(words).lower())[1:-1]
        else:
            token_list = []
        # Always append, so document_token_list[i] stays aligned with
        # sentence i even when a cleaned sentence is empty.
        document_token_list.append(token_list)
        M += len(token_list)
        for token in token_list:
            bert_tf_dict[token] = bert_tf_dict.get(token, 0) + 1
    for key in bert_tf_dict:
        bert_tf_dict[key] = bert_tf_dict[key]/M

    #produce sentence vectors
    # Sentence vector = mean over tokens of (token embedding * TF * IDF);
    # empty sentences keep an all-zero vector.
    sentence_vector = np.zeros((len(article), vector_size))
    for i, words in enumerate(cleaned_sentences):
        if len(words) == 0:
            continue
        output = nlp(" ".join(words).lower())[0]
        # First and last rows are skipped, matching the [1:-1] slice above.
        embeddings = np.asarray(output[1:-1], dtype=float)
        n_tokens = len(embeddings)
        if n_tokens == 0:
            continue
        token_ids = document_token_list[i]
        weights = np.array(
            [bert_tf_dict[token_ids[j]] * bert_idf_dict[token_ids[j]] for j in range(n_tokens)]
        )
        sentence_vector[i] = (weights @ embeddings) / n_tokens

    #similarity Function
    # Pairwise cosine similarity in a single call; self-similarity is zeroed
    # so the graph has no self-loops.
    sim_mat = np.zeros([len(sentence_vector), len(sentence_vector)])
    if len(sentence_vector) > 0:
        sim_mat = cosine_similarity(sentence_vector)
        np.fill_diagonal(sim_mat, 0.0)

    #ranking function (pagerank algorithm)
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = pagerank(nx_graph)

    # Rank sentence *indices* (score first, then text, both descending).
    # Ranking indices rather than looking the text up again keeps duplicate
    # sentences distinct.
    ranked_indices = sorted(
        range(len(real_sentences)),
        key=lambda idx: (scores[idx], real_sentences[idx]),
        reverse=True,
    )

    #extract top len(gold_labels) sentences as the summary
    sentence_rank = sorted(ranked_indices[:len(gold_labels)])
    summary = []
    for idx in sentence_rank:
        if idx in gold_labels:
            right_answers += 1
        summary.append(real_sentences[idx])

    #count Rouge Score
    # The i-th gold sentence is scored against the i-th selected sentence and
    # the F-measures are averaged per document.
    temp_f1_rouge_1 = 0.0
    temp_f1_rouge_2 = 0.0
    temp_f1_rouge_l = 0.0
    for reference, candidate in zip(gold_labels_words, summary):
        pair_score = scorer.score(reference, candidate)
        temp_f1_rouge_1 += pair_score["rouge1"].fmeasure
        temp_f1_rouge_2 += pair_score["rouge2"].fmeasure
        temp_f1_rouge_l += pair_score["rougeL"].fmeasure

    f1_rouge_1 += temp_f1_rouge_1/len(gold_labels)
    f1_rouge_2 += temp_f1_rouge_2/len(gold_labels)
    f1_rouge_l += temp_f1_rouge_l/len(gold_labels)


#showing evaluation result
# Guarded so an empty test directory reports zeros instead of raising.
if file_counter > 0:
    f1_rouge_1 = f1_rouge_1/file_counter
    f1_rouge_2 = f1_rouge_2/file_counter
    f1_rouge_l = f1_rouge_l/file_counter

finish_time = time.time()

print("Result:")
print("F1 ROUGE-1:",str(f1_rouge_1))
print("F1 ROUGE-2:",str(f1_rouge_2))
print("F1 ROUGE-L:",str(f1_rouge_l))
print("Correct Answers:",str(right_answers))
print("Tested File",str(file_counter))
print('Finished. Elapsed time: {}'.format(timedelta(seconds=finish_time-start_time)))