## ROUGE script

A script to run the ROUGE evaluation metric. Model outputs must be reconstructed before the ROUGE score is calculated, so the script provides a different reconstruction function for each type of model being evaluated.

ROUGE is implemented using the easy-rouge library: <https://pypi.org/project/easy-rouge>

In [None]:
from nltk import word_tokenize
from rouge.rouge import rouge_n_sentence_level
from rouge.rouge import rouge_l_sentence_level
from LSTM_reconstruct import LSTM_reconstruct
from BERT_reconstruct import BERT_reconstruct
from BERT_reconstruct import BERT_rules_reconstruct
import pickle

In [None]:
# Read the test data

def read_data(test_file):
    """Load and return the pickled object stored at ``test_file``.

    The file is opened in binary mode and is closed again once the object
    has been unpickled.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    with open(test_file, mode='rb') as pickled:
        return pickle.load(pickled)

In [None]:
# Location of the pickled model output; the placeholder is empty and must be
# filled in before this cell is run.
test_file = "" # Path to test output
# Unpickle the model output; the reconstruction cells below read it as test_data.
test_data = read_data(test_file)

In [None]:
# If the output is from a BERT_rules ensemble model: reconstruct those outputs
# This cell sets pred_reconstructions and target_reconstructions, which the
# ROUGE cells further down read.

# Second pickled input for the ensemble; the path placeholder is empty and
# must be filled in before this cell is run.
rules_data = read_data("") # Path to rules output

# BERT_rules_reconstruct (imported from BERT_reconstruct) returns three values;
# `originals` is not read by any of the cells shown in this script.
pred_reconstructions, target_reconstructions, originals = BERT_rules_reconstruct(test_data, rules_data)

In [None]:
# If the output is from a LSTM model: reconstruct those outputs
#
# Every prediction is passed to LSTM_reconstruct together with the target at
# the same index; the call returns the reconstructed prediction and target,
# which are collected into the two lists read by the ROUGE cells.

lstm_predictions = test_data['predictions']
lstm_targets = test_data['targets']

pred_reconstructions, target_reconstructions = [], []

for idx in range(len(lstm_predictions)):
    reconstructed = LSTM_reconstruct(lstm_predictions[idx], lstm_targets[idx])
    rebuilt_pred, rebuilt_target = reconstructed
    pred_reconstructions.append(rebuilt_pred)
    target_reconstructions.append(rebuilt_target)

In [None]:
# If the output is from a BERT model: reconstruct those outputs
# This cell sets pred_reconstructions and target_reconstructions, which the
# ROUGE cells further down read. BERT_reconstruct returns three values; the
# third is discarded here.

pred_reconstructions, target_reconstructions, _ = BERT_reconstruct(test_data)

In [None]:
# If the output is from a Rules based model: reconstruct those outputs
#
# No reconstruction function is applied here: the stored targets and
# predictions are copied unchanged, item by item, into the two lists read by
# the ROUGE cells. The number of items copied is the number of predictions.

rule_predictions = test_data['predictions']
rule_targets = test_data['targets']
n_examples = len(rule_predictions)

target_reconstructions = [rule_targets[idx] for idx in range(n_examples)]
pred_reconstructions = [rule_predictions[idx] for idx in range(n_examples)]

In [None]:
# Calculate average ROUGE-n
#
# Tokenises each reconstructed prediction and the target at the same index,
# scores the pair with rouge_n_sentence_level, and prints the mean of each of
# the three returned values over all pairs, one per line, in the order
# recall, precision, F-measure.

# n-gram order passed to rouge_n_sentence_level.
ROUGE_N = 2

count = len(pred_reconstructions)
if count == 0:
    # Without this guard the averages below fail with a bare ZeroDivisionError
    # that does not say what went wrong.
    raise ValueError("Cannot average ROUGE-n: pred_reconstructions is empty")

rolling_ROUGEn_r = 0
rolling_ROUGEn_p = 0
rolling_ROUGEn_f = 0

for i in range(count):
    token_target = word_tokenize(target_reconstructions[i])
    token_prediction = word_tokenize(pred_reconstructions[i])
    # The prediction tokens go first and the target tokens second; the third
    # returned value is the one accumulated as the F-measure.
    recall, precision, rouge = rouge_n_sentence_level(token_prediction, token_target, ROUGE_N)
    rolling_ROUGEn_r += recall
    rolling_ROUGEn_p += precision
    rolling_ROUGEn_f += rouge

# Per-pair scores are averaged with equal weight for every pair.
ROUGEn_r = rolling_ROUGEn_r/count
ROUGEn_p = rolling_ROUGEn_p/count
ROUGEn_f = rolling_ROUGEn_f/count

print(ROUGEn_r)
print(ROUGEn_p)
print(ROUGEn_f)

In [None]:
# Calculate average ROUGE-l
#
# Tokenises each reconstructed prediction and the target at the same index,
# scores the pair with rouge_l_sentence_level, and prints the mean of each of
# the three returned values over all pairs, one per line, in the order
# recall, precision, F-measure.

count = len(pred_reconstructions)
if count == 0:
    # Without this guard the averages below fail with a bare ZeroDivisionError
    # that does not say what went wrong.
    raise ValueError("Cannot average ROUGE-l: pred_reconstructions is empty")

rolling_ROUGEl_r = 0
rolling_ROUGEl_p = 0
rolling_ROUGEl_f = 0

for i in range(count):
    token_target = word_tokenize(target_reconstructions[i])
    token_prediction = word_tokenize(pred_reconstructions[i])
    # The prediction tokens go first and the target tokens second; the third
    # returned value is the one accumulated as the F-measure.
    recall, precision, rouge = rouge_l_sentence_level(token_prediction, token_target)
    rolling_ROUGEl_r += recall
    rolling_ROUGEl_p += precision
    rolling_ROUGEl_f += rouge

# Per-pair scores are averaged with equal weight for every pair.
ROUGEl_r = rolling_ROUGEl_r/count
ROUGEl_p = rolling_ROUGEl_p/count
ROUGEl_f = rolling_ROUGEl_f/count

print(ROUGEl_r)
print(ROUGEl_p)
print(ROUGEl_f)