In [1]:
import tensorflow as tf
import pyrouge
import os
import logging
import sys

  from ._conv import register_converters as _register_converters


In [2]:
# Run directory (relative to ../data/) whose results/ folder holds the
# reference and predicted summaries to be scored.
data_path = 'summarize_bert_baseline-cnn_dm-11-21-1/'

# Reference summaries live under results/ref, decoded ones under results/pred.
ref_dir, decode_dir = [
    os.path.join('../data/', data_path, leaf)
    for leaf in ('results/ref', 'results/pred')
]

In [3]:
def rouge_eval(ref_dir, dec_dir):
    """Evaluate the files in ref_dir and dec_dir with pyrouge.

    Args:
      ref_dir: directory of reference summaries, matched by the pattern
        '#ID#_reference.txt'.
      dec_dir: directory of system (decoded) summaries, matched by the pattern
        '(\\d+)_decoded.txt'; the captured digits are the ID used to pair a
        decoded file with its reference.

    Returns:
      results_dict: the dictionary produced by pyrouge's output_to_dict from
      the evaluation output.
    """
    r = pyrouge.Rouge155()
    r.model_filename_pattern = '#ID#_reference.txt'
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    # and triggers a DeprecationWarning/SyntaxWarning on newer Python versions.
    r.system_filename_pattern = r'(\d+)_decoded.txt'
    r.model_dir = ref_dir
    r.system_dir = dec_dir
    # NOTE(review): the level is set after Rouge155() is constructed, as in the
    # original; presumably pyrouge configures this logger on construction.
    logging.getLogger('global').setLevel(logging.WARNING)  # silence pyrouge logging
    rouge_results = r.convert_and_evaluate()
    return r.output_to_dict(rouge_results)


def rouge_log(results_dict, dir_to_write):
    """Log ROUGE results to screen and write them to ROUGE_results.txt.

    For each of ROUGE-1, ROUGE-2 and ROUGE-l, the f-score, recall and
    precision are reported together with the confidence-interval bounds
    stored under the matching "_cb" and "_ce" keys.

    Args:
      results_dict: the dictionary returned by pyrouge; must contain
        "rouge_{1,2,l}_{f_score,recall,precision}" and, for each of those,
        the "<key>_cb" and "<key>_ce" entries.
      dir_to_write: the directory where we will write the results to; the
        file is created as <dir_to_write>/ROUGE_results.txt.

    Raises:
      KeyError: if an expected key is missing from results_dict.
    """
    log_str = ""
    for x in ["1", "2", "l"]:
        log_str += "\nROUGE-%s:\n" % x
        for y in ["f_score", "recall", "precision"]:
            key = "rouge_%s_%s" % (x, y)
            val = results_dict[key]
            val_cb = results_dict[key + "_cb"]  # lower bound of the interval
            val_ce = results_dict[key + "_ce"]  # upper bound of the interval
            log_str += "%s: %.4f with confidence interval (%.4f, %.4f)\n" % (key, val, val_cb, val_ce)
    print(log_str)  # log to screen
    results_file = os.path.join(dir_to_write, "ROUGE_results.txt")
    # Interpolate the path with %: passing it as a second print() argument
    # (logging-style) left a literal "%s" in the message.
    print("Writing final ROUGE results to %s..." % results_file)
    with open(results_file, "w", encoding='utf-8') as f:
        f.write(log_str)

In [6]:
# Score the decoded summaries against the references, then print the report
# and save it next to the predictions.
results_dict = rouge_eval(ref_dir=ref_dir, dec_dir=decode_dir)
rouge_log(results_dict=results_dict, dir_to_write=decode_dir)


ROUGE-1:
rouge_1_f_score: 0.3113 with confidence interval (0.3065, 0.3158)
rouge_1_recall: 0.2722 with confidence interval (0.2677, 0.2765)
rouge_1_precision: 0.3965 with confidence interval (0.3900, 0.4030)

ROUGE-2:
rouge_2_f_score: 0.1376 with confidence interval (0.1335, 0.1415)
rouge_2_recall: 0.1191 with confidence interval (0.1155, 0.1228)
rouge_2_precision: 0.1784 with confidence interval (0.1731, 0.1839)

ROUGE-l:
rouge_l_f_score: 0.2896 with confidence interval (0.2851, 0.2940)
rouge_l_recall: 0.2530 with confidence interval (0.2488, 0.2573)
rouge_l_precision: 0.3692 with confidence interval (0.3631, 0.3754)

Writing final ROUGE results to %s... ../data/summarize_bert_baseline-cnn_dm-11-21-1/results/pred\ROUGE_results.txt
