In [2]:
import os
import string
import itertools
import _pickle as pickle

In [3]:
from nltk.stem import PorterStemmer
import numpy as np

In [4]:
%run paths.ipynb

In [5]:
def computeRouge(folder1, folder2):
    '''
    Evaluate a folder of system summaries against a folder of model
    (reference) summaries using pyrouge's Rouge155 wrapper.

    folder1: directory assigned to ``Rouge155.system_dir``
    folder2: directory assigned to ``Rouge155.model_dir``

    System files are matched with the regex ``(\\d+)`` and model files with
    the ``#ID#`` pattern (presumably pyrouge substitutes the captured id for
    ``#ID#`` -- confirm against the pyrouge documentation).

    returns: a pair ``(scores, output)`` where ``scores`` is the list
             ``[rouge_1_f_score_ce, rouge_2_f_score_ce, rouge_l_f_score_ce]``
             taken from ``output``, and ``output`` is the full dictionary
             produced by ``Rouge155.output_to_dict``.
    '''
    # ROUGE arguments used previously, kept for reference:
    # rouge_args="-e /home/ramkishore.s/ROUGE-1.5.5/data/ -a -c 95 -m -n 2 -w 1.2"
    from pyrouge import Rouge155
    r = Rouge155()
    r.system_dir = folder1
    r.model_dir = folder2
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    r.system_filename_pattern = r'(\d+)'
    r.model_filename_pattern = '#ID#'

    output = r.convert_and_evaluate()
    output = r.output_to_dict(output)
    scores = [
        output['rouge_1_f_score_ce'],
        output['rouge_2_f_score_ce'],
        output['rouge_l_f_score_ce'],
    ]
    return scores, output

In [6]:
class Rouge:
    '''
    In-Python unigram-overlap (ROUGE-1 style) scorer.

    Only R1 is implemented; R2 and Rl are placeholders that return None.
    '''
    def __init__(self):
        # Cache mapping a raw token to its stem, filled lazily by get_stem.
        self.stemmed_words = {}
        self.stemmer = PorterStemmer().stem

    def get_stem(self, word):
        '''
        Return the stem of `word`, caching the result in self.stemmed_words.
        If the stemmer raises, the word itself is cached and returned.
        '''
        if word not in self.stemmed_words:
            try:
                self.stemmed_words[word] = self.stemmer(word)
            except Exception:
                # Best effort: fall back to the unstemmed token.
                self.stemmed_words[word] = word
        return self.stemmed_words[word]

    def R1(self, pred, act, remove_stop=False, stem=False):
        '''
        Unigram F1 between two token sequences.

        pred: sequence of predicted tokens
        act: sequence of reference tokens
        remove_stop: accepted for interface compatibility; not used
        stem: when True, tokens are stemmed with get_stem before matching

        Tokens that are a single non-alphabetic character are ignored when
        counting matches. Each predicted token can be matched at most as many
        times as it occurs in `pred`.

        returns: the F1 score as a float; 0.0 when either sequence is empty
                 or when nothing matches.
        '''
        # An empty side would otherwise divide by zero below.
        if len(pred) == 0 or len(act) == 0:
            return 0.

        pred_counts = {}
        for word in pred:
            if len(word) > 1 or word.isalpha():
                if stem:
                    word = self.get_stem(word)
                pred_counts[word] = pred_counts.get(word, 0) + 1

        words_matched = 0
        for word in act:
            if stem:
                word = self.get_stem(word)
            if len(word) > 1 or word.isalpha():
                if pred_counts.get(word, 0) > 0:
                    words_matched += 1
                    # Consume the match so repeated words are clipped.
                    pred_counts[word] -= 1

        if words_matched == 0:
            return 0.

        # NOTE(review): the denominators count every token, including the
        # single-character non-alphabetic ones skipped above -- confirm that
        # this is intended.
        precision = words_matched / len(pred)
        recall = words_matched / len(act)
        return 2 * precision * recall / (precision + recall)

    def R2(self, pred, act, remove_stop=False, stem=False):
        '''Not implemented; returns None.'''
        pass

    def Rl(self, pred, act, remove_stop=False, stem=False):
        '''Not implemented; returns None.'''
        pass

    def compute_rouge(self, predicted_summary, actual_summary, r1_=True, r2_=False, rl_=False, remove_stop=False, stem=True, sum_=True):
        '''
        predicted_summary: list of sentences
        actual_summary: list of sentences
        r1_, r2_, rl_: which scores to compute (only r1 is implemented;
                       R2 and Rl return None)
        remove_stop, stem: forwarded to the scoring methods
        sum_: when True return the sum of the requested scores, otherwise
              return them as a list

        Sentences are split on whitespace and flattened into one token list
        per summary; empty summaries yield an empty token list.
        '''
        pred_words = [word for line in predicted_summary for word in line.split()]
        actual_words = [word for line in actual_summary for word in line.split()]

        scores = []
        if r1_: scores.append(self.R1(pred_words, actual_words, remove_stop, stem))
        if r2_: scores.append(self.R2(pred_words, actual_words, remove_stop, stem))
        if rl_: scores.append(self.Rl(pred_words, actual_words, remove_stop, stem))

        if sum_:
            scores = sum(scores)

        return scores

    def eval_folder(self, pred_folder, gold_folder):
        '''
        Average compute_rouge over every file in pred_folder.

        pred_folder: folder of predicted summaries; the first line of each
                     file is skipped
        gold_folder: folder holding a reference summary under the same
                     filename

        Paths are built by plain concatenation, so both folder strings must
        end with a path separator.

        returns: the mean score, or 0.0 when pred_folder is empty.
        '''
        pred_files = os.listdir(pred_folder)
        if not pred_files:
            return 0.

        total = 0
        for filename in pred_files:
            with open(pred_folder + filename) as pred_file:
                pred_summary = pred_file.readlines()[1:]
            with open(gold_folder + filename) as gold_file:
                actual_summary = gold_file.readlines()
            total += self.compute_rouge(pred_summary, actual_summary)
        return total / len(pred_files)

In [19]:
class RougeNeuralSum(Rouge):
    '''
    Class for creating cache data of rouge scores
    '''
    def __init__(self):
        super(RougeNeuralSum, self).__init__()
        # Per document: sorted indices of its best-scoring lines.
        self.lines_selected = []
        # Per document: best (line-index tuple, score) pairs, or None.
        self.summary_scores = []
        self.scores = []

    def computeLineScores(self, document_folder, summary_folder, docs, no_lines):
        '''
        Score every line of each document against its reference summary and
        remember the indices of the `no_lines` best-scoring lines.

        document_folder, summary_folder: folder strings; filenames are
            appended by plain concatenation, so they must end with a path
            separator
        docs: filenames present in both folders
        no_lines: number of top-scoring lines to keep per document

        Only the first CONFIG.MAX_DOC_LEN lines of a document are scored.
        The result is stored in self.lines_selected, one ascending list of
        line indices per entry of `docs` (empty for an empty document).
        '''
        self.lines_selected = []
        for doc_idx, filename in enumerate(docs):
            print(str(doc_idx) + '    ', end='\r')
            with open(summary_folder + filename) as summary_file:
                summary = summary_file.readlines()
            with open(document_folder + filename) as document_file:
                lines = document_file.readlines()[0:CONFIG.MAX_DOC_LEN]

            scores = [self.compute_rouge([line], summary) for line in lines]
            # Stable sort: ties keep their original (document) order.
            ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
            self.lines_selected.append(sorted(ranked[0:no_lines]))

    def computeRefreshScores(self, document_folder, summary_folder,  no_lines=10, no_summaries=10, len_summary=3):
        '''
        For every document, score each `len_summary`-line combination of its
        `no_lines` best lines and keep the `no_summaries` best combinations.

        output: list of lists stored in self.summary_scores
        dim[0] = document id (int(filename); filenames must therefore be
                 integers in range(number of documents))
        dim[1] = (sentence numbers in a tuple, corresponding score),
                 best score first

        A document whose processing fails gets None instead of a list.
        '''
        docs = sorted(os.listdir(document_folder), key=int)
        self.summary_scores = [None for _ in range(len(docs))]
        self.computeLineScores(document_folder, summary_folder, docs, no_lines)

        for doc_idx, (filename, lines_sel) in enumerate(zip(docs, self.lines_selected)):
            print(str(doc_idx) + '     ', end='\r')
            try:
                with open(document_folder + filename) as document_file:
                    lines = document_file.readlines()
                with open(summary_folder + filename) as summary_file:
                    actual_summary = summary_file.readlines()

                scored_summaries = [
                    (psummary, self.compute_rouge([lines[i] for i in psummary], actual_summary))
                    for psummary in itertools.combinations(lines_sel, len_summary)
                ]
                summaries_selected = sorted(scored_summaries, key=lambda x: x[1], reverse=True)[0:no_summaries]
                self.summary_scores[int(filename)] = summaries_selected
            except Exception:
                # Best effort: mark the document as failed and continue,
                # while still letting KeyboardInterrupt stop the run.
                self.summary_scores[int(filename)] = None

    def dump(self, filename):
        '''Pickle self.summary_scores to `filename`.'''
        with open(filename, 'wb+') as out_file:
            pickle.dump(self.summary_scores, out_file)

    def load(self, filename):
        '''
        Replace self.summary_scores with the object pickled in `filename`.
        '''
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that come from a trusted source.
        with open(filename, 'rb') as in_file:
            self.summary_scores = pickle.load(in_file)