In [4]:
%run rouge.ipynb

In [3]:
from rouge import Rouge as RougeComputer

In [2]:
def greedy_label_generator(documents_folder, summary_folder, output_dump, max_sentences=20):
    '''
    This code is a simpler version of actual greedy summary generator
    TODO: code up actual greedy label generator

    For every file in documents_folder (file names must parse as integers and
    are processed in numeric order) the first max_sentences lines are visited
    once, in order. A line is kept when adding it to the lines kept so far
    raises the mean of the ROUGE-1, ROUGE-2 and ROUGE-L F scores against the
    file of the same name in summary_folder.

    documents_folder: folder with one document per file, one sentence per line
    summary_folder: folder with the reference summaries, same file names
    output_dump: path of the pickle file that receives the result
    max_sentences: number of leading document lines that are considered

    Writes a list (one entry per document, in processing order) of lists of
    selected line indices to output_dump; returns nothing.
    '''
    rouge = RougeComputer()
    output = []
    filenames = sorted(os.listdir(documents_folder), key=lambda x: int(x))
    for filename in filenames:
        # Context managers so the handles are closed even if scoring fails
        with open(os.path.join(summary_folder, filename)) as summary_file:
            summary = summary_file.read().replace('\n', ' \n')
        with open(os.path.join(documents_folder, filename)) as document_file:
            doc_lines = document_file.readlines()

        selected_lines = []
        prev_score = 0
        # Text of the lines accepted so far, each followed by a space. It is
        # extended only on acceptance instead of being rebuilt every iteration.
        prev_summary = ""
        for no, line in enumerate(doc_lines[:max_sentences]):
            selected_summary = prev_summary + ' ' + line
            scores = rouge.get_scores(selected_summary, summary, avg=True)
            avg_score = (scores['rouge-1']['f'] + scores['rouge-2']['f'] + scores['rouge-l']['f'])/3

            # Keep the line only if it strictly improves the running score
            if avg_score > prev_score:
                selected_lines.append(no)
                prev_summary += line + ' '
                prev_score = avg_score
        output.append(selected_lines)
        print(filename, end='\r')

    with open(output_dump, 'wb') as dump_handle:
        pickle.dump(output, dump_handle)

In [158]:
def get_top_k_sentences(document, summary, max_select, f1=True, max_doc_len=90):
    '''
    Scores every sentence of the document on its own against the reference
    summary and returns the indices of the best-scoring ones.

    document & summary: array of array of words
    max_select: upper slice bound applied to the ranked sentences
    f1: when True a sentence is scored by the mean of its ROUGE-1, ROUGE-2 and
        ROUGE-L F1 values; when False by the mean of ROUGE-1 recall, ROUGE-2
        recall and the LCS-based F-beta expression computed below (which is
        not a pure recall)
    max_doc_len: only the first max_doc_len sentences are scored

    returns top max_select sentences with highest average rouge scores, as a
    list of indices into document sorted in ascending order; an empty list
    when there is nothing to select
    '''
    candidate_lines = document[:max_doc_len]
    if not candidate_lines:
        # No sentence to rank: return early instead of failing on the
        # unpacking of an empty ranking further down.
        return []

    # Flatten the reference into one token list, with a 0 appended after
    # every sentence.
    gold = []
    for line in summary:
        gold += line
        gold.append(0)

    # Only the 1-gram and 2-gram sets are used for scoring
    gold_1gram, gold_2gram, _, _ = _get_ngram_sets(gold)

    eps = 10e-10  # keeps the F1 denominators away from zero
    scores = []
    for line in candidate_lines:
        cand_1gram, cand_2gram, _, _ = _get_ngram_sets(line)

        overlap_1 = float(len(gold_1gram.intersection(cand_1gram)))
        rouge_recall_1 = overlap_1/float(len(gold_1gram)) if len(gold_1gram) != 0 else 0
        rouge_precision_1 = overlap_1/float(len(cand_1gram)) if len(cand_1gram) != 0 else 0
        rouge_f1_1 = 2 * rouge_recall_1 * rouge_precision_1 / (rouge_recall_1 + rouge_precision_1 + eps)

        overlap_2 = float(len(gold_2gram.intersection(cand_2gram)))
        rouge_recall_2 = overlap_2/float(len(gold_2gram)) if len(gold_2gram) != 0 else 0
        rouge_precision_2 = overlap_2/float(len(cand_2gram)) if len(cand_2gram) != 0 else 0
        rouge_f1_2 = 2 * rouge_recall_2 * rouge_precision_2 / (rouge_recall_2 + rouge_precision_2 + eps)

        # LCS length normalised by the sentence length and by the reference
        # length. Both formulas below are algebraically symmetric in the two
        # ratios, so it does not matter which one is labelled recall.
        len_lcs = _get_lcs(line, gold)
        r = 0 if (len_lcs == 0) else (float(len_lcs)/len(line))
        p = 0 if (len_lcs == 0) else (float(len_lcs)/len(gold))
        b = 0 if (r == 0) else (p / r)
        # F-beta style combination with beta = p / r; used in the f1=False average
        rouge_l_fbeta = 0 if (len_lcs == 0) else (((1+(b*b))*r*p)/(r+(b*b*p)))

        rouge_f1_l = 2 * r * p / (r + p + eps)

        if f1:
            scores.append((rouge_f1_1 + rouge_f1_2 + rouge_f1_l)/3.0)
        else:
            scores.append((rouge_recall_1+rouge_recall_2+rouge_l_fbeta)/3.0)

    # Stable sort: equally scored sentences keep their document order
    ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[0:max_select]
    if not ranked:
        # e.g. max_select == 0
        return []

    return sorted(index for index, _ in ranked)

In [165]:
def get_candidate_summaries(combinations, document, summary, lines_selected, max_sum_len=3, max_sum_select=15, f1=True):
    '''
    returns top max_sum_select summaries based on avg of ROUGE-1, ROUGE-2, ROUGE-L F1 scores

    combinations: table where combinations[n][k] holds tuples of k positions
        drawn from range(n); every position indexes lines_selected
    document & summary: array of array of words
    lines_selected: indices into document of the sentences that may be combined
        (only the first 10 are used)
    max_sum_len: largest number of sentences in a candidate summary
    max_sum_select: upper slice bound applied to the ranked candidates
    f1: when True candidates are scored by the mean of the ROUGE-1, ROUGE-2 and
        ROUGE-L F1 values; when False by the mean of ROUGE-1 recall, ROUGE-2
        recall and the LCS-based F-beta expression computed below

    returns a list of (document line indices, score) tuples, best score
    first; an empty list when there is no candidate to rank
    '''
    # prepare candidate summaries
    length = min(10, len(lines_selected))
    candidates = []
    for i in range(max_sum_len + 1):
        candidates += combinations[length][i]
    if not candidates:
        # Nothing to score: return early instead of failing on the unpacking
        # of an empty ranking further down.
        return []

    # Flatten the reference into one token list, with a 0 appended after
    # every sentence.
    gold = []
    for line in summary:
        gold += line
        gold.append(0)

    # Only the 1-gram and 2-gram sets are used for scoring
    gold_1gram, gold_2gram, _, _ = _get_ngram_sets(gold)

    eps = 10e-10  # keeps the F1 denominators away from zero
    scores = []
    for indices in candidates:
        # Concatenate the chosen sentences the same way the reference is built
        candidate_summary = []
        for index in indices:
            candidate_summary += document[lines_selected[index]]
            candidate_summary.append(0)

        cand_1gram, cand_2gram, _, _ = _get_ngram_sets(candidate_summary)

        overlap_1 = float(len(gold_1gram.intersection(cand_1gram)))
        rouge_recall_1 = overlap_1/float(len(gold_1gram)) if len(gold_1gram) != 0 else 0
        rouge_precision_1 = overlap_1/float(len(cand_1gram)) if len(cand_1gram) != 0 else 0
        rouge_f1_1 = 2 * rouge_recall_1 * rouge_precision_1 / (rouge_recall_1 + rouge_precision_1 + eps)

        overlap_2 = float(len(gold_2gram.intersection(cand_2gram)))
        rouge_recall_2 = overlap_2/float(len(gold_2gram)) if len(gold_2gram) != 0 else 0
        rouge_precision_2 = overlap_2/float(len(cand_2gram)) if len(cand_2gram) != 0 else 0
        rouge_f1_2 = 2 * rouge_recall_2 * rouge_precision_2 / (rouge_recall_2 + rouge_precision_2 + eps)

        # ROUGE-L must compare the candidate summary with the reference. The
        # previous code used the stale loop variable left over from flattening
        # the reference, so every candidate got the same ROUGE-L value.
        len_lcs = _get_lcs(candidate_summary, gold)
        r = 0 if (len_lcs == 0) else (float(len_lcs)/len(candidate_summary))
        p = 0 if (len_lcs == 0) else (float(len_lcs)/len(gold))
        b = 0 if (r == 0) else (p / r)
        # F-beta style combination with beta = p / r; used in the f1=False average
        rouge_l_fbeta = 0 if (len_lcs == 0) else (((1+(b*b))*r*p)/(r+(b*b*p)))

        rouge_f1_l = 2 * r * p / (r + p + eps)

        if f1:
            scores.append((rouge_f1_1 + rouge_f1_2 + rouge_f1_l)/3.0)
        else:
            scores.append((rouge_recall_1+rouge_recall_2+rouge_l_fbeta)/3.0)

    # Stable sort: equally scored candidates keep their enumeration order
    ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)[0:max_sum_select]
    if not ranked:
        # e.g. max_sum_select == 0
        return []

    # Translate positions within lines_selected back to document line indices
    return [([lines_selected[i] for i in candidates[position]], score)
            for position, score in ranked]

In [179]:
def top_k_summaries_folder(doc_folder, sum_folder, max_sum_len=4,
                                max_sum_select=15, f1=True, dump_file=None):
    '''
    calls get_top_k_sentences and get_candidate_summaries for 
    each document in doc_folder.

    doc_folder: folder with one document per file, one sentence per line;
        file names must parse as integers and are processed in numeric order
    sum_folder: folder with the reference summaries, same file names
    max_sum_len: largest number of sentences in a candidate summary
    max_sum_select: number of candidate summaries kept per document
    f1: forwarded to both scoring functions
    dump_file: when given, the per-document results are pickled to this path

    NOTE: max_sum_len, max_sum_select and f1 used to be ignored (the call was
    hard-coded to 3 and 10); they are now honoured, so the defaults in the
    signature take effect.
    '''
    scores = []

    # Number of top sentences kept per document, and therefore the largest
    # pool size the combinations table has to cover.
    max_len = 10

    # combinations[i][j]: every way of picking j positions out of range(i)
    combinations = [
        [list(itertools.combinations(range(i), j)) for j in range(max_sum_len + 1)]
        for i in range(max_len + 1)
    ]

    files = sorted(os.listdir(doc_folder), key=lambda x: int(x))
    for file in files:
        # Lower-cased tokens, split on single spaces, one list per line
        with open(os.path.join(doc_folder, file)) as f:
            document = [[a.lower() for a in line.split(' ')] for line in f.read().split('\n')]
        with open(os.path.join(sum_folder, file)) as f:
            summary =  [[a.lower() for a in line.split(' ')] for line in f.read().split('\n')]

        print(file, ' '*10, end='\r')

        top_lines = get_top_k_sentences(document, summary, max_len, f1=f1)
        scores.append(get_candidate_summaries(combinations, document, summary,
                                              top_lines, max_sum_len,
                                              max_sum_select, f1))

    if dump_file:
        # Context manager so the pickle is flushed and the handle closed
        with open(dump_file, 'wb') as dump_handle:
            pickle.dump(scores, dump_handle)
    print()