# Imports

In [1]:
import os
import re
import json
from pathlib import Path
from pythonrouge.pythonrouge import Pythonrouge
from qbsum.corpus import Corpus
from qbsum.summarizers import MMR

# Setup

Define directory structure:

In [2]:
# Read the description of the directory layout from file:
with open('directories.json') as f:
    directories = json.load(f)

# All test directories are given relative to the current working directory.
baseDir = Path.cwd()
testDirs = directories['test']
documentsDir = baseDir / testDirs['documents']
queriesDir = baseDir / testDirs['queries']
referencesDir = baseDir / testDirs['references']
summariesDir = baseDir / testDirs['summaries']

# Create the output directories if needed. `parents=True` also creates any
# missing intermediate directory (plain os.mkdir raised FileNotFoundError in
# that case) and `exist_ok=True` replaces the separate is_dir() check, so
# there is no window between checking and creating.
summariesDir.mkdir(parents=True, exist_ok=True)

mmrDir = summariesDir / 'MMR'
mmrDir.mkdir(parents=True, exist_ok=True)

Load the test corpus:

In [3]:
# Build the test corpus from the documents, queries and references directories:
corpus = Corpus(
    documentsDir,
    queriesDir,
    referencesDir,
)

# Tests

Initialize summarizer:

In [4]:
# List that collects the generated summaries, one entry per document set:
candidates = list()
# Summarizer under test, constructed without arguments:
mmr = MMR()

Generate summaries:

In [5]:
# Start from an empty list so that re-running this cell does not append
# duplicate summaries to the candidates that are evaluated later on:
candidates = []

# For each document set in the corpus:
for i, query in enumerate(corpus.queries):

    # The corpus attributes are parallel lists indexed by document set.
    documentSetName = corpus.documentSetNames[i]
    documents = corpus.documents[i]

    # Create a summary. The summarizer also receives the path of a per-set
    # text file under the MMR directory (presumably where the summary is
    # written -- confirm against MMR.summarize).
    # NOTE(review): nbWords=10 equals 'length' in rouge_args.json; presumably
    # the two should be kept in sync -- confirm.
    summaryFile = documentSetName + '.txt'
    summary = mmr.summarize(documents,
                            query,
                            mmrDir / summaryFile,
                            nbWords=10,
                            lda=0.9)
    candidates.append(summary)

    # Show the query, the start of each document, the generated summary and
    # the reference side by side:
    print("\nDOC SET {}".format(documentSetName))
    print("\tQUERY\t", query)
    for document in documents:
        print("\tDOC\t", document.text[:90], "...")
    print("\tSUM\t", summary)
    print("\tREF\t", corpus.references[i])


DOC SET 01
	QUERY	 Why did Tesla share prices fell?
	DOC	 What is Tesla without Elon Musk? That was the question Wall Street was asking on Friday as ...
	DOC	 Elon Musk is to step down as chair of Tesla for three years and pay a fine after reaching  ...
	SUM	 ['Last Friday Tesla’s share price was down close to 14% as investors lost confidence.']
	REF	 [['Tesla share prices fell following a SEC suit accusing Musk of fraud.']]

DOC SET 02
	QUERY	 Why did Facebook share prices fell?
	DOC	 Time for a recap: Around $36bn has been wiped off Facebook’s market capitalisation, after  ...
	DOC	 Politicians and regulators won’t be the only people who want a word with Mark Zuckerberg.  ...
	SUM	 ['In the circumstances, the severe reaction in the share price is not a surprise.']
	REF	 [["Facebook's shares fell following the data breach involving Cambridge Analytica."]]


# Evaluation

Read ROUGE configuration from file:

In [6]:
# Load the ROUGE options from their JSON file. The bare name on the last
# line makes the notebook display the loaded dictionary.
with open('rouge_args.json') as config_file:
    rouge_args = json.loads(config_file.read())
rouge_args

{'summary_file_exist': False,
 'n_gram': 2,
 'ROUGE_SU4': True,
 'ROUGE_L': False,
 'recall_only': False,
 'stemming': True,
 'stopwords': True,
 'word_level': True,
 'length_limit': True,
 'length': 10,
 'use_cf': False,
 'cf': 95,
 'scoring_formula': 'average',
 'resampling': True,
 'samples': 1000,
 'favor': True,
 'p': 0.5}

Configure pythonrouge:

In [7]:
# Names of the options forwarded from rouge_args to Pythonrouge. Listing them
# explicitly means a missing option still raises KeyError and any extra key
# in the configuration file is still ignored.
rougeOptionNames = (
    'summary_file_exist',
    'n_gram',
    'ROUGE_SU4',
    'ROUGE_L',
    'recall_only',
    'stemming',
    'stopwords',
    'word_level',
    'length_limit',
    'length',
    'use_cf',
    'cf',
    'scoring_formula',
    'resampling',
    'samples',
    'favor',
    'p',
)
rougeOptions = {name: rouge_args[name] for name in rougeOptionNames}

# Evaluate the generated summaries against the corpus references:
rouge = Pythonrouge(summary=candidates,
                    reference=corpus.references,
                    **rougeOptions)

Evaluate candidate summaries:

In [8]:
# Compute the ROUGE scores and print one "metric<TAB> value" line per entry:
print("\nEVALUATION")
score = rouge.calc_score()
for metric, value in score.items():
    print("\t" + metric + "\t {}".format(value))


EVALUATION
	ROUGE-1-R	 0.25
	ROUGE-1-F	 0.27693
	ROUGE-2-R	 0.14285
	ROUGE-2-F	 0.15384
	ROUGE-SU4-R	 0.10938
	ROUGE-SU4-F	 0.12519
