In [1]:
from subprocess import call
from glob import glob
from nltk.corpus import stopwords
import os, struct
from tensorflow.core.example import example_pb2
import pyrouge
import shutil
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import *

In [2]:
# cmd = '/root/miniconda2/bin/python run_summarization.py --mode=decode --single_pass=1 --coverage=True --vocab_path=finished_files/vocab --log_root=log --exp_name=myexperiment --data_path=test/temp_file --max_enc_steps=4000'
# generated_path = '/gttp/pointer-generator-tal/log/myexperiment/decode_test_4000maxenc_4beam_35mindec_100maxdec_ckpt-238410/'
# cmd = cmd.split()

vocab_path = '../data/DMQA/finished_files/vocab'
log_root = 'log'
exp_name = 'myexperiment'
data_path= 'test/temp_file'
max_enc_steps = 4000

cmd = ['python',
       'run_summarization.py',
       '--mode=decode',
       '--single_pass=1',
       '--coverage=True',
       '--vocab_path=' + vocab_path,
       '--log_root=' + log_root,
       '--exp_name=' + exp_name,
       '--data_path=' + data_path,
       '--max_enc_steps=' + str(max_enc_steps)]

generated_path = 'log/myexperiment/decode_test_4000maxenc_4beam_35mindec_100maxdec_ckpt-238410/'
max_len = 250
stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [3]:
def pp(string):
    return ' '.join([stemmer.stem(word.decode('utf8')) for word in string.lower().split() if not word in stopwords])
    
def write_to_file(article, abstract, rel, writer):
    abstract = '<s> '+' '.join(abstract)+' </s>'
    #abstract = abstract.encode('utf8', 'ignore')
    #rel = rel.encode('utf8', 'ignore')
    #article = article.encode('utf8', 'ignore')
    tf_example = example_pb2.Example()
    tf_example.features.feature['abstract'].bytes_list.value.extend([bytes(abstract)])
    tf_example.features.feature['relevancy'].bytes_list.value.extend([bytes(rel)])
    tf_example.features.feature['article'].bytes_list.value.extend([bytes(article)])
    tf_example_str = tf_example.SerializeToString()
    str_len = len(tf_example_str)
    writer.write(struct.pack('q', str_len))
    writer.write(struct.pack('%ds' % str_len, tf_example_str))


def duck_iterator(i):
    duc_folder = 'duc0' + str(i) + 'tokenized/'
    for topic in os.listdir(duc_folder + 'testdata/docs/'):
        topic_folder = duc_folder + 'testdata/docs/' + topic
        if not os.path.isdir(topic_folder):
            continue
        query = ' '.join(open(duc_folder + 'queries/' + topic).readlines())
        model_files = glob(duc_folder + 'models/' + topic[:-1].upper() + '.*')

        topic_texts = [' '.join(open(topic_folder + '/' + file).readlines()).replace('\n', '') for file in
                       os.listdir(topic_folder)]

        abstracts = [' '.join(open(f).readlines()) for f in model_files]
        yield topic_texts, abstracts, query


def count_score(sent, ref):
    ref = pp(ref).split()
    sent = ' '.join(pp(w) for w in sent.lower().split() if not w in stopwords)
    return sum([1. if w in ref else 0. for w in sent.split()])


def get_tfidf_score_func(magic = 1):
    corpus = []
    for i in range(5, 8):
        for topic_texts, _, _ in duck_iterator(i):
            corpus += [pp(t) for t in topic_texts]

    vectorizer = TfidfVectorizer()
    vectorizer.fit_transform(corpus)

    def tfidf_score_func(sent, ref):
        ref = [pp(s) for s in ref.split(' . ')]
        sent = pp(sent)
        v1 = vectorizer.transform([sent])
        v2s = [vectorizer.transform([r]) for r in ref]
        return max([cosine_similarity(v1, v2)[0][0] for v2 in v2s])
    return tfidf_score_func

In [4]:
class Summary:
    def __init__(self, texts, abstracts, query):
        
        #texts = sorted([(tfidf_score(query, text), text) for text in texts], reverse=True)
        texts = sorted([(count_score(text, ' '.join(abstracts)), text) for text in texts], reverse=True)

        texts = [text[1] for text in texts]
        self.texts = texts
        self.abstracts = abstracts
        self.query = query
        self.summary = []
        self.words = set()
        self.length = 0

    def most_similar(self, sent, text):
        return max([(count_score(s, sent), s) for s in text])[1]

    def add_sum(self, summ):
        text = self.texts.pop(0).split(' . ')
        if len(self.texts) == 0: return True
        found_sents = []
        for sent in summ:
            ms = self.most_similar(sent, text)
            if ms in found_sents:
                continue
            found_sents.append(sent)
            splitted = pp(sent).split()
            length = len(splitted) 
            splitted = set(splitted)       
            if self.length+length > max_len: return True
            if len(splitted - self.words) < int(len(splitted)*0.5): return False
            self.words |= splitted     
            self.summary.append(sent)
            self.length +=length
        return False

    def get(self):
        text = self.texts[0]
        sents = text.split(' . ')
        #score_per_sent = [(count_score(sent, self.query), sent) for sent in sents]
        score_per_sent = [(count_score(sent, ' '.join(self.abstracts)), sent) for sent in sents]

        scores = []
        for score, sent in score_per_sent:
            scores += [score] * (len(sent.split()) + 1)
        scores = str(scores[:-1])
        return text, 'a', scores

def get_summaries(path):
    path = path+'decoded/'
    out = {}
    for file_name in os.listdir(path):
        index = int(file_name.split('_')[0])
        out[index] = open(path+file_name).readlines()
    return out

def rouge_eval(ref_dir, dec_dir):
    """Evaluate the files in ref_dir and dec_dir with pyrouge, returning results_dict"""
    r = pyrouge.Rouge155()
    r.model_filename_pattern = '#ID#_reference_(\d+).txt'
    r.system_filename_pattern = '(\d+)_decoded.txt'
    r.model_dir = ref_dir
    r.system_dir = dec_dir
    return r.convert_and_evaluate()

def evaluate(summaries):
    for path in ['eval/ref', 'eval/dec']:
        if os.path.exists(path): shutil.rmtree(path, True)
        os.mkdir(path)
    for i, summ in enumerate(summaries):
        for j,abs in enumerate(summ.abstracts):
            with open('eval/ref/'+str(i)+'_reference_'+str(j)+'.txt', 'w') as f:
                f.write(abs)
        with open('eval/dec/'+str(i)+'_decoded.txt', 'w') as f:
            f.write(' '.join(summ.summary))
    print rouge_eval('eval/ref/', 'eval/dec/') 

In [5]:
for i in [6]:
    duc_num = i
    done_summaries = []
    summaries = [Summary(texts, abstracts, query) for texts, abstracts, query in duck_iterator(i)]
    while summaries:
        with open('test/temp_file', 'wb') as writer:
            for summ in summaries:
                article, abstract, scores = summ.get()
                write_to_file(article, abstracts, scores, writer)
        call(['rm', '-r', generated_path])     
        call(cmd)
        generated_summaries = get_summaries(generated_path)
        should_remove = []
        for i in range(len(summaries)):
            if summaries[i].add_sum(generated_summaries[i]):
                should_remove.append(i)
        for i in should_remove[::-1]: 
            done_summaries.append(summaries.pop(i))
    evaluate(done_summaries)
    print duc_num

2019-02-12 16:34:41,295 [MainThread  ] [INFO ]  Writing summaries.
2019-02-12 16:34:41,298 [MainThread  ] [INFO ]  Processing summaries. Saving system files to /tmp/tmpUu7EMI/system and model files to /tmp/tmpUu7EMI/model.
2019-02-12 16:34:41,300 [MainThread  ] [INFO ]  Processing files in eval/dec/.
2019-02-12 16:34:41,301 [MainThread  ] [INFO ]  Processing 0_decoded.txt.
2019-02-12 16:34:41,303 [MainThread  ] [INFO ]  Processing 1_decoded.txt.
2019-02-12 16:34:41,305 [MainThread  ] [INFO ]  Processing 2_decoded.txt.
2019-02-12 16:34:41,308 [MainThread  ] [INFO ]  Processing 3_decoded.txt.
2019-02-12 16:34:41,310 [MainThread  ] [INFO ]  Processing 4_decoded.txt.
2019-02-12 16:34:41,312 [MainThread  ] [INFO ]  Processing 5_decoded.txt.
2019-02-12 16:34:41,314 [MainThread  ] [INFO ]  Processing 6_decoded.txt.
2019-02-12 16:34:41,317 [MainThread  ] [INFO ]  Processing 7_decoded.txt.
2019-02-12 16:34:41,319 [MainThread  ] [INFO ]  Processing 8_decoded.txt.
2019-02-12 16:34:41,321 [MainThr

2019-02-12 16:34:41,522 [MainThread  ] [INFO ]  Processing 12_reference_3.txt.
2019-02-12 16:34:41,525 [MainThread  ] [INFO ]  Processing 13_reference_0.txt.
2019-02-12 16:34:41,527 [MainThread  ] [INFO ]  Processing 13_reference_1.txt.
2019-02-12 16:34:41,529 [MainThread  ] [INFO ]  Processing 13_reference_2.txt.
2019-02-12 16:34:41,531 [MainThread  ] [INFO ]  Processing 13_reference_3.txt.
2019-02-12 16:34:41,533 [MainThread  ] [INFO ]  Processing 14_reference_0.txt.
2019-02-12 16:34:41,535 [MainThread  ] [INFO ]  Processing 14_reference_1.txt.
2019-02-12 16:34:41,538 [MainThread  ] [INFO ]  Processing 14_reference_2.txt.
2019-02-12 16:34:41,540 [MainThread  ] [INFO ]  Processing 14_reference_3.txt.
2019-02-12 16:34:41,542 [MainThread  ] [INFO ]  Processing 15_reference_0.txt.
2019-02-12 16:34:41,544 [MainThread  ] [INFO ]  Processing 15_reference_1.txt.
2019-02-12 16:34:41,547 [MainThread  ] [INFO ]  Processing 15_reference_2.txt.
2019-02-12 16:34:41,549 [MainThread  ] [INFO ]  Proc

2019-02-12 16:34:41,751 [MainThread  ] [INFO ]  Processing 38_reference_3.txt.
2019-02-12 16:34:41,753 [MainThread  ] [INFO ]  Processing 39_reference_0.txt.
2019-02-12 16:34:41,756 [MainThread  ] [INFO ]  Processing 39_reference_1.txt.
2019-02-12 16:34:41,758 [MainThread  ] [INFO ]  Processing 39_reference_2.txt.
2019-02-12 16:34:41,760 [MainThread  ] [INFO ]  Processing 39_reference_3.txt.
2019-02-12 16:34:41,762 [MainThread  ] [INFO ]  Processing 40_reference_0.txt.
2019-02-12 16:34:41,764 [MainThread  ] [INFO ]  Processing 40_reference_1.txt.
2019-02-12 16:34:41,766 [MainThread  ] [INFO ]  Processing 40_reference_2.txt.
2019-02-12 16:34:41,768 [MainThread  ] [INFO ]  Processing 40_reference_3.txt.
2019-02-12 16:34:41,770 [MainThread  ] [INFO ]  Processing 41_reference_0.txt.
2019-02-12 16:34:41,773 [MainThread  ] [INFO ]  Processing 41_reference_1.txt.
2019-02-12 16:34:41,775 [MainThread  ] [INFO ]  Processing 41_reference_2.txt.
2019-02-12 16:34:41,777 [MainThread  ] [INFO ]  Proc

---------------------------------------------
1 ROUGE-1 Average_R: 0.44152 (95%-conf.int. 0.42883 - 0.45341)
1 ROUGE-1 Average_P: 0.36064 (95%-conf.int. 0.34958 - 0.37110)
1 ROUGE-1 Average_F: 0.39650 (95%-conf.int. 0.38549 - 0.40682)
---------------------------------------------
1 ROUGE-2 Average_R: 0.09449 (95%-conf.int. 0.08680 - 0.10171)
1 ROUGE-2 Average_P: 0.07703 (95%-conf.int. 0.07107 - 0.08276)
1 ROUGE-2 Average_F: 0.08476 (95%-conf.int. 0.07824 - 0.09099)
---------------------------------------------
1 ROUGE-3 Average_R: 0.03259 (95%-conf.int. 0.02835 - 0.03680)
1 ROUGE-3 Average_P: 0.02649 (95%-conf.int. 0.02314 - 0.02976)
1 ROUGE-3 Average_F: 0.02919 (95%-conf.int. 0.02543 - 0.03293)
---------------------------------------------
1 ROUGE-4 Average_R: 0.01673 (95%-conf.int. 0.01388 - 0.01976)
1 ROUGE-4 Average_P: 0.01357 (95%-conf.int. 0.01131 - 0.01608)
1 ROUGE-4 Average_F: 0.01497 (95%-conf.int. 0.01243 - 0.01769)
---------------------------------------------
1 ROUGE-L Aver