In [1]:
import os
import sys
from bleu.bleu import Bleu
from meteor.meteor import Meteor
from rouge.rouge import Rouge
from collections import defaultdict
from argparse import ArgumentParser
import json
from json import encoder

In [2]:
# Inputs consumed by main(); every file holds one example per line:
#   src_file - source lines (main() lower-cases them and uses them as keys)
#   tgt_file - gold questions
#   out_file - generated questions
# Locations used previously:
#   "../results/squad_sentences.txt"
#   "../results/squad_gold_questions.txt"
#   "../results/squad_generated_questions.txt"

# Directory that the evaluation cells join their file names onto.
path = "cleaned_results/"

# Placeholder defaults; each evaluation cell reassigns all three names.
src_file = tgt_file = out_file = "tgt-dev.txt"

In [3]:
class QGEvalCap:
    """Run the BLEU and ROUGE-L scorers over a set of references and predictions.

    ``gts`` and ``res`` are stored as given and passed unchanged to each
    scorer's ``compute_score(gts, res)``.
    """

    def __init__(self, gts, res):
        self.gts = gts
        self.res = res

    def evaluate(self):
        """Print every metric as ``name: value`` and return the values.

        Returns:
            list: the printed scores in print order
            (Bleu_1, Bleu_2, Bleu_3, Bleu_4, ROUGE_L).
        """
        metric_suite = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            # (Meteor(), "METEOR"),  # currently disabled
            (Rouge(), "ROUGE_L"),
        ]

        collected = []
        for scorer, label in metric_suite:
            overall, detail = scorer.compute_score(self.gts, self.res)
            if isinstance(label, list):
                # Several labels (BLEU): `overall` carries one value per label.
                # `detail` only takes part in the zip, so it can shorten the
                # iteration but is otherwise unused.
                for value, _, name in zip(overall, detail, label):
                    print("%s: %0.5f" % (name, value))
                    collected.append(value)
            else:
                # Single label: `overall` is the one value to report.
                print("%s: %0.5f" % (label, overall))
                collected.append(overall)
        return collected

In [4]:
def _read_lines(file_path):
    """Return the lines of a UTF-8 text file with only the line terminator removed."""
    with open(file_path, 'r', encoding="utf-8") as infile:
        # rstrip('\n') rather than line[:-1]: a final line that lacks a
        # trailing newline must keep its last character.
        return [line.rstrip('\n') for line in infile]


def main(src_file, tgt_file, out_file):
    """Score generated questions against gold questions and print the metrics.

    The three files are read in parallel, one example per line. Each source
    line is lower-cased and used as the example key. When a key occurs more
    than once, its gold questions accumulate as multiple references while the
    prediction kept is the one from the last occurrence.

    Args:
        src_file: path to the source lines (used as keys).
        tgt_file: path to the gold questions.
        out_file: path to the generated questions.

    Raises:
        ValueError: if ``tgt_file`` does not have exactly as many lines as
            ``src_file``, or ``out_file`` has fewer lines than ``src_file``.

    Returns:
        None. The scores are printed by ``QGEvalCap.evaluate``.
    """
    sources = _read_lines(src_file)
    questions = _read_lines(tgt_file)
    predictions = _read_lines(out_file)

    # Misaligned files previously failed with an opaque IndexError/KeyError.
    if len(questions) != len(sources):
        raise ValueError(
            "line count mismatch: %r has %d lines but %r has %d"
            % (src_file, len(sources), tgt_file, len(questions))
        )
    if len(predictions) < len(sources):
        raise ValueError(
            "line count mismatch: %r has %d lines but %r has only %d"
            % (src_file, len(sources), out_file, len(predictions))
        )

    # Module-level override kept from the original; nothing in this function
    # serializes JSON.
    encoder.FLOAT_REPR = lambda o: format(o, '.4f')

    res = defaultdict(list)
    gts = defaultdict(list)
    # zip stops at len(sources); surplus prediction lines are ignored, as before.
    for source, question, prediction in zip(sources, questions, predictions):
        key = source.lower()
        # Values are handed to the scorers as lower-cased UTF-8 bytes.
        res[key] = [prediction.encode('utf-8').lower()]
        gts[key].append(question.encode('utf-8').lower())

    QGEval = QGEvalCap(gts, res)
    QGEval.evaluate()



## Comparing generated questions for sentences that have multiple answers

In [48]:
# Set 1 serves as both the key source and the reference; set 2 is scored against it.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "remain_multiple_set_1.txt",
        "remain_multiple_set_1.txt",
        "remain_multiple_set_2.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.63914
Bleu_2: 0.53453
Bleu_3: 0.47332
Bleu_4: 0.42412
ROUGE_L: 0.62854


# SQuAD Dataset

## Comparing one-to-one (sentence, question) with gold standard

In [42]:
# One-to-one SQuAD files: contexts as keys, gold vs. generated questions.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "sq_squad_one_to_one_contexts.txt",
        "sq_squad_one_to_one_gold.txt",
        "sq_squad_one_to_one_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.37537
Bleu_2: 0.20901
Bleu_3: 0.13656
Bleu_4: 0.09469
ROUGE_L: 0.35613


## Comparing one-to-many (sentence, question)  with selected gold standard

In [50]:
# One-to-many SQuAD files: contexts as keys, gold vs. generated questions.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "sq_squad_one_to_many_contexts.txt",
        "sq_squad_one_to_many_gold.txt",
        "sq_squad_one_to_many_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.36160
Bleu_2: 0.19156
Bleu_3: 0.12215
Bleu_4: 0.08395
ROUGE_L: 0.33543


In [52]:
# "one_to_many_multiple" SQuAD files: contexts as keys, gold vs. generated questions.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "sq_squad_one_to_many_multiple_contexts.txt",
        "sq_squad_one_to_many_multiple_gold.txt",
        "sq_squad_one_to_many_multiple_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.36712
Bleu_2: 0.19933
Bleu_3: 0.12867
Bleu_4: 0.08800
ROUGE_L: 0.34291


## Comparing all (sentence, question)  with selected gold standard (Du et al. 2017)

In [54]:
# Full SQuAD set: the gold file is passed as both the key source and the reference.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "sq_squad_all_gold.txt",
        "sq_squad_all_gold.txt",
        "sq_squad_all_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.36952
Bleu_2: 0.20219
Bleu_3: 0.13096
Bleu_4: 0.08991
ROUGE_L: 0.34651


## Comparing (sentence, answer, question) input with gold standard

In [55]:
# (sentence, answer, question) run: the index file supplies the keys.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "sqa_index.txt",
        "sqa_gold.txt",
        "sqa_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.34180
Bleu_2: 0.22427
Bleu_3: 0.15740
Bleu_4: 0.11352
ROUGE_L: 0.31966


# Narrative QA

In [38]:
# NarrativeQA run: the indices file supplies the keys.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "nqa_indices.txt",
        "nqa_gold.txt",
        "nqa_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.30177
Bleu_2: 0.19354
Bleu_3: 0.13226
Bleu_4: 0.09305
ROUGE_L: 0.31339


## Narrative QA with modified Data Loader

In [5]:
# "nqa_sanat" files: the index file supplies the keys.
src_file, tgt_file, out_file = [
    os.path.join(path, filename)
    for filename in (
        "nqa_sanat_index.txt",
        "nqa_sanat_gold.txt",
        "nqa_sanat_generated.txt",
    )
]

main(src_file, tgt_file, out_file)

Bleu_1: 0.26913
Bleu_2: 0.16389
Bleu_3: 0.10558
Bleu_4: 0.07024
ROUGE_L: 0.27481
