In [1]:
import json
from rouge import Rouge
import string, re

In [2]:
def read_hospitalityData(path):
    """Flatten a SQuAD-style JSON file into a list of QA records.

    The file is expected to hold a top-level ``'data'`` list whose entries
    contain ``'paragraphs'``; each paragraph carries a ``'context'`` string
    and a ``'qas'`` list of question dictionaries.

    Args:
        path: Location of the JSON file to read.

    Returns:
        A list with one dict per question, each having the keys
        ``'question'``, ``'answer'`` and ``'context'``. ``'answer'`` is the
        text of the first entry in ``'answers'``; if that list is missing or
        empty, the first entry of ``'plausible_answers'`` is used; if neither
        is available it is ``None``.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    def first_answer_text(qa_pair):
        # Prefer real answers and fall back to plausible ones.
        for key in ('answers', 'plausible_answers'):
            if key in qa_pair.keys() and len(qa_pair[key]) > 0:
                return qa_pair[key][0]['text']
        # Neither list had an entry: record the question without an answer.
        return None

    records = []
    for group in payload['data']:
        for paragraph in group['paragraphs']:
            # The context is shared by every question of this paragraph.
            context = paragraph['context']
            for qa_pair in paragraph['qas']:
                question = qa_pair['question']
                answer = first_answer_text(qa_pair)
                records.append({
                    'question': question,
                    'answer': answer,
                    'context': context
                })
    return records

In [3]:
# Read the evaluation file into a flat list of question/answer/context records
hospitalityDataset = read_hospitalityData(path='data/f21mp_test.json')

In [5]:
# Method for cleaning the prediction values
def normalize_text(s):
    """Normalize an answer string so two answers can be compared exactly.

    Steps, applied in this order: lower-case the text, drop every character
    in ``string.punctuation``, replace the standalone words "a", "an" and
    "the" with a space, then collapse all whitespace runs to single spaces.

    Args:
        s: The text to normalize. ``None`` is accepted and treated as an
            empty answer, because ``read_hospitalityData`` stores ``None``
            for questions that have no answer.

    Returns:
        The normalized string (``""`` when ``s`` is ``None``).
    """
    # A missing answer would otherwise fail on ``.lower()``.
    if s is None:
        return ""

    def remove_articles(text):
        # Word boundaries keep words such as "theatre" or "another" intact.
        return re.sub(r"\b(a|an|the)\b", " ", text, flags=re.UNICODE)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

# Method to compute the average exact-match (EM) score
def compute_exact_match(model_out,reference):
    """Return the fraction of predictions that exactly match their reference.

    Both sides are passed through ``normalize_text`` before comparison, and
    ``model_out[i]`` is compared with ``reference[i]``.

    Args:
        model_out: Sequence of predicted answer strings.
        reference: Sequence of gold answer strings, indexed in parallel with
            ``model_out``.

    Returns:
        The mean of the per-item 0/1 match indicators as a float, or ``0.0``
        when ``model_out`` is empty.

    Raises:
        IndexError: If ``reference`` is shorter than ``model_out``.
    """
    total = len(model_out)
    # Nothing to score: avoid dividing by zero.
    if total == 0:
        return 0.0

    matches = 0
    for i in range(total):
        if normalize_text(model_out[i]) == normalize_text(reference[i]):
            matches += 1
    return matches / total
# Method to compute EM for each prediction
def compute_exact_match_eachPredictions(model_out,reference):
    """Build a per-prediction exact-match report.

    Args:
        model_out: Sequence of predicted answer strings.
        reference: Sequence of gold answer strings, indexed in parallel with
            ``model_out``.

    Returns:
        A list with one ``[normalized_prediction, normalized_reference, flag]``
        entry per prediction, where ``flag`` is ``1`` if the two normalized
        strings are equal and ``0`` otherwise.
    """
    rows = []
    for idx in range(len(model_out)):
        # Normalize each side once and reuse it for the comparison and the row.
        norm_prediction = normalize_text(model_out[idx])
        norm_reference = normalize_text(reference[idx])
        flag = 1 if norm_prediction == norm_reference else 0
        rows.append([norm_prediction, norm_reference, flag])
    return rows

def compute_rouge_evaulation(model_out,reference):
    """Score predictions against references with ``Rouge.get_scores``.

    Args:
        model_out: Predicted answer strings (passed as the first argument).
        reference: Gold answer strings (passed as the second argument).

    Returns:
        Whatever ``Rouge().get_scores(..., avg=True)`` returns; callers in
        this notebook index it as ``result['rouge-l']['f']``.
    """
    # A fresh scorer is created on every call; avg=True asks for averaged scores.
    return Rouge().get_scores(model_out, reference, avg=True)

In [6]:
from tqdm import tqdm

# NOTE(review): `qa` is not defined in any cell visible here (execution count
# In [4] is absent) — presumably a question-answering callable that accepts a
# dict with 'question' and 'context' and returns a dict with an 'answer' key.
# Confirm it is defined before this cell so Restart & Run All succeeds.

# Parallel lists: model_out[i] is the prediction for reference[i].
model_out = []
reference = []

for pair in tqdm(hospitalityDataset, leave=True):
    ans = qa({
        'question': pair['question'],
        'context': pair['context']
    })
    # append the prediction and reference to the respective lists
    model_out.append(ans['answer'])
    reference.append(pair['answer'])

100%|██████████| 18/18 [00:03<00:00,  4.75it/s]


In [7]:
# Average exact-match score over all predictions collected above.
em_score = compute_exact_match(model_out, reference)
# NOTE(review): `modelname` is not defined in the visible cells — confirm it is set earlier.
print(f"Model {modelname} average exact match score {em_score}")

Model deepset/bert-large-uncased-whole-word-masking-squad2 average exact match score 0.5555555555555556


In [8]:
# Averaged ROUGE scores for the raw (un-normalized) predictions and references.
rouge_score = compute_rouge_evaulation(model_out, reference)
# NOTE(review): the value reported as "F1-Score" is the 'f' entry of 'rouge-l';
# presumably this differs from the token-overlap F1 used in SQuAD evaluation —
# confirm this is the intended metric.
f1_score = rouge_score['rouge-l']['f']
print(f"Model {modelname} average F1-Score score {f1_score}")

Model deepset/bert-large-uncased-whole-word-masking-squad2 average F1-Score score 0.7055555514570986
