In [1]:
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoConfig
from sentence_transformers import SentenceTransformer, CrossEncoder


# Loaded SAS models keyed by model name/path -> (is_cross_encoder, model).
# Loading a transformer is by far the most expensive step and this function is
# typically called once per sample, so every model is loaded at most once.
_SAS_MODEL_CACHE: Dict[str, Tuple[bool, Any]] = {}


def _load_sas_model(sas_model_name_or_path: str) -> Tuple[bool, Any]:
    """Return ``(is_cross_encoder, model)`` for the given model, loading it on first use only."""
    cached = _SAS_MODEL_CACHE.get(sas_model_name_or_path)
    if cached is None:
        # A config that lists a ``*ForSequenceClassification`` architecture is treated
        # as a Cross-Encoder; anything else is loaded as a Bi-Encoder.
        config = AutoConfig.from_pretrained(sas_model_name_or_path)
        is_cross_encoder = any(arch.endswith('ForSequenceClassification')
                               for arch in (config.architectures or []))
        if is_cross_encoder:
            model = CrossEncoder(sas_model_name_or_path)
        else:
            model = SentenceTransformer(sas_model_name_or_path)
        cached = (is_cross_encoder, model)
        _SAS_MODEL_CACHE[sas_model_name_or_path] = cached
    return cached


def semantic_answer_similarity(predictions: List[List[str]],
                               gold_labels: List[List[str]],
                               sas_model_name_or_path: str = "cross-encoder/stsb-roberta-large"
#                                sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
                               ) -> np.ndarray:
    """
    Computes Transformer-based similarity of predicted answers to gold labels to derive a more meaningful
    metric than EM or F1.

    Only the scores of the LAST question in ``predictions`` / ``gold_labels`` are returned, as raw
    prediction-vs-label similarities (no top-1 / top-k aggregation). Callers that need per-prediction
    scores for one question pass a single-element outer list.

    :param predictions: Predicted answers as list of multiple preds per question
    :param gold_labels: Labels as list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual similarity model, should be path or string
                                     pointing to downloadable models.
    :return: Cross-Encoder: the output of ``model.predict`` for every (prediction, label) pair, ordered
             prediction by prediction (all labels of the first prediction first).
             Bi-Encoder: the output of ``cosine_similarity(prediction_embeddings, label_embeddings)``.
             An empty float32 array if there are no questions, or if the last question has no
             predictions or no labels (no model is loaded in that case).
    :raises AssertionError: if ``predictions`` and ``gold_labels`` differ in length.
    """
    assert len(predictions) == len(gold_labels)

    # Nothing to score: return early instead of failing on an undefined result.
    if not predictions:
        return np.array([], dtype=np.float32)

    # Only the last question's scores are returned, so only that question is scored.
    preds, labels = predictions[-1], gold_labels[-1]
    if not preds or not labels:
        return np.array([], dtype=np.float32)

    # Based on the model config we get either a Bi-Encoder or a Cross-Encoder.
    # Similarity computation differs between the two approaches.
    cross_encoder_used, model = _load_sas_model(sas_model_name_or_path)

    if cross_encoder_used:
        # Score every (prediction, label) pair jointly.
        grid = [(p, l) for p in preds for l in labels]
        return model.predict(grid)

    # For Bi-Encoders, embed predictions and labels in one pass, then split the
    # embedding matrix back into its two parts.
    # TODO potentially exclude (near) exact matches from computations
    embeddings = model.encode(list(preds) + list(labels))
    pred_embeddings = embeddings[:len(preds), :]
    label_embeddings = embeddings[len(preds):, :]
    return cosine_similarity(pred_embeddings, label_embeddings)

In [2]:
# Sanity check on a single question: one reference answer, scored against a
# focused prediction and a highly repetitive one.
gt = [
    ["The Court ruled, in a 7-2 decision, that a woman's right to choose an abortion was protected by the privacy rights guaranteed by the Fourteenth Amendment to the U.S. Constitution."]
]
predictions = [
    [
        "The main arguments of the Roe vs. Wade case were that the U.S. Constitution protects the right to an abortion.",
        "Roe v Wade Roe v Wade Roe v. Wade Roe v. Wade Roe v. Wade Roe v. Wade Roe v. Wade and Doe v. Bolton Roe v. Wade and Doe v. Bolton Roe v. Wade and Doe v. Bolton Roe v. Wade and Doe v. Bolton Roe v. Wade.",
    ]
]
# Last expression of the cell, so the returned scores are displayed.
semantic_answer_similarity(predictions, gt)

array([0.5875769, 0.5332216], dtype=float32)

## SCAI-QReCC

In [3]:
# Load the ground-truth samples of the QReCC test set.
import json

path = '../../dataset/qrecc-test.json'

# Read explicitly as UTF-8: without an encoding argument the platform default is
# used, which can fail or mis-decode non-ASCII text on some systems.
with open(path, encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Load the answers of every model run, grouped by sample.
import os

path = '../../runs/'

# samples[sample_id][model_label] -> answer of that model for that sample,
# where sample_id is "<Conversation_no>_<Turn_no>".
samples = {}
# os.listdir returns entries in arbitrary order; sorting keeps the per-sample
# model order (and everything derived from it) reproducible across runs.
for filename in sorted(os.listdir(path)):
    if not filename.endswith('.json'):
        continue
    # Strip only the extension, so run files whose names contain extra dots keep
    # distinct labels instead of collapsing onto the same prefix.
    model_label = os.path.splitext(filename)[0]
    # load model run
    with open(os.path.join(path, filename), encoding="utf-8") as f:
        run = json.load(f)
    # load samples for this run
    for sample in run:
        sample_id = "%d_%d" % (sample['Conversation_no'], sample['Turn_no'])
        samples.setdefault(sample_id, {})[model_label] = sample['Model_answer']

print('\n %d samples loaded' % len(samples))


 16736 samples loaded


In [5]:
# Word tokenization
import spacy
from spacy.lang.en import English

# English spaCy pipeline; `bower` runs every text through it to obtain tokens
# with lemma, stop-word and punctuation attributes.
nlp = spacy.load("en_core_web_sm")
# spaCy's English stop-word collection (not referenced by the visible code).
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS


def bower(text):
    """
    Reduce a text to its bag of words: the distinct lemmas of its lower-cased tokens,
    skipping stop words and punctuation.

    :param text: Input string.
    :return: Sorted list of unique lemmas. Sorting gives a canonical order, so two texts
             with the same bag of words yield equal lists (a plain ``list(set(...))``
             has no guaranteed order and could compare unequal).
    """
    lemmas = {token.lemma_ for token in nlp(text.lower())
              if not token.is_stop and not token.is_punct}
    return sorted(lemmas)

In [8]:
# Minimum similarity to the ground truth for a prediction to be kept.
SAS_THRESHOLD = 0.7
# A sample is reported only if more than this many distinct answers were
# collected; the ground-truth answer itself counts as one of them.
MIN_DISTINCT_ANSWERS = 3

# For every ground-truth sample that has model answers, print the predictions that
# are similar to the ground truth yet differ from it (and from each other) in
# their bag of words.
for sample in data:
    sample_id = "%d_%d" % (sample['Conversation_no'], sample['Turn_no'])
    if sample_id not in samples:
        continue
    models = list(samples[sample_id].keys())
    predictions = list(samples[sample_id].values())
    gt = sample['Answer']
    # One question with a single gold label; sas[j] is then used as the score of
    # prediction j (holds for the default cross-encoder model).
    sas = semantic_answer_similarity([predictions], [[gt]])
    # Bags of words are compared as frozensets: comparing the lists returned by
    # `bower` would depend on element order and could miss duplicates.
    answers, ps = [frozenset(bower(gt))], []
    for j, p in enumerate(predictions):
        bow = frozenset(bower(p))
        if sas[j] > SAS_THRESHOLD and bow not in answers:
            ps.append(j)
            answers.append(bow)
    if len(answers) > MIN_DISTINCT_ANSWERS:
        for j in ps:
            print(sample_id, '\t', sample['Rewrite'], '\t', sample['Answer'],
                  '\t', predictions[j], '\t', sas[j], '\t', models[j])

1_1 	 What is a physician's assistant? 	 physician assistants are medical providers who are licensed to diagnose and treat illness and disease and to prescribe medication for patients 	 A physician's assistant (PA) is a medical professional who provides medical care for patients. 	 0.7747502 	 rachael-scai-qrecc21-test-dataset-2021-07-20-rachael-no-run-identifier-2021-09-08-07-09-57
1_1 	 What is a physician's assistant? 	 physician assistants are medical providers who are licensed to diagnose and treat illness and disease and to prescribe medication for patients 	 A physician assistant is a medical professional who assists a doctor in the diagnosis and treatment of a patient. 	 0.71656144 	 ultron-scai-qrecc21-test-dataset-rewritten-2021-07-20-ultron-no-run-identifier-2021-09-08-15-04-28
1_1 	 What is a physician's assistant? 	 physician assistants are medical providers who are licensed to diagnose and treat illness and disease and to prescribe medication for patients 	 A physician's 

KeyboardInterrupt: 

In [None]:
# Task 1. Answering.
# 1. Does this response answer the question? 
## * Yes * No * Malformed
# 2. Remarks

# Task 2. Grounding.
# 1. Mark the passages that contain the evidence supporting this answer.
# 2. Remarks


# 2. If yes: is this response supported by the evidence provided above?
## * Supported * Unsupported
# 3. If supported: provide the ID(s) of the passage(s) that supports the response.
# 4. Remarks