In [1]:
# load generated answers with plausibility annotations
# download file from https://zenodo.org/record/5752542#.Yau-Li8RppQ
import pandas as pd

# All annotated answers (one row per generated answer with its plausibility label).
df = pd.read_csv('answer_plausibility_annotations_clean_without_disagreements.csv')
print(f'{len(df)} rows')

# Keep only the answers whose annotation is exactly 'Plausible'.
is_plausible = df['Annotation'] == 'Plausible'
df_plausible = df[is_plausible]
print(f'{len(df_plausible)} rows')

2078 rows
1863 rows


In [2]:
# get annotated passages for grounding
import sys
from collections import defaultdict

import csv
import codecs

# Passages can exceed the csv module's default per-field size cap, so lift it.
csv.field_size_limit(sys.maxsize)

path = '../annotation-passage-retrieval/'
# Files are read in this order; both use tab-separated (turn_id, p_id, passage) rows.
PASSAGE_FILES = ('passages-ground-truth.tsv', 'passages-batches-0-X-plausible.tsv')

# turn_id -> list of [p_id, passage] pairs, in first-seen order, without duplicates
passages = defaultdict(list)


def _add_passages(tsv_path, passages, seen):
    """
    Append the passages listed in one TSV file to `passages`.

    :param tsv_path: Path of a tab-separated file whose rows are (turn_id, p_id, passage).
    :param passages: Mapping turn_id -> list of [p_id, passage]; updated in place.
    :param seen: Set of (turn_id, p_id, passage) triples already stored; updated in place.
                 Gives constant-time duplicate detection instead of scanning the list.
    :raises ValueError: If a row does not have exactly three fields.
    """
    # newline='' is what the csv module expects so that it handles line endings itself.
    with open(tsv_path, 'r', encoding='utf-8', newline='') as tsv_file:
        for turn_id, p_id, passage in csv.reader(tsv_file, delimiter='\t'):
            key = (turn_id, p_id, passage)
            if key in seen:
                continue
            seen.add(key)
            passages[turn_id].append([p_id, passage])


# Shared across files so that a pair present in both files is stored only once.
_seen_passages = set()
for file_name in PASSAGE_FILES:
    _add_passages(path + file_name, passages, _seen_passages)

print(len(passages), 'questions with at least one retrieved passage')

8456 questions with at least one retrieved passage


In [30]:
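# One-time setup: uncomment and run once to download the 'punkt' tokenizer
# models that nltk.sent_tokenize needs.
# import nltk
# nltk.download('punkt')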

[nltk_data] Downloading package punkt to /Users/sv/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [32]:
# find extractive answers that are close to the original passage text
# iterate over samples and extract spans (ground answers)
import json
import nltk
from span_heuristic import find_closest_span_match

# NOTE(review): currently unused in this cell; kept because other cells may rely on it.
RELEVANCE_THRESHOLD = 0.8

# question_id -> {'generated': unique generated answers,
#                 'extracted': unique evidence sentences taken from the passages}
answers = {}
# NOTE: mode 'w' truncates ./generated_evidence.tsv on every run, even though the
# row export at the bottom of the loop is currently disabled.
with open('./generated_evidence.tsv', 'w') as fout:
    for index, row in df_plausible.iterrows():
        question_id = row['Input.id']
        answer = row['Input.predicted']
        rewrite = row['Input.question']
        model = row['Input.model']

        # Only this one question is processed for now.
        if question_id != '1_1':
            continue

        # .get() rather than passages[question_id]: indexing the defaultdict would
        # insert an empty list for every id that has no passages and inflate
        # len(passages).
        retrieved_passages = passages.get(question_id, [])

        if question_id not in answers:
            answers[question_id] = {'generated': [], 'extracted': []}
        if answer not in answers[question_id]['generated']:
            answers[question_id]['generated'].append(answer)

        # Look for the best grounding of this candidate answer across all retrieved
        # passages. `result` is reset for every row so that a row without passages
        # can never pick up the evidence found for the previous row.
        result = None
        best_score = 0
        for p_id, passage in retrieved_passages:
            best_span, precontext, postcontext, diff, best_f1 = find_closest_span_match(
                passage, answer, pad=10, context_window=200)
            # '>=' means that on a tie the later passage wins.
            if best_f1 >= best_score:
                result = (best_span, precontext, postcontext, diff, best_f1)
                best_score = best_f1

        if result is None:
            # No passage to ground this answer in: nothing to extract.
            continue

        best_span, precontext, postcontext, diff, best_f1 = result
        print(answer)

        # Trim precontext if available: drop everything up to and including its
        # first period, i.e. the leading fragment before the first sentence break.
        if precontext:
            precontext = precontext.split('.', 1)[-1].lstrip()

        # Cut postcontext at its last period (the whole text is kept when there is
        # none) and terminate it with a period.
        postcontext = postcontext.rsplit('.', 1)[0] + '.'

        evidence = precontext + best_span + postcontext
        sent_text = nltk.sent_tokenize(evidence)  # list of sentences

        for s in sent_text:
            if s not in answers[question_id]['extracted']:
                answers[question_id]['extracted'].append(s)

        # Disabled TSV export, kept for reference.
        # NOTE: p_id below is the last passage iterated, not necessarily the one
        # that produced best_span.
#         fout.write('\t'.join([question_id, rewrite,
#                               model, answer, str(len(answer)),
#                               p_id, best_span, precontext, postcontext,
#                               str(diff), str(len(diff)), str(best_f1)]) + '\n')

A physician assistant is a person who has successfully completed an accredited education program for physician assistant, is licensed by the state and is practicing within the scope of that license.
A physician's assistant (PA) is a medical assistant who works under the supervision of a physician and is licensed to practice medicine in the state in which the patient resides.
A physician's assistant (PA) is a medical assistant.
A physician's assistant is a person who assists a physician in the performance of his or her duties.
A physician's assistant (PA) is a medical assistant who works for a supervising physician who is responsible for patients, cared for by the physician assistant.
A physician assistant is a medical professional who assists a doctor in the diagnosis and treatment of a patient.
A physician's assistant is a medical assistant.
A physician's assistant is a medical professional who provides medical care to patients.
A physician's assistant (PA) is a medical professional w

In [33]:
# Show every evidence sentence collected for question 1_1, one per line.
for sentence in answers['1_1']['extracted']:
    print(sentence)

Also we’ve found the top three ranked university programs for aspiring physician assistants.
What Is a PA?
A physician assistant is a certified and state-licensed healthcare professional who works as part of a physician’s team.
This definition is courtesy of the American Academy of Physician Assistants (AAPA).
A licensed PA can independently practice medicine and provide treatment for patients nationally.
- .
�The Physician Assistant.
will: Assist in the evaluation of the surgical patient in the ED, inpatient... as a Physician Assistant by the State of New York Department of Education Board Certified by the National...
" What is the working relationship between a physician and a physician assistant?
The relationship between a PA and the supervising physician is one of mutual trust and respect.
The physician assistant is a representative of the physician, treating the patient in the style and manner developed and directed by the supervising physician.
The physician and PA practice as me

In [34]:
# filter extracted sentences with the sas score
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoConfig
from sentence_transformers import SentenceTransformer, CrossEncoder


def semantic_answer_similarity(predictions: List[List[str]],
                               gold_labels: List[List[str]],
                               sas_model_name_or_path: str = "cross-encoder/stsb-roberta-large"
                               ) -> np.ndarray:
    """
    Computes Transformer-based similarity of predicted answers to gold labels to derive a more
    meaningful metric than EM or F1.

    :param predictions: Predicted answers as list of multiple preds per question
    :param gold_labels: Labels as list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual similarity model, should be path or string
                                     pointing to downloadable models. Models whose config lists a
                                     `*ForSequenceClassification` architecture are loaded as cross-encoders,
                                     everything else as bi-encoders (for example
                                     "sentence-transformers/paraphrase-multilingual-mpnet-base-v2").
    :return: Raw similarity scores of the LAST question in the input; earlier questions are scored
             but their scores are not returned, so pass one question per call to get all of them.
             Cross-encoder: flat array with len(preds) * len(labels) entries, ordered prediction by
             prediction (all labels of the first prediction, then of the second, ...).
             Bi-encoder: matrix of shape (len(preds), len(labels)) with cosine similarities.
             An empty array is returned when there are no questions, or when the last question
             has no predictions or no labels.
    :raises AssertionError: If `predictions` and `gold_labels` differ in length.
    """
    assert len(predictions) == len(gold_labels)

    # Nothing to score: return early instead of failing on an unbound `scores`
    # (and without loading any model).
    if len(predictions) == 0:
        return np.array([], dtype=float)

    config = AutoConfig.from_pretrained(sas_model_name_or_path)
    cross_encoder_used = False
    if config.architectures is not None:
        cross_encoder_used = any(arch.endswith('ForSequenceClassification') for arch in config.architectures)

    scores = np.array([], dtype=float)

    # Based on Modelstring we can load either Bi-Encoders or Cross Encoders.
    # Similarity computation changes for both approaches
    if cross_encoder_used:
        model = CrossEncoder(sas_model_name_or_path)
        for preds, labels in zip(predictions, gold_labels):
            # TODO add efficient batch mode: put all texts and labels into grid and extract scores afterwards
            grid = [(p, l) for p in preds for l in labels]
            # Do not call the model with an empty batch.
            scores = model.predict(grid) if grid else np.array([], dtype=float)
    else:
        # For Bi-encoders we can flatten predictions and labels into one list
        model = SentenceTransformer(sas_model_name_or_path)
        lengths: List[Tuple[int, int]] = []
        all_texts: List[str] = []
        for p, l in zip(predictions, gold_labels):                                  # type: ignore
            # TODO potentially exclude (near) exact matches from computations
            all_texts.extend(p)
            all_texts.extend(l)
            lengths.append((len(p), len(l)))
        # then compute embeddings (skipped when there is no text at all)
        embeddings = model.encode(all_texts) if all_texts else None

        # then select which embeddings will be used for similarity computations
        current_position = 0
        for len_p, len_l in lengths:
            pred_rows = slice(current_position, current_position + len_p)
            current_position += len_p
            label_rows = slice(current_position, current_position + len_l)
            current_position += len_l
            if len_p == 0 or len_l == 0:
                # cosine_similarity rejects empty inputs; an empty matrix is the natural result
                scores = np.empty((len_p, len_l))
                continue
            scores = cosine_similarity(embeddings[pred_rows, :], embeddings[label_rows, :])

    return scores

In [35]:
# Score the extracted sentences of question 1_1 against its generated answers.
extracted_sentences = answers['1_1']['extracted']
generated_answers = answers['1_1']['generated']
scores = semantic_answer_similarity([extracted_sentences], [generated_answers])

In [36]:
# Number of similarity scores returned for question 1_1.
n_scores = len(scores)
print(n_scores)

209


In [39]:
# Cut the score sequence into consecutive chunks, each as long as the list of
# generated answers; the next cell treats chunk i as the scores of extracted
# sentence i.
n_generated = len(answers['1_1']['generated'])
scored_answers = []
for start in range(0, len(scores), n_generated):
    scored_answers.append(scores[start:start + n_generated])

In [42]:
# Minimum best-match similarity an extracted sentence needs in order to be shown.
th = 0.8
for idx, chunk in enumerate(scored_answers):
    # Skip sentences whose highest score against any generated answer is not above th.
    if not max(chunk) > th:
        continue
    print(answers['1_1']['extracted'][idx])

A physician assistant is a certified and state-licensed healthcare professional who works as part of a physician’s team.
A licensed PA can independently practice medicine and provide treatment for patients nationally.
�The Physician Assistant.
The physician assistant is a representative of the physician, treating the patient in the style and manner developed and directed by the supervising physician.
Regulatory Requirements: A physician assistant may provide medical services, which are delegated in writing by a supervising physician who is responsible for patients, cared for by the physician assistant.
