In [142]:
import difflib
import os
import pickle
import string
import warnings

import numpy as np
import pandas as pd

#### Load test SQAD dataset

In [84]:
dataset = pd.read_json("czech_test_answer_sentence.json")
dataset

Unnamed: 0,answers,context,id,title,question,answer_sentence,answer_sentence_span
0,"{'answer_start': [12], 'text': ['levostranný']}",Dřevnice je levostranný přítok řeky Moravy ve ...,12242,Jaký přítok je Dřevnice?,Jaký přítok je Dřevnice?,Dřevnice je levostranný přítok řeky Moravy ve ...,"[0, 61]"
1,"{'answer_start': [0], 'text': ['ve Foxrocku']}",Samuel Barclay Beckett [Bekit] (13. dubna 1906...,2490,Kde se narodil Samuel Beckett?,Kde se narodil Samuel Beckett?,Samuel Barclay Beckett [Bekit] (13. dubna 1906...,"[0, 130]"
2,"{'answer_start': [216], 'text': ['Josef Ressel']}",Buzola (také busola) je jednoduchý přístroj pr...,5571,Který český vynálezce sestrojil první buzolu?,Který český vynálezce sestrojil první buzolu?,První buzolu sestrojil český vynálezce Josef R...,"[177, 229]"
3,"{'answer_start': [2084], 'text': ['1869']}","Deoxyribonukleová kyselina, běžně označovaná D...",7325,Ve kterém roce byla popsána deoxyribonukleová ...,Ve kterém roce byla popsána deoxyribonukleová ...,Deoxyribonukleová kyselina byla popsána roku 1...,"[2039, 2173]"
4,"{'answer_start': [168], 'text': ['strojové ins...","Centrální procesorová jednotka (zkratka CPU, a...",3770,Co vykonává procesor?,Co vykonává procesor?,"Centrální procesorová jednotka (zkratka CPU, a...","[0, 263]"
...,...,...,...,...,...,...,...
2484,"{'answer_start': [1137], 'text': ['na počátku ...","Gainax (japonsky ガ, Gainakkusu) je japonské an...",4817,Kdy bylo založeno studio Gainax?,Kdy bylo založeno studio Gainax?,Studio Gainax bylo založeno na počátku 80. let...,"[1109, 1278]"
2485,"{'answer_start': [830], 'text': ['Titan je šed...","Titan (chemická značka Ti, latinsky Titanium) ...",11707,Je titan lehký kov?,Je titan lehký kov?,"Titan je šedý až stříbřitě bílý, lehký a tvrdý...","[830, 881]"
2486,"{'answer_start': [17], 'text': ['10. března 19...",Eva Herzigová (* 10. března 1973 Litvínov) je ...,5771,Kdy se narodila Eva Herzigová?,Kdy se narodila Eva Herzigová?,Eva Herzigová (* 10. března 1973 Litvínov) je ...,"[0, 75]"
2487,"{'answer_start': [58], 'text': ['Vladimír Štan...","Michal David (* 14. července 1960 Praha), vlas...",10738,Jak se vlastním jménem jmenuje Michal David?,Jak se vlastním jménem jmenuje Michal David?,"Michal David (* 14. července 1960 Praha), vlas...","[0, 129]"


### Functions evaluating the different tasks and metrics (Answer Extraction and Answer Selection, with exact_match@k and partial_match@k for each)

In [133]:
def get_exact_match(answer, preds):
    """Rank-bucketed exact match of the gold answer against ranked predictions.

    The gold answer and every prediction text are compared after stripping
    surrounding whitespace and then surrounding ASCII punctuation.

    Args:
        answer: gold answer string.
        preds: ranked iterable of dicts, each providing a "text" entry.

    Returns:
        Tuple of four booleans (top 1, top 5, top 10, top 20) telling whether
        the first matching prediction lies within the respective cut-off.
        All four are False when nothing matches within the first 20 ranks.
    """
    def _normalise(text):
        # whitespace first, punctuation second - same order for both sides
        return text.strip().strip(string.punctuation)

    gold = _normalise(answer)
    for rank, prediction in enumerate(preds, start=1):
        if _normalise(prediction["text"]) != gold:
            continue
        if rank <= 20:
            return (rank == 1, rank <= 5, rank <= 10, True)
    return (False, False, False, False)

In [87]:
def get_coverage_match(answer, answer_start, preds):
    """Rank-bucketed coverage (partial overlap) match on character spans.

    The gold span is [answer_start, answer_start + len(answer)); a prediction
    span is [text_start, text_start + len(text)). A prediction counts when it
    lies inside the gold span, or when it overlaps the gold span on exactly
    one side. A prediction that extends beyond the gold span on BOTH sides
    is not counted.

    Args:
        answer: gold answer string (only its length is used).
        answer_start: character offset of the gold answer.
        preds: ranked iterable of dicts with "text_start" and "text" entries.

    Returns:
        Tuple of four booleans (top 1, top 5, top 10, top 20) for the first
        counted prediction; all False if none is found in the first 20 ranks.
    """
    gold_start = answer_start
    gold_end = answer_start + len(answer)

    def _covers(pred_start, pred_end):
        starts_inside = pred_start >= gold_start
        ends_inside = pred_end <= gold_end
        if starts_inside and ends_inside:
            return True
        if ends_inside:
            # begins before the answer: must reach past the answer's start
            return pred_end > gold_start
        if starts_inside:
            # runs past the answer: must begin before the answer's end
            return pred_start < gold_end
        # sticks out on both sides
        return False

    for rank, prediction in enumerate(preds, start=1):
        pred_start = prediction["text_start"]
        pred_end = pred_start + len(prediction["text"])
        if _covers(pred_start, pred_end) and rank <= 20:
            return (rank == 1, rank <= 5, rank <= 10, True)
    return (False, False, False, False)

In [129]:
def get_coverage_match_T5(answer, preds):
    """Rank-bucketed coverage match for generated (span-less) predictions.

    A prediction counts when at least one of its whitespace-separated tokens
    is a substring of the gold answer.

    Args:
        answer: gold answer string.
        preds: ranked iterable of dicts, each providing a "text" entry.

    Returns:
        Tuple of four booleans (top 1, top 5, top 10, top 20) for the first
        counted prediction; all False if none is found in the first 20 ranks.
    """
    for rank, prediction in enumerate(preds, start=1):
        tokens = prediction["text"].split()
        token_hit = any(token in answer for token in tokens)
        if token_hit and rank <= 20:
            return (rank == 1, rank <= 5, rank <= 10, True)
    return (False, False, False, False)

In [353]:
def get_answer_selection_em(answer_sentence_span, preds):
    """Rank-bucketed check that a predicted span lies inside the answer sentence.

    Args:
        answer_sentence_span: pair (start, end) of character offsets of the
            gold answer sentence.
        preds: ranked iterable of dicts with "text_start" and "text" entries.

    Returns:
        Tuple of four booleans (top 1, top 5, top 10, top 20) for the first
        prediction whose span [text_start, text_start + len(text)] is fully
        contained in the sentence span; all False if there is none in the
        first 20 ranks.
    """
    sentence_start = answer_sentence_span[0]
    sentence_end = answer_sentence_span[1]
    for rank, prediction in enumerate(preds, start=1):
        pred_start = prediction["text_start"]
        pred_end = pred_start + len(prediction["text"])
        inside_sentence = sentence_start <= pred_start and pred_end <= sentence_end
        if inside_sentence and rank <= 20:
            return rank == 1, rank <= 5, rank <= 10, True
    return False, False, False, False

def get_answer_selection_cm(answer_sentence_span, preds):
    """Rank-bucketed check that a predicted span overlaps the answer sentence.

    A prediction counts when its span [text_start, text_start + len(text)]
    lies inside the sentence span, or overlaps it on exactly one side.
    A prediction that extends beyond the sentence on BOTH sides is not
    counted.

    Args:
        answer_sentence_span: pair (start, end) of character offsets of the
            gold answer sentence.
        preds: ranked iterable of dicts with "text_start" and "text" entries.

    Returns:
        Tuple of four booleans (top 1, top 5, top 10, top 20) for the first
        counted prediction; all False if none is found in the first 20 ranks.
    """
    sentence_start = answer_sentence_span[0]
    sentence_end = answer_sentence_span[1]

    def _touches_sentence(pred_start, pred_end):
        starts_inside = pred_start >= sentence_start
        ends_inside = pred_end <= sentence_end
        if starts_inside and ends_inside:
            return True
        if ends_inside:
            # begins before the sentence: must reach past the sentence start
            return pred_end > sentence_start
        if starts_inside:
            # runs past the sentence: must begin before the sentence end
            return pred_start < sentence_end
        return False

    for rank, prediction in enumerate(preds, start=1):
        pred_start = prediction["text_start"]
        pred_end = pred_start + len(prediction["text"])
        if _touches_sentence(pred_start, pred_end) and rank <= 20:
            return rank == 1, rank <= 5, rank <= 10, True
    return False, False, False, False
        
        

In [285]:
def evaluate_model(dataset, predictions_data, model_name):
    """Evaluate one span-extraction model on answer extraction and selection.

    For every row of ``dataset`` the four rank-bucketed metrics
    (exact / coverage match on the answer, exact / coverage match on the
    answer sentence) are computed from the model's ranked predictions and
    summed; the sums are then divided by the number of rows.

    Args:
        dataset: frame whose rows expose ``answers`` (mapping with "text" and
            "answer_start" lists; only the first entry of each is used),
            ``answer_sentence_span`` and ``id``.
        predictions_data: mapping from ``row.id`` to the ranked predictions
            of that question.
        model_name: label stored in the result and used in the printed report.

    Returns:
        Dict with "model_name" plus, for each of the four metrics, the ratio
        at cut-offs 1, 5, 10 and 20 (keys ``<metric>``, ``<metric>_5``,
        ``<metric>_10``, ``<metric>_20``). The same numbers are printed.
    """
    n_questions = dataset.shape[0]
    cutoff_suffixes = ("", "_5", "_10", "_20")
    line_templates = (
        "{} MATCH:        {:0.4f}",
        "{} MATCH TOP 5:  {:0.4f}",
        "{} MATCH TOP 10: {:0.4f}",
        "{} MATCH TOP 20: {:0.4f}",
    )

    # one list of four running hit counters per metric
    hits = {
        "answer_exact_match": [0, 0, 0, 0],
        "answer_coverage_match": [0, 0, 0, 0],
        "answer_sentence_exact_match": [0, 0, 0, 0],
        "answer_sentence_coverage_match": [0, 0, 0, 0],
    }

    def _accumulate(metric, flags):
        counters = hits[metric]
        for slot in range(4):
            counters[slot] += flags[slot]

    for position in range(n_questions):
        row = dataset.iloc[position]
        answer = row.answers["text"][0]
        answer_start = row.answers["answer_start"][0]
        answer_sentence_span = row.answer_sentence_span
        predictions = predictions_data[row.id]

        _accumulate("answer_exact_match",
                    get_exact_match(answer, predictions))
        _accumulate("answer_coverage_match",
                    get_coverage_match(answer, answer_start, predictions))
        _accumulate("answer_sentence_exact_match",
                    get_answer_selection_em(answer_sentence_span, predictions))
        _accumulate("answer_sentence_coverage_match",
                    get_answer_selection_cm(answer_sentence_span, predictions))

    results = {"model_name": model_name}

    def _store(metric):
        # turn the four hit counts of a metric into ratios in the result dict
        for suffix, count in zip(cutoff_suffixes, hits[metric]):
            results[metric + suffix] = count/n_questions

    def _report(label, metric):
        # print the four ratios of a metric, one line per cut-off
        for template, suffix in zip(line_templates, cutoff_suffixes):
            print(template.format(label, results[metric + suffix]))

    _store("answer_exact_match")
    print("Answer extraction results for: " + model_name)
    _report("EXACT", "answer_exact_match")

    _store("answer_coverage_match")
    _report("COVERAGE", "answer_coverage_match")

    print("\nAnswer selection results for: " + model_name)

    _store("answer_sentence_exact_match")
    _report("EXACT", "answer_sentence_exact_match")

    _store("answer_sentence_coverage_match")
    _report("COVERAGE", "answer_sentence_coverage_match")
    return results

In [355]:
def run_evaluation(test_dataset, predictions_dir="model_predictions"):
    """Evaluate all BERT-like models and collect their scores in one frame.

    Args:
        test_dataset: dataset forwarded unchanged to ``evaluate_model``.
        predictions_dir: directory holding the ``<model>_predictions.json``
            files; defaults to the notebook's "model_predictions" folder.

    Returns:
        ``pandas.DataFrame`` with one row per model, built from the dicts
        returned by ``evaluate_model``.
    """
    models = ["albert_czech", "bert_czech", "bert_multilingual", "bert_slavic", "electra_czech", "roberta_czech", "roberta_multilingual"]
    per_model_results = []
    for m in models:
        # os.path.join instead of a literal "\\" so the path also resolves
        # on non-Windows systems
        predictions_path = os.path.join(predictions_dir, "{}_predictions.json".format(m))
        # NOTE(review): despite the .json extension the file is unpickled;
        # pickle.load can execute arbitrary code, so only load trusted files.
        with open(predictions_path, 'rb') as fp:
            predictions_data = pickle.load(fp)
        print(100*'-'+"\n"+predictions_path+"\n"+100*'-')
        per_model_results.append(evaluate_model(test_dataset, predictions_data, m))
    return pd.DataFrame(per_model_results)

In [287]:
# Load the ranked mT5 predictions used by the cells below.
# The path is built with os.path.join so it also resolves on non-Windows systems.
# NOTE(review): despite the .json extension the file is unpickled;
# pickle.load can execute arbitrary code, so only load trusted files.
t5_predictions_path = os.path.join("model_predictions", "T5_multilingual_predictions.json")
with open(t5_predictions_path, 'rb') as fp:
    predictions_data = pickle.load(fp)
len(predictions_data.keys())

2489

In [288]:
#Calculate the number of different characters in 2 strings
def get_different_chars(a, b):
    """Count the characters of ``b`` that do not occur anywhere in ``a``.

    Every occurrence is counted, so a character missing from ``a`` that
    appears twice in ``b`` contributes 2.
    """
    return sum(1 for b_ch in b if b_ch not in a)

In [289]:
get_different_chars("", "")

0

In [356]:
def check_if_only_additional_punctuation(answer, prediction):
    """Tell whether ``prediction`` equals ``answer`` once it is cleaned up.

    Only the prediction is cleaned (surrounding whitespace, then surrounding
    ASCII punctuation); the answer is compared as given.
    """
    cleaned_prediction = prediction.strip().strip(string.punctuation)
    return cleaned_prediction == answer

#### Get the average number of differing characters between the predictions and the answer, and the number of answers that would be an exact match with punctuation removed


In [357]:

# For every question whose predictions contain no verbatim copy of the answer:
#  * add up get_different_chars(prediction, answer) over its predictions and
#    divide by 20 (the divisor is fixed, not the actual number of predictions),
#  * note whether some prediction equals the answer after punctuation stripping.
total_sum_of_means = 0
number_of_q_with_punctuation = 0
for row in dataset.iloc:
    pred_answers = [x["text"] for x in predictions_data[row.id]]
    answer = row.answers["text"][0]
    flag = answer in pred_answers
    only_punct_different = False
    if not flag:
        different = sum(get_different_chars(p, answer) for p in pred_answers)
        only_punct_different = any(
            check_if_only_additional_punctuation(answer, p) for p in pred_answers
        )
        total_sum_of_means += different/20
    if only_punct_different:
        number_of_q_with_punctuation += 1
print("Number of predictions that would be correct with punctuation stripping:",number_of_q_with_punctuation)
print("Final character difference count:", total_sum_of_means/dataset.shape[0])

Number of predictions that would be correct with punctuation stripping: 1
Final character difference count: 5.434692647649656


In [350]:
# Example of mT5 generating correct answers but with different spelling and upper and lower cases
# (question 8312 is shown only if none of its predictions equals the answer verbatim)
for row in dataset.iloc:
    pred_answers = [x["text"] for x in predictions_data[row.id]]
    answer = row.answers["text"][0]
    if answer in pred_answers or row.id != 8312:
        continue
    print(row.id)
    print(row.question)
    print(answer)
    for predicted_text in pred_answers:
        print(predicted_text,end=", ")


8312
Na kterém pólu Země je Antarktida?
jižní
Jižním, Jižním, jižního, Jižním pólu, jižním, Jižního, na Jižním, Ježním, jižního pólu, Jižním polou, Jižní, Na Jižním, Jinžním, Jižním polu, Jižním,, Jižní, Jižním?, jižním, Jižním, Jinžního, 

In [358]:
def evaluate_model_T5(dataset, predictions_data):
    """Evaluate the generative T5 model on answer extraction and print the scores.

    Only answer extraction is scored (exact match and token-based coverage
    match); no answer-selection metric is computed here.

    Args:
        dataset: frame whose rows expose ``answers`` (mapping with a "text"
            list; only its first entry is used) and ``id``.
        predictions_data: mapping from ``row.id`` to the ranked predictions
            of that question.

    Returns:
        None. The ratios at cut-offs 1, 5, 10 and 20 are printed.
    """
    n_questions = dataset.shape[0]
    # running hit counters for the cut-offs 1, 5, 10 and 20
    exact_hits = [0, 0, 0, 0]
    coverage_hits = [0, 0, 0, 0]

    for position in range(n_questions):
        row = dataset.iloc[position]
        answer = row.answers["text"][0]
        predictions = predictions_data[row.id]

        em = get_exact_match(answer, predictions)
        cm = get_coverage_match_T5(answer, predictions)
        for slot in range(4):
            exact_hits[slot] += em[slot]
            coverage_hits[slot] += cm[slot]

    print("Answer extraction results for T5_multilingual ")

    print("EXACT MATCH:        {:0.4f}".format(exact_hits[0]/n_questions))
    print("EXACT MATCH TOP 5:  {:0.4f}".format(exact_hits[1]/n_questions))
    print("EXACT MATCH TOP 10: {:0.4f}".format(exact_hits[2]/n_questions))
    print("EXACT MATCH TOP 20: {:0.4f}".format(exact_hits[3]/n_questions))

    print("COVERAGE MATCH:        {:0.4f}".format(coverage_hits[0]/n_questions))
    print("COVERAGE MATCH TOP 5:  {:0.4f}".format(coverage_hits[1]/n_questions))
    print("COVERAGE MATCH TOP 10: {:0.4f}".format(coverage_hits[2]/n_questions))
    print("COVERAGE MATCH TOP 20: {:0.4f}".format(coverage_hits[3]/n_questions))

In [None]:
evaluate_model_T5(dataset, predictions_data)

In [None]:
df = run_evaluation(dataset)

In [104]:
df.describe()

Unnamed: 0,answer_exact_match,answer_exact_match_5,answer_exact_match_10,answer_exact_match_20,answer_coverage_match,answer_coverage_match_5,answer_coverage_match_10,answer_coverage_match_20,answer_sentence_exact_match,answer_sentence_exact_match_5,answer_sentence_exact_match_10,answer_sentence_exact_match_20,answer_sentence_coverage_match,answer_sentence_coverage_match_5,answer_sentence_coverage_match_10,answer_sentence_coverage_match_20
count,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0
mean,0.679447,0.816909,0.854044,0.882512,0.820238,0.911955,0.937324,0.957929,0.840154,0.901854,0.922918,0.940596,0.86822,0.937037,0.9577,0.97268
std,0.074483,0.071815,0.068275,0.061923,0.067374,0.052642,0.039264,0.02944,0.08099,0.062563,0.051935,0.041193,0.062505,0.042081,0.028761,0.019253
min,0.564484,0.705504,0.745681,0.78626,0.718361,0.829249,0.880273,0.913218,0.719566,0.802732,0.838489,0.875854,0.769787,0.867818,0.912816,0.944154
25%,0.634793,0.769385,0.809361,0.839293,0.780032,0.878666,0.9092,0.938128,0.778827,0.857172,0.887907,0.911008,0.830655,0.91141,0.93853,0.95902
50%,0.70229,0.849337,0.882282,0.912816,0.838489,0.937324,0.955002,0.969064,0.87505,0.936119,0.951386,0.964644,0.887907,0.957011,0.971474,0.982322
75%,0.732624,0.872037,0.908196,0.9319,0.871233,0.94998,0.967256,0.981117,0.904178,0.949578,0.963037,0.972077,0.915428,0.967256,0.97951,0.986943
max,0.75452,0.880675,0.915227,0.936119,0.882282,0.959823,0.973082,0.984733,0.92045,0.960627,0.968662,0.977501,0.927682,0.977099,0.983528,0.990358


In [27]:
df.to_csv("evaluated_models.csv")

# Mean Reciprocal Rank and Mean Average Precision

In [359]:
def get_mrr_rank(answer, predictions, answer_sentence):
    """Reciprocal rank of the first relevant prediction for one question.

    A prediction is relevant when the normalised gold answer occurs in the
    prediction text, or the normalised prediction text occurs in the answer
    sentence. Normalisation strips surrounding whitespace, then surrounding
    ASCII punctuation.

    Args:
        answer: gold answer string.
        predictions: ranked iterable of dicts, each providing a "text" entry.
        answer_sentence: text of the sentence containing the gold answer.

    Returns:
        ``1 / rank`` of the first relevant prediction (rank starts at 1),
        or 0 when no prediction is relevant.
    """
    # loop-invariant, so normalise the gold answer once
    gold = answer.strip().strip(string.punctuation)
    for rank, p in enumerate(predictions, start=1):
        predicted = p["text"].strip().strip(string.punctuation)
        # "" is a substring of every string, so an empty (or punctuation-only)
        # side must not be allowed to satisfy the containment test
        answer_in_prediction = bool(gold) and gold in p["text"]
        prediction_in_sentence = bool(predicted) and predicted in answer_sentence
        if answer_in_prediction or prediction_in_sentence:
            return 1/rank
    return 0

In [360]:
def average_precision(binary_vector):
    """Average precision of a ranked 0/1 relevance vector.

    Args:
        binary_vector: iterable of 0/1 relevance flags in rank order
            (entries equal to 1 are the relevant ones).

    Returns:
        Mean of precision@k taken over the ranks k that hold a relevant
        entry, or 0 when there is no relevant entry (including empty input).
    """
    hits = 0
    precisions = []
    for rank, val in enumerate(binary_vector, start=1):
        if val == 1:
            hits += 1
            # for a 0/1 vector the running hit count equals the sum of the
            # prefix, so the prefix does not need to be re-summed at each hit
            precisions.append(hits/rank)
    return (1/hits)*np.sum(precisions) if hits else 0

def get_average_precision(answer, predictions, answer_sentence):
    """Average precision of the ranked predictions for one question.

    A prediction is relevant when the whitespace-stripped gold answer occurs
    in the prediction text, or the whitespace-stripped prediction text occurs
    in the answer sentence.

    Args:
        answer: gold answer string.
        predictions: ranked iterable of dicts, each providing a "text" entry.
        answer_sentence: text of the sentence containing the gold answer.

    Returns:
        Result of ``average_precision`` on the 0/1 relevance vector.
    """
    # NOTE(review): get_mrr_rank also strips surrounding punctuation before
    # comparing; here only whitespace is stripped - confirm this is intended.
    gold = answer.strip()
    bin_vector = []
    for prediction in predictions:
        predicted = prediction["text"].strip()
        # "" is a substring of every string, so an empty side must not be
        # allowed to satisfy the containment test
        relevant = (bool(gold) and gold in prediction["text"]) \
            or (bool(predicted) and predicted in answer_sentence)
        bin_vector.append(int(relevant))
    return average_precision(bin_vector)

In [361]:
def evaluate_mrr_and_map(dataset, predictions_data):
    """Print mean average precision and mean reciprocal rank of a model.

    Args:
        dataset: frame whose rows expose ``answers`` (mapping with a "text"
            list; only its first entry is used), ``answer_sentence`` and ``id``.
        predictions_data: mapping from ``row.id`` to the ranked predictions
            of that question.

    Returns:
        None. Both means (sums over all rows divided by the row count) are
        printed.
    """
    n_questions = dataset.shape[0]
    ap_total = 0
    rr_total = 0
    for position in range(n_questions):
        row = dataset.iloc[position]
        answer = row.answers["text"][0]
        answer_sentence = row.answer_sentence
        predictions = predictions_data[row.id]
        ap_total += get_average_precision(answer, predictions, answer_sentence)
        rr_total += get_mrr_rank(answer, predictions, answer_sentence)
    mrr = rr_total/n_questions
    mAP = ap_total/n_questions
    print("mAP: ", mAP)
    print("mRR: ", mrr)

In [363]:
def run_evaluation_map_mrr(test_dataset, predictions_dir="model_predictions"):
    """Print MAP and MRR for every BERT-like model in the fixed model list.

    Args:
        test_dataset: dataset forwarded to ``evaluate_mrr_and_map``.
        predictions_dir: directory holding the ``<model>_predictions.json``
            files; defaults to the notebook's "model_predictions" folder.

    Returns:
        None. The scores are printed by ``evaluate_mrr_and_map``.
    """
    models = ["albert_czech", "bert_czech", "bert_multilingual", "bert_slavic", "electra_czech", "roberta_czech"]
    for m in models:
        # os.path.join instead of a literal "\\" so the path also resolves
        # on non-Windows systems
        predictions_path = os.path.join(predictions_dir, "{}_predictions.json".format(m))
        # NOTE(review): despite the .json extension the file is unpickled;
        # pickle.load can execute arbitrary code, so only load trusted files.
        with open(predictions_path, 'rb') as fp:
            predictions_data = pickle.load(fp)
        print(50*'-'+"\n"+predictions_path+"\n"+50*'-')
        print(m)
        # evaluate the dataset that was passed in, not the notebook-level global
        evaluate_mrr_and_map(test_dataset, predictions_data)

In [None]:
run_evaluation_map_mrr(dataset)