# Prediction insights

In [None]:
import json
import random
import re
import string

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', 0)
from statsmodels.stats.contingency_tables import mcnemar

## Overview

[1. Analyze errors of one experiment](#1)<br>
&nbsp; [1.1. Differences between True and Pred](#1-1)<br>
&nbsp; [1.2. Analyze mistakes](#1-2)<br>
&nbsp; [1.3. Best answer with probability](#1-3)<br>
&nbsp; [1.4. Search for faulty test cases](#1-4)<br>
[2. Compare errors of two experiments](#2)<br>
&nbsp; [2.1. Differences between True and Pred](#2-1)<br>
&nbsp; [2.2. Looking into ids of same and different errors](#2-2)<br>
&nbsp;&nbsp;&nbsp;&nbsp; [2.2.1. Deeper analysis: Randomly looking into errors](#2-2-1)<br>
&nbsp;&nbsp;&nbsp;&nbsp; [2.2.2. Deeper analysis: N-Best Prediction analysis](#2-2-2)<br>
&nbsp; [2.3. McNemar Test](#2-3)<br>

#### Helper functions

In [None]:
def calculate_p_value(diffs1, diffs2):
    """Return the exact McNemar p-value for the error sets of two experiments.

    Args:
        diffs1: DataFrame with an ``id`` column; one row per test case that
            the first experiment answered incorrectly.
        diffs2: DataFrame with an ``id`` column; one row per test case that
            the second experiment answered incorrectly.

    Returns:
        The ``pvalue`` attribute of the result returned by ``mcnemar``.
    """
    # Count the discordant cases with a vectorised membership test instead of
    # scanning the other frame's id array once per row.
    diff1_incorrect_diff2_correct = int((~diffs1["id"].isin(diffs2["id"])).sum())
    diff1_correct_diff2_incorrect = int((~diffs2["id"].isin(diffs1["id"])).sum())

    # Build the contingency table. Only the off-diagonal (discordant) cells are
    # filled; the diagonal is left at 0.
    # NOTE(review): presumably fine because the exact McNemar test only uses
    # the discordant counts, and `correction` only matters when exact=False --
    # confirm against the statsmodels documentation.
    contingency_table = [
        [0, diff1_correct_diff2_incorrect],
        [diff1_incorrect_diff2_correct, 0],
    ]

    return mcnemar(contingency_table, exact=True, correction=True).pvalue

def get_diffs_dfs(new_test):
    """Select the rows whose prediction differs from the true answer.

    Note that ``new_test`` is modified in place: its ``preds`` and
    ``true_answer`` columns are cast to ``str`` before comparing.

    Args:
        new_test: DataFrame with ``preds`` and ``true_answer`` columns.

    Returns:
        A tuple ``(diffs, normalized_diffs)``:
        ``diffs`` holds the rows of ``new_test`` where the raw strings differ;
        ``normalized_diffs`` holds the rows that still differ after both
        columns were passed through ``normalize_answer`` (the returned frame
        contains the normalized strings).
    """
    new_test["preds"] = new_test["preds"].astype(str)
    new_test["true_answer"] = new_test["true_answer"].astype(str)

    diffs = new_test[new_test["preds"] != new_test["true_answer"]]

    # Work on a copy so the normalized strings do not leak into `new_test`.
    # Series.map is applied per value; unlike a row-wise DataFrame.apply it
    # also works when the frame has no rows.
    normalized = new_test.copy()
    normalized["preds"] = normalized["preds"].map(normalize_answer)
    normalized["true_answer"] = normalized["true_answer"].map(normalize_answer)
    normalized_diffs = normalized[normalized["preds"] != normalized["true_answer"]]

    return diffs, normalized_diffs


def get_test_and_preds(path, test_path):
    """Load the test set and the predictions of one experiment.

    Args:
        path: Prefix of the prediction file. ``"test_predictions.json"`` is
            appended by plain string concatenation, so a directory must be
            given with its trailing slash.
        test_path: Path of the test set in JSON-lines format.

    Returns:
        A tuple ``(test, preds)``: the test set as a DataFrame and the parsed
        content of the prediction JSON file.
    """
    # Read as UTF-8 explicitly so non-ASCII answers do not depend on the
    # platform's default encoding.
    with open(path + "test_predictions.json", "r", encoding="utf-8") as f:
        preds = json.load(f)

    test = pd.read_json(path_or_buf=test_path, lines=True)
    return test, preds

def get_top1_answer_probability(nbest_preds):
    """Build a table from the first n-best entry of every example.

    Args:
        nbest_preds: Mapping from example id to a list of n-best entries
            (dicts). Only the first entry of each list is used.

    Returns:
        DataFrame with the columns ``id``, ``start_logit``, ``end_logit``,
        ``text`` and ``probability``. The names are assigned by position, so
        each entry presumably holds exactly these four fields in this order --
        verify against the prediction files.
    """
    first_entries = {}
    for example_id, candidates in nbest_preds.items():
        first_entries[example_id] = candidates[0]

    # The example ids become the index; move them into a regular column.
    top1 = pd.DataFrame.from_dict(first_entries, orient="index").reset_index()
    top1.columns = ["id", "start_logit", "end_logit", "text", "probability"]
    return top1

def join_test_preds(test, preds):
    """Add the test prediction to the test dataframe.

    Args:
        test: Test set DataFrame with an ``id`` column. After the join the
            columns are renamed by position, so ``test`` must have exactly
            seven columns that correspond to ``id``, ``orig_id``, ``title``,
            ``context``, ``fixed``, ``question`` and ``answers``.
        preds: Mapping from example id to the predicted answer string.

    Returns:
        A new DataFrame with the columns ``id``, ``orig_id``, ``context``,
        ``fixed``, ``question``, ``answers``, ``preds`` and ``true_answer``.
        ``true_answer`` is the first entry of ``answers["text"]``; an empty
        prediction is replaced by the string ``"EMPTY"``. Test rows without a
        prediction keep a missing value in ``preds`` (left join).
    """
    # Passing the column names to the constructor (instead of assigning them
    # afterwards) keeps this working when `preds` is empty.
    pred_df = pd.DataFrame(list(preds.items()), columns=["id", "preds"])

    new_test = test.set_index("id").join(pred_df.set_index("id"))
    new_test.reset_index(level=0, inplace=True)
    new_test.columns = ["id", "orig_id", "title", "context", "fixed", "question", "answers", "preds"]

    # Per-value Series.map instead of a row-wise DataFrame.apply: same result,
    # and it also works on a frame without rows.
    new_test["true_answer"] = new_test["answers"].map(lambda answers: dict(answers)["text"][0])
    del new_test["title"]

    new_test["preds"] = new_test["preds"].map(lambda pred: "EMPTY" if pred == "" else pred)

    return new_test

def normalize_answer(s):
    """Collapse every run of whitespace in ``s`` into a single space.

    Leading and trailing whitespace is removed as well. No lowercasing and no
    punctuation or article removal is performed.
    """
    tokens = s.split()
    return " ".join(tokens)

## 1. Analyze errors of one experiment <a id="1"></a>

In [None]:
# Load test set and preds
# PATH is used as a plain string prefix for the prediction file names, so it
# must end with a slash.
PATH = "../results_crawl_na_big_rework_thin/crawl-thin-na/xlm-roberta-base/"
# Test set in JSON-lines format (plain string; no f-string needed).
TEST_PATH = "crawl-thin-na/crawl-thin-na-test.jsonl"

In [None]:
%%time
# Load the test set DataFrame and the predictions of the experiment under PATH.
test, preds = get_test_and_preds(PATH, TEST_PATH)

### 1.1. Differences between True and Pred <a id="1-1"></a>

In [None]:
# Attach each prediction to its test row and derive the `true_answer` column.
new_test = join_test_preds(test, preds)
# Peek at the first row to check the joined columns.
new_test.head(1)

In [None]:
# `diffs`: rows whose raw prediction differs from the true answer;
# `normalized_diffs`: rows that still differ after `normalize_answer`.
# Note: this call also casts `new_test.preds` / `new_test.true_answer` to str in place.
diffs, normalized_diffs = get_diffs_dfs(new_test)

In [None]:
# Report the number of wrong predictions before and after normalizing the answers.
print(f"There are {len(diffs)} differences between the true and pred answers (no normalizing)!")
print(f"There are {len(normalized_diffs)} differences between the true and pred answers (WITH normalizing)!")

### 1.2. Analyze mistakes <a id="1-2"></a>

Two DataFrames:
1. one with all mistakes (`diffs`)
2. one restricted to the mistakes where neither TRUE is a substring of PRED nor PRED is a substring of TRUE (`diffs_no_sub`). So a pair like the following is NOT included:
    - PRED: `industrial avenue 22-24` 
    - TRUE: `industrial avenue 22-24 ilupeju`

In [None]:
# Keep only the rows where, within the same row, neither answer contains the
# other one. (`Series.isin` would test membership in the whole other column,
# not a per-row substring relation.) Identical answers contain each other and
# are therefore excluded as well.
no_substring_match = pd.Series(
    [
        str(pred) not in str(true) and str(true) not in str(pred)
        for pred, true in zip(new_test["preds"], new_test["true_answer"])
    ],
    index=new_test.index,
    dtype=bool,
)
diffs_no_sub = new_test[no_substring_match]

print("Count of wrong predictions:", diffs.shape[0])
print("Count of wrong predictions (no substring match):", diffs_no_sub.shape[0])

In [None]:
# Use this cell for a detailed analysis.
# Enter the 1-based position of a row in `diffs_no_sub`, or leave the input
# empty / type "r" to pick a random row.

n = input()
if n == "" or n == "r":
    # randint includes both bounds. Start at 1 because the slice below is
    # 1-based: n = 0 would select nothing and make `.iloc[0]` fail.
    n = random.randint(1, len(diffs_no_sub))
    print("n:", n)
current_error = diffs_no_sub.iloc[int(n)-1:int(n)]
# Keep the context around for the next cell; it is too long to show in the table.
context = current_error.iloc[0].context
print(f"Length of the context: {len(context)}")
print(current_error.iloc[0]["id"])
current_error.loc[:, current_error.columns != 'context']

In [None]:
# Maximum number of context characters to display. None shows the whole
# context (a value of -1 would silently cut off the last character).
max_char = None
context[:max_char]

### 1.3. Best answer with probability <a id="1-3"></a>

In [None]:
# Load the n-best predictions of the experiment. Read as UTF-8 explicitly so
# non-ASCII answer texts do not depend on the platform's default encoding.
with open(PATH+"test_nbest_predictions.json", "r", encoding="utf-8") as f:
    nbest_preds = json.load(f)

In [None]:
# Show the n-best candidates of one example, most probable first.
target_id = "69243-0"

# Collect the n-best lists of every id that starts with `target_id`.
answers = []
for k, v in nbest_preds.items():
    if k.startswith(target_id):
        answers.append(v)

# Only the first matching id is inspected.
first_match = pd.DataFrame(answers[0])
nbest_preds_df = first_match.sort_values(by="probability", ascending=False)
nbest_preds_df.head()

Use the following DataFrame to investigate the model's probabilities for the top-1 answer (= the returned predicted answer).

In [None]:
# One row per example with the first n-best entry (the returned answer).
top1_probs = get_top1_answer_probability(nbest_preds)

# Set to False to keep the examples whose top-1 answer text is empty.
no_empty_answers = True

if no_empty_answers:
    top1_probs = top1_probs[top1_probs.text != ""]

# Show the ten top-1 answers with the highest probability.
top1_probs.sort_values(by="probability", ascending=False).head(10)

### 1.4. Search for faulty test cases <a id="1-4"></a>

In [None]:
# Bar chart of the 30 `orig_id` values that occur most often among the wrong predictions.
diffs.orig_id.value_counts()[:30].plot(kind="bar")

In [None]:
# Pick one `orig_id` (e.g. from the bar chart above) and list all of its wrong predictions.
orig_id = 69362

diffs[diffs["orig_id"] == orig_id]

## 2. Compare errors of two experiments <a id="2"></a>

Change `PATH1`, `PATH2` and `TEST_PATH1` to fit your needs.

In [None]:
# Prediction directories of the two experiments to compare. Both are used as
# plain string prefixes for the prediction file names, so they must end with
# a slash.
PATH1 = "../results-archive/results_crawl_na_big_rework/crawl-na/xlm-roberta-base/"
PATH2 = "../results-archive/results_exp_crawl_na_ft_da_10epochs/crawl-mlm/"

# Test set shared by both experiments (plain string; no f-string needed).
TEST_PATH1 = "crawl-na/crawl-na-test.jsonl"

In [None]:
%%time

# Both experiments are loaded against the same test set (TEST_PATH1).
test1, preds1 = get_test_and_preds(PATH1, TEST_PATH1)
test2, preds2 = get_test_and_preds(PATH2, TEST_PATH1)

### 2.1. Differences between True and Pred <a id="2-1"></a>

Normalizing = fixing white space errors. So `diffs` and `normalized_diffs` should ideally be the same!

In [None]:
# Attach the predictions of each experiment to its copy of the test set.
new_test1 = join_test_preds(test1, preds1)
new_test2 = join_test_preds(test2, preds2)

In [None]:
def _without_substring_matches(df):
    """Return the rows of ``df`` where neither answer contains the other one."""
    # Compare within each row. (`Series.isin` would test membership in the
    # whole other column, not a per-row substring relation.)
    keep = pd.Series(
        [
            str(pred) not in str(true) and str(true) not in str(pred)
            for pred, true in zip(df["preds"], df["true_answer"])
        ],
        index=df.index,
        dtype=bool,
    )
    return df[keep]


diffs1, normalized_diffs1 = get_diffs_dfs(new_test1)
diffs_no_sub1 = _without_substring_matches(new_test1)

print("(1) Count of wrong predictions:", diffs1.shape[0])
print("(1) Count of wrong predictions (with normalizing):", normalized_diffs1.shape[0])
print("(1) Count of wrong predictions (no substring match):", diffs_no_sub1.shape[0])

print()

diffs2, normalized_diffs2 = get_diffs_dfs(new_test2)
diffs_no_sub2 = _without_substring_matches(new_test2)

print("(2) Count of wrong predictions:", diffs2.shape[0])
print("(2) Count of wrong predictions (with normalizing):", normalized_diffs2.shape[0])
print("(2) Count of wrong predictions (no substring match):", diffs_no_sub2.shape[0])

### 2.2. Looking into ids of same and different errors <a id="2-2"></a>

Based on normalized diffs (substring match is possible).

In [None]:
# Split the error ids into those shared by both experiments and those that
# only one of them got wrong. Sets give constant-time membership checks; the
# lists keep the row order of the respective DataFrame.
error_ids1 = normalized_diffs1["id"].tolist()
error_ids2 = normalized_diffs2["id"].tolist()
error_id_set1 = set(error_ids1)
error_id_set2 = set(error_ids2)

same_ids = [rid for rid in error_ids1 if rid in error_id_set2]
unique_ids_test1 = [rid for rid in error_ids1 if rid not in error_id_set2]
unique_ids_test2 = [rid for rid in error_ids2 if rid not in error_id_set1]

In [None]:
# Share of the errors that both experiments have in common, relative to the
# error count of each experiment. The printed value is a fraction rounded to
# three decimals, not multiplied by 100.
print(f"Percentage of the same errors on the basis of Test-1: {np.round(len(same_ids)/len(normalized_diffs1), decimals=3)}")
print(f"Percentage of the same errors on the basis of Test-2: {np.round(len(same_ids)/len(normalized_diffs2), decimals=3)}")
#print(f"Percentage of the different errors: {np.round(1 - (len(same_ids)/len(normalized_diffs1)), decimals=3)}")

In [None]:
# Rows of each experiment's normalized errors that the other experiment did not get wrong.
unique_errors_diffs1 = normalized_diffs1[normalized_diffs1["id"].isin(unique_ids_test1)]
unique_errors_diffs2 = normalized_diffs2[normalized_diffs2["id"].isin(unique_ids_test2)]

#print("")
print(f"There are {len(unique_errors_diffs1)} unique errors for Test-1 when compared with Test-2.")
print(f"There are {len(unique_errors_diffs2)} unique errors for Test-2 when compared with Test-1.")

In [None]:
# Drop the unique errors whose true answer contains the rune character "ᛉ".
# Ideally nothing is removed, i.e. the counts equal those of
# `unique_errors_diffs1` / `unique_errors_diffs2`.
# NOTE(review): presumably the rune marks a special kind of answer in the
# dataset -- confirm its meaning against the data preparation code.
unique_errors_diffs1_rune = unique_errors_diffs1[~unique_errors_diffs1.true_answer.str.contains("ᛉ")]
print(f"There are overall {len(unique_errors_diffs1_rune)} different errors when comparing the two test dataframes (filtering runes).")

# Same filter for the errors that are unique to the second experiment.
unique_errors_diffs2_rune = unique_errors_diffs2[~unique_errors_diffs2.true_answer.str.contains("ᛉ")]
print(f"There are overall {len(unique_errors_diffs2_rune)} different errors when comparing the two test dataframes (filtering runes).")

#### 2.2.1. Deeper analysis: Randomly looking into errors  <a id="2-2-1"></a>

In [None]:
# for test1
# Enter the 1-based position of a row in `unique_errors_diffs1`, or leave the
# input empty / type "r" to pick a random row.

n = input()
if n == "" or n == "r":
    # randint includes both bounds. Start at 1 because the slice below is
    # 1-based: n = 0 would select nothing and make `.iloc[0]` fail.
    n = random.randint(1, len(unique_errors_diffs1))
    print("n:", n)
current_error = unique_errors_diffs1.iloc[int(n)-1:int(n)]
print(current_error.iloc[0]["id"])
current_error

In [None]:
# for test2
# Enter the 1-based position of a row in `unique_errors_diffs2`, or leave the
# input empty / type "r" to pick a random row.

n = input()
if n == "" or n == "r":
    # Draw from the size of the frame that is sliced below
    # (`unique_errors_diffs2`), and start at 1 because the slice is 1-based:
    # n = 0 would select nothing and make `.iloc[0]` fail.
    n = random.randint(1, len(unique_errors_diffs2))
    print("n:", n)
current_error2 = unique_errors_diffs2.iloc[int(n)-1:int(n)]
print(current_error2.iloc[0]["id"])
current_error2

#### 2.2.2. Deeper analysis: N-Best Prediction analysis  <a id="2-2-2"></a>

In [None]:
%%time

# Load the n-best predictions of the first experiment.
with open(PATH1+"test_nbest_predictions.json", "r") as f:
    nbest_preds1 = json.load(f)
    
# Load the n-best predictions of the second experiment.
with open(PATH2+"test_nbest_predictions.json", "r") as f:
    nbest_preds2 = json.load(f)

In [None]:
# Print the first seven n-best candidates of one example for both experiments.
given_id = "69669-0"
top_k = 7

experiments = [("Test-1", nbest_preds1), ("Test-2", nbest_preds2)]
for position, (label, nbest) in enumerate(experiments):
    if position > 0:
        # Blank line between the two experiments.
        print()
    print(label)
    print("------------------------")
    for nb in nbest[given_id][:top_k]:
        print(nb)

### 2.3. McNemar Test <a id="2-3"></a>

In [None]:
# McNemar test on the normalized error sets of the two experiments.
p_value = calculate_p_value(normalized_diffs1, normalized_diffs2)
print("p-value:", p_value)
alpha = 0.05

# Decide on the exact p-value. Rounding first could move a value just below
# alpha (e.g. 0.04996 -> 0.05) to the other side of the threshold.
is_significant = p_value < alpha

# Rounded for display only.
p_value = np.round(p_value, decimals=4)

if is_significant:
    print(f"Significant differences between errors. The p value {p_value} is smaller than Threshold α = {alpha}.")
else:
    print(f"No significant differences between errors. The p value {p_value} is higher than Threshold α = {alpha}.")