In [1]:
import os
import json
import math
import numpy as np
import scipy.stats as stats

# Shorthand for positive infinity.
inf = float("inf")

# Language codes for which an adapter exists (used to split the scores into
# an "adapter" and a "no adapter" subset).
existing_adapters = (
    "th my hi ilo ht tr mi vi is it ta jv ja sw qu de "
    "el et ru gn id en ar es tk zh mhr cdo xmf eu sr"
).split()

# scores[task][language][setting] -> score; filled in by the loading loops.
scores = {task: {} for task in ("ner", "pos", "copa", "qa")}

# Name of the metric that is read from each result entry, per task.
f1 = dict(ner="eval_f1", copa="eval_acc", pos="eval_f1_macro", qa="f1")

# Load the approximation results. File names are expected to look like
# "<lang>_<task>_<option>.json" or "<lang>_<task>_<option>_<num>.json".
approximation_dir = "../eval_scores/approximation"
for file in os.listdir(approximation_dir):
    if not file.endswith(".json"):
        continue
    try:
        with open(os.path.join(approximation_dir, file), "r") as f:
            data = json.load(f)
            # everything before the first "." holds the underscore-separated fields
            els = file.partition(".")[0].split("_")
            if len(els) not in (3, 4):
                raise ValueError
            lang_name, task_name, option = els[:3]

            lang_scores = scores[task_name].setdefault(lang_name, {})

            # "extended" is the base case and keeps the keys as they are;
            # every other option is appended to the key as a suffix.
            suffix = "" if option == "extended" else f"_{option}"
            metric = f1[task_name]
            for key, entry in data.items():
                new_key = key + suffix
                if "convergence" in new_key:
                    # convergence scores are not taken from this directory
                    continue
                lang_scores[new_key] = entry[metric]

    except json.JSONDecodeError:
        print(f"Error decoding JSON for file: {file}")
    except KeyError:
        print("KeyError:", file)

# Adding finetune baseline: files are named "<lang>_<task>_<option>.json"
finetune_dir = "../eval_scores/ft"
for file in os.listdir(finetune_dir):
    if not file.endswith(".json"):
        continue
    try:
        with open(os.path.join(finetune_dir, file), "r") as f:
            data = json.load(f)
            els = file.partition(".")[0].split("_")
            if len(els) != 3:
                raise ValueError
            lang_name, task_name, option = els
            # "eval" is stored as plain "finetune"; any other option becomes a suffix
            option = "finetune" if option == "eval" else "finetune_" + option
            lang_scores = scores[task_name].setdefault(lang_name, {})
            lang_scores[option] = data["finetuned_model"][f1[task_name]]

    except json.JSONDecodeError:
        print(f"Error decoding JSON for file: {file}")
    except KeyError:
        print("KeyError:", file)

# adding convergence scores
# NOTE: this runs BEFORE the adapter / no-adapter split below. The split only
# iterates over the languages that are present in `scores` at that moment, so
# a language that first appears in the convergence files would otherwise be
# missing from both `scores_subset` and `no_adapter`.
for file in os.listdir("../eval_scores/convergence"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/convergence", file), "r") as f:
                data = json.load(f)
                # "<lang>_<task>_<option>[_<num>].json"
                file_name = file.split(".")[0]
                els = file_name.split("_")

                if len(els) == 3:
                    lang_name, task_name, option = els
                elif len(els) == 4:
                    lang_name, task_name, option, num = els
                else:
                    raise ValueError

                if lang_name not in scores[task_name]:
                    scores[task_name][lang_name] = {}

                # we add the data to the scores dictionary:
                # if option == extended, we keep the key as it is
                # otherwise, we append the "option" to the key
                # in both cases the key gets a "_convergence" suffix

                for key in data.keys():
                    if not option == "extended":  # correctly format baseline names, "extended" == base case
                        new_key = f"{key}_{option}"
                    else:
                        new_key = key
                    new_key = f"{new_key}_convergence"
                    scores[task_name][lang_name][new_key] = data[key][f1[task_name]]

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

# we make a subset consisting of only the languages for which an adapter exists
# (the per-language dicts are shared with `scores`, not copied)
scores_subset = {}
no_adapter = {}
for task_name in scores:
    scores_subset[task_name] = {}
    no_adapter[task_name] = {}
    for lang_name in scores[task_name]:
        if lang_name in existing_adapters:
            scores_subset[task_name][lang_name] = scores[task_name][lang_name]
        else:
            no_adapter[task_name][lang_name] = scores[task_name][lang_name]


def get_significance(task_name, option1, option2, alternative="two-sided", data=scores):
    """Run a paired t-test between two score columns and print a short report.

    A language contributes one pair of scores only if it has a value for both
    `option1` and `option2` in the given task; all other languages are skipped.

    Args:
        task_name: "all" to pool every task in `data`, a single task name, or
            an iterable of task names.
        option1: key of the first score column.
        option2: key of the second score column.
        alternative: forwarded to `scipy.stats.ttest_rel`
            ("two-sided", "less" or "greater").
        data: mapping task -> language -> {option: score}. Defaults to the
            `scores` dictionary as bound when this function was defined.

    Returns:
        Tuple `(t_stat, p_val)`. Both are NaN when fewer than two languages
        have both options, because a paired t-test is undefined in that case.
    """
    if task_name == "all":
        task_names = data.keys()
    elif isinstance(task_name, str):
        task_names = [task_name]
    else:
        task_names = task_name

    all_scores1 = []
    all_scores2 = []
    for current_task in task_names:
        for lang_scores in data[current_task].values():
            if option1 not in lang_scores or option2 not in lang_scores:
                continue
            score1 = lang_scores[option1]
            score2 = lang_scores[option2]
            if current_task == "qa":
                # we divide the scores of qa by 100
                # (presumably they are on a 0-100 scale, unlike the other tasks)
                score1 = score1 / 100
                score2 = score2 / 100
            all_scores1.append(score1)
            all_scores2.append(score2)

    if len(all_scores1) < 2:
        # Avoid numpy/scipy warnings on empty or single-element input; the
        # NaN result makes any `p_val < threshold` check evaluate to False.
        print(f"not enough paired scores for {option1} vs {option2} (n={len(all_scores1)}): skipping t-test")
        return float("nan"), float("nan")

    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val

In [2]:
# Per task, average every score column over the languages that report it.
averages = {}
for task_name, per_language in scores.items():
    totals = {}
    counts = {}
    for lang_scores in per_language.values():
        for key, value in lang_scores.items():
            if key in totals:
                totals[key] += value
                counts[key] += 1
            else:
                totals[key] = value
                counts[key] = 1
    averages[task_name] = {key: totals[key] / counts[key] for key in totals}

In [18]:
# For every column that also has a "<column>_convergence" counterpart, compare
# the two averages. `better` collects the columns whose convergence average is
# at least as high, `worse` those whose convergence average is lower; each
# entry is (column, convergence average - plain average).
better, worse = {}, {}
for task, task_averages in averages.items():
    gains, losses = [], []
    for key, key_score in task_averages.items():
        if key.endswith("convergence"):
            continue
        conv = key + "_convergence"
        if conv not in task_averages:
            continue
        conv_score = task_averages[conv]
        bucket = losses if key_score > conv_score else gains
        bucket.append((key, conv_score - key_score))
    better[task] = gains
    worse[task] = losses
print("outperform:", better)
print("worse performance:", worse)

outperform: {'ner': [('reconstructed_featural_base', 0.011284744079776532), ('reconstructed_syntactic_distVar', 0.003084054049485918), ('reconstructed_morphological_distVar', 0.004253261276940423), ('reconstructed_featural_eu', 0.009887806924539833), ('reconstructed_featural', 0.009779546031709385), ('baseline_en', 0.0146985244119342), ('baseline_avg_adapter', 0.011671328275132009), ('baseline_closest_featural', 0.00380655761073162), ('no_train_gain', 0.009286212378280423), ('reconstructed_featural_limit', 0.011702584902354873), ('reconstructed_syntactic_limit', 0.00978524531481062), ('reconstructed_morphological_limit', 0.007568609404203452), ('reconstructed_featural_sr', 0.009960298062257955), ('reconstructed_featural_threshold', 0.01401787579197411), ('reconstructed_syntactic_threshold', 0.015380781322165915), ('reconstructed_morphological_threshold', 0.013128297374519149)], 'pos': [('reconstructed_featural_eu', 0.0012114120443494913), ('baseline_en', 0.005498314720510156), ('baseli

In [25]:
# `better` lists the columns whose "_convergence" average is at least as high
# as the plain average, `worse` those whose "_convergence" average is lower.
# For the former we test whether convergence is significantly better
# (plain < convergence, alternative="less"), for the latter whether it is
# significantly worse (plain > convergence, alternative="greater").
tasks = ["ner", "pos", "copa", "qa"]
for task in tasks:
    comparisons = (
        (better[task], "outperforming", "less"),
        (worse[task], "UNDERPERFORMING", "greater"),
    )
    for entries, verdict, alternative in comparisons:
        for key, diff in entries:
            # the headline names the convergence column first, since it is the
            # one that out-/underperforms the plain column here
            print(f"{task} {key}_convergence {verdict} {key}")
            t_stat, p_val = get_significance(task, key, key + "_convergence", alternative=alternative)
            if p_val < 0.05:
                print("The difference IS statistically significant")
            else:
                print("The difference is not statistically significant")
            print("\n")
    print("\n~\n")

ner reconstructed_featural_base outperforming reconstructed_featural_base_convergence
average scores
reconstructed_featural_base: 0.4928826576088594
reconstructed_featural_base_convergence: 0.5041674016886359
t-statistic: -3.6854645006532025
p-value: 0.0001647721730451718
The difference IS statistically significant


ner reconstructed_syntactic_distVar outperforming reconstructed_syntactic_distVar_convergence
average scores
reconstructed_syntactic_distVar: 0.49418135784565886
reconstructed_syntactic_distVar_convergence: 0.4972654118951448
t-statistic: -0.852656751273577
p-value: 0.19809547394429444
The difference is not statistically significant


ner reconstructed_morphological_distVar outperforming reconstructed_morphological_distVar_convergence
average scores
reconstructed_morphological_distVar: 0.493589017046471
reconstructed_morphological_distVar_convergence: 0.49784227832341155
t-statistic: -1.1599823955718915
p-value: 0.12461471394682487
The difference is not statistically signi

In [3]:
# check if "convergence" outperforms relative non-convergence scores
# For every task and column we run two one-sided paired t-tests:
#   - "less":    plain < convergence  -> recorded in `significant_more`
#   - "greater": plain > convergence  -> recorded in `significant_less`
# Both dictionaries map column -> task -> (plain average, convergence average).
tasks = ["ner", "pos", "copa", "qa"]
test_columns = [
    "reconstructed_featural",
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_task_adapter",
    "baseline_closest_featural",
    "no_train_gain",
    "finetune",
]
significant_more = {}
significant_less = {}
for task in tasks:
    for test_column in test_columns:
        conv_column = test_column + "_convergence"
        directions = (
            ("less", f"{task}: {conv_column} outperforming {test_column}", significant_more, "\n"),
            (
                "greater",
                f"{task}: {test_column} outperforming {conv_column}",
                significant_less,
                "-----------------------------------",
            ),
        )
        for alternative, headline, significant, separator in directions:
            print(headline)
            t_stat, p_val = get_significance(task, test_column, conv_column, alternative=alternative)
            if p_val < 0.05:
                print("The difference is statistically significant")
                if test_column not in significant:
                    significant[test_column] = {}
                significant[test_column][task] = (
                    averages[task][test_column],
                    averages[task][conv_column],
                )
            else:
                print("The difference is not statistically significant")
            print(separator)

ner outperforming reconstructed_featural
average scores
reconstructed_featural: 0.4918581901585807
reconstructed_featural_convergence: 0.5005181511125516
t-statistic: -2.694002897180043
p-value: 0.003992245237513158
The difference is statistically significant


reconstructed_featural outperforming ner
average scores
reconstructed_featural: 0.4918581901585807
reconstructed_featural_convergence: 0.5005181511125516
t-statistic: -2.694002897180043
p-value: 0.9960077547624868
The difference is not statistically significant
-----------------------------------
ner outperforming baseline_en
average scores
baseline_en: 0.4468458284277574
baseline_en_convergence: 0.4594860792847517
t-statistic: -3.5758319885964442
p-value: 0.0002448773786074038
The difference is statistically significant


baseline_en outperforming ner
average scores
baseline_en: 0.4468458284277574
baseline_en_convergence: 0.4594860792847517
t-statistic: -3.5758319885964442
p-value: 0.9997551226213927
The difference is not stati

# Results
NER
	• Approximation & Baselines (ADAPTER)
		○ Better in all cases, Statistically significant
			§ all over the line (1 exception out of 16)
	• Finetune
		○ Worse, statistically significant but small difference
POS
	• Approximation
		○ Convergence is not significantly worse; differences are very small
	• Baselines
		○ Better
			§ Significant
				□ En adapter, MAD-X (closest featural) setting
			§ Not significant: no-train-gain
	• Finetune
		○ Better, statistically significant
COPA
	• Significantly worse all over the line!
	• --> Reason: very small dataset, 
		○ if an eval set is needed (for the convergence stopping criterion), performance decreases further
QA
	• Significantly better all over the line!
	• Training cost

## Conclusion: fair comparison
NER: take convergence scores to maintain comparability in the setting (only small difference for finetune)
POS: convergence for both adapter and finetune, our method underperforms but not significantly
COPA: Take non-convergence all over the line
QA: convergence all over the line

In [32]:
# We make a new directory under eval_scores with the results that we will use for further eval
selected_dir = "../eval_scores/selected"
os.makedirs(selected_dir, exist_ok=True)  # no separate exists() check needed

# Per task: keep only the "_convergence" columns, or only the plain columns.
task2selection = {"ner": "convergence", "pos": "convergence", "copa": "no_convergence", "qa": "convergence"}
convergence_suffix = "_convergence"
scores_selected = {}
for task_name, selection in task2selection.items():
    use_convergence = selection == "convergence"
    task_selected = {}
    for lang_name, lang_scores in scores[task_name].items():
        picked = {}
        for key, value in lang_scores.items():
            is_convergence_key = key.endswith("convergence")
            if use_convergence:
                if not is_convergence_key:
                    continue
                # Strip only the trailing suffix, so that a key which happens
                # to contain "_convergence" earlier on is not cut short.
                if key.endswith(convergence_suffix):
                    new_key = key[: -len(convergence_suffix)]
                else:
                    new_key = key
                if task_name == "qa":
                    # we divide the scores of qa by 100
                    picked[new_key] = value / 100
                else:
                    picked[new_key] = value
            elif not is_convergence_key:  # no convergence: copa
                picked[key] = value
        task_selected[lang_name] = picked
    scores_selected[task_name] = task_selected
    # one JSON file per task: language -> {column: score}
    with open(f"{selected_dir}/{task_name}.json", "w") as f:
        json.dump(task_selected, f, indent=4)