In [1]:
import os
import json
import math
import numpy as np
import scipy.stats as stats

# from huggingface_hub import HfApi
# api = HfApi()
# def get_available_adapters():
#     # get all adapters from huggingface
#     all_models = api.list_models(author="AdapterHub",
#     library="adapter-transformers",
#     search="xlm-roberta-base-"
#     )
#
#     to_load = {
#         m.modelId: m.modelId.split("xlm-roberta-base-")[-1].rsplit("-wiki_pfeiffer", 1)[0]
#         for m in all_models
#         if m.modelId.startswith("AdapterHub/xlm-roberta-base-")
#            and m.modelId.endswith("-wiki_pfeiffer")
#     }
#     return to_load
# to_load = get_available_adapters()
# existing_adapters = list(to_load.values()) + ["eu", "sr"]
existing_adapters = [
    "th",
    "my",
    "hi",
    "ilo",
    "ht",
    "tr",
    "mi",
    "vi",
    "is",
    "it",
    "ta",
    "jv",
    "ja",
    "sw",
    "qu",
    "de",
    "el",
    "et",
    "ru",
    "gn",
    "id",
    "en",
    "ar",
    "es",
    "tk",
    "zh",
    "mhr",
    "cdo",
    "xmf",
    "eu",
    "sr",
]
scores = {"ner": {}, "pos": {}, "copa": {}, "qa": {}}
f1 = {"ner": "eval_f1", "copa": "eval_acc", "pos": "eval_f1_macro", "qa": "f1", "sib": "eval_accuracy"}
inf = math.inf
for file in os.listdir("../eval_scores/approximation"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/approximation", file), "r") as f:
                data = json.load(f)
                file_name = file.split(".")[0]
                els = file_name.split("_")
                # print(els)

                if len(els) == 3:
                    lang_name, task_name, option = els
                elif len(els) == 4:
                    lang_name, task_name, option, num = els

                if task_name == "sib":
                    continue
                if lang_name not in scores[task_name]:
                    scores[task_name][lang_name] = {}

                # we add the data to the scores dictionary:
                # if option == extended, we add everything
                # otherwise, we append the "option" to the keys and add those

                for key in data.keys():
                    if not option == "extended":
                        new_key = f"{key}_{option}"
                    else:
                        new_key = key
                    if "convergence" in new_key:
                        # we skip the convergence scores
                        # print(new_key, "not taking convergence scores")
                        continue
                    # print(key, new_key)
                    scores[task_name][lang_name][new_key] = data[key][f1[task_name]]

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

# Adding finetune baseline
for file in os.listdir("../eval_scores/ft"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/ft", file), "r") as f:
                data = json.load(f)
                file_name = file.split(".")[0]
                els = file_name.split("_")
                if len(els) == 3:
                    lang_name, task_name, option = els

                if task_name == "sib":
                    continue
                if option == "eval":
                    option = "finetune"
                else:
                    option = "finetune_" + option
                if lang_name not in scores[task_name]:
                    scores[task_name][lang_name] = {}
                score = data["finetuned_model"][f1[task_name]]
                scores[task_name][lang_name][option] = score

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

# we make a subset consisting of only the languages for which an adapter exists
scores_subset = {}
no_adapter = {}
for task_name in scores:
    scores_subset[task_name] = {}
    no_adapter[task_name] = {}
    for lang_name in scores[task_name]:
        if lang_name in existing_adapters:
            scores_subset[task_name][lang_name] = scores[task_name][lang_name]
        else:
            no_adapter[task_name][lang_name] = scores[task_name][lang_name]
# adding convergence scores
for file in os.listdir("../eval_scores/convergence"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/convergence", file), "r") as f:
                data = json.load(f)
                file_name = file.split(".")[0]
                els = file_name.split("_")
                # print(els)

                if len(els) == 3:
                    lang_name, task_name, option = els
                elif len(els) == 4:
                    lang_name, task_name, option, num = els

                if lang_name not in scores[task_name]:
                    scores[task_name][lang_name] = {}

                # we add the data to the scores dictionary:
                # if option == extended, we add everything
                # otherwise, we append the "option" to the keys and add those

                for key in data.keys():
                    if not option == "extended":  # correctly format baseline names, "extended" == base case
                        new_key = f"{key}_{option}"
                    else:
                        new_key = key
                    # print(key, new_key)
                    new_key = f"{new_key}_convergence"
                    scores[task_name][lang_name][new_key] = data[key][f1[task_name]]

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)
scores["ner"]


def get_significance(task_name, option1, option2, alternative="two-sided", data=scores):
    all_scores1 = []
    all_scores2 = []
    if task_name == "all":
        task_names = data.keys()
    elif type(task_name) is str:
        task_names = [task_name]
    else:
        task_names = task_name
    for task_name in task_names:
        for lang_name in data[task_name]:
            if option1 in data[task_name][lang_name] and option2 in data[task_name][lang_name]:
                score1 = data[task_name][lang_name][option1]
                score2 = data[task_name][lang_name][option2]
                if task_name == "qa":
                    # we divide the scores of qa by 100
                    score1 = score1 / 100
                    score2 = score2 / 100
                all_scores1.append(score1)
                all_scores2.append(score2)
    print("average scores")
    print(f"{option1}: {np.mean(all_scores1)}")
    print(f"{option2}: {np.mean(all_scores2)}")
    t_stat, p_val = stats.ttest_rel(all_scores1, all_scores2, alternative=alternative)
    print(f"t-statistic: {t_stat}")
    print(f"p-value: {p_val}")
    return t_stat, p_val

KeyError: nl_qa_test_squad_nl.json
KeyError: nl_qa_test_squad_nl_bis.json
KeyError: nl_qa_test_squad_nl_bis_bis.json
KeyError: ace_sib_Arab_eval.json
KeyError: ace_sib_Latn_eval.json
KeyError: acm_sib_Arab_eval.json
KeyError: acq_sib_Arab_eval.json
KeyError: aeb_sib_Arab_eval.json
KeyError: af_sib_Latn_eval.json
KeyError: ajp_sib_Arab_eval.json
KeyError: ak_sib_Latn_eval.json
KeyError: als_sib_Latn_eval.json
KeyError: am_sib_Ethi_eval.json
KeyError: apc_sib_Arab_eval.json
KeyError: arb_sib_Arab_eval.json
KeyError: arb_sib_Latn_eval.json
KeyError: ars_sib_Arab_eval.json
KeyError: ary_sib_Arab_eval.json
KeyError: arz_sib_Arab_eval.json
KeyError: ast_sib_Latn_eval.json
KeyError: as_sib_Beng_eval.json
KeyError: awa_sib_Deva_eval.json
KeyError: ayr_sib_Latn_eval.json
KeyError: azb_sib_Arab_eval.json
KeyError: azj_sib_Latn_eval.json
KeyError: ban_sib_Latn_eval.json
KeyError: ba_sib_Cyrl_eval.json
KeyError: bem_sib_Latn_eval.json
KeyError: be_sib_Cyrl_eval.json
KeyError: bg_sib_Cyrl_eval.json

In [2]:
# we calculate averages of all columns
averages = {}
for task_name in scores:
    averages[task_name] = {}
    for lang_name in scores[task_name]:
        for key in scores[task_name][lang_name]:
            if key not in averages[task_name]:
                averages[task_name][key] = [scores[task_name][lang_name][key], 1]
            else:
                averages[task_name][key][0] += scores[task_name][lang_name][key]
                averages[task_name][key][1] += 1
    averages[task_name] = {key: value[0] / value[1] for key, value in averages[task_name].items()}

In [3]:
# we compare the averages of each key <> key_converged
better = {}
worse = {}
for task in averages:
    better[task] = []
    worse[task] = []
    keys = averages[task].keys()
    for key in averages[task]:
        conv = key + "_convergence"
        if not key.endswith("convergence") and conv in keys:
            key_score = averages[task][key]
            conv_score = averages[task][conv]
            diff = conv_score - key_score
            if key_score > conv_score:
                worse[task].append((key, diff))
            else:
                better[task].append((key, diff))
print("outperform:", better)
print("NO outperform:", worse)

outperform: {'ner': [('reconstructed_featural_base', 0.0), ('reconstructed_syntactic_distVar', 0.0), ('reconstructed_morphological_distVar', 0.0), ('reconstructed_featural_eu', 0.0), ('reconstructed_featural', 0.0), ('baseline_en', 0.0), ('baseline_task_adapter', 0.0), ('baseline_avg_adapter', 0.0), ('baseline_closest_featural', 0.0), ('no_train_gain', 0.0), ('reconstructed_featural_limit', 0.0), ('reconstructed_syntactic_limit', 0.0), ('reconstructed_morphological_limit', 0.0), ('reconstructed_featural_sr', 0.0), ('reconstructed_featural_threshold', 0.0), ('reconstructed_syntactic_threshold', 0.0), ('reconstructed_morphological_threshold', 0.0)], 'pos': [('reconstructed_featural_base', 0.0), ('reconstructed_featural_eu', 0.0), ('reconstructed_featural', 0.0), ('baseline_en', 0.0), ('baseline_task_adapter', 0.0), ('baseline_avg_adapter', 0.0), ('baseline_closest_featural', 0.0), ('no_train_gain', 0.0), ('reconstructed_featural_sr', 0.0), ('reconstructed_featural_threshold', 0.0), ('rec

In [4]:
# for those in better, we check if they are significantly better.
# for those in worse, if they are significantly worse
tasks = ["ner", "pos", "copa", "qa"]
for task in tasks:
    for key, diff in better[task]:
        print(f"{task} {key} outperforming {key}_convergence")
        t_stat, p_val = get_significance(task, key, key + "_convergence", alternative="less")
        if p_val < 0.05:
            print("The difference IS statistically significant")
        else:
            print("The difference is not statistically significant")
        print("\n")
    for key, diff in worse[task]:
        print(f"{task} {key} UNDERPERFORMING {key}_convergence")
        t_stat, p_val = get_significance(task, key, key + "_convergence", alternative="greater")
        if p_val < 0.05:
            print("The difference IS statistically significant")
        else:
            print("The difference is not statistically significant")
        print("\n")
    print("\n~\n")

ner reconstructed_featural_base outperforming reconstructed_featural_base_convergence
average scores
reconstructed_featural_base: 0.5041674016886359
reconstructed_featural_base_convergence: 0.5041674016886359
t-statistic: nan
p-value: nan
The difference is not statistically significant


ner reconstructed_syntactic_distVar outperforming reconstructed_syntactic_distVar_convergence
average scores
reconstructed_syntactic_distVar: 0.4972654118951448
reconstructed_syntactic_distVar_convergence: 0.4972654118951448
t-statistic: nan
p-value: nan
The difference is not statistically significant


ner reconstructed_morphological_distVar outperforming reconstructed_morphological_distVar_convergence
average scores
reconstructed_morphological_distVar: 0.49784227832341155
reconstructed_morphological_distVar_convergence: 0.49784227832341155
t-statistic: nan
p-value: nan
The difference is not statistically significant


ner reconstructed_featural_eu outperforming reconstructed_featural_eu_convergence
a

  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero


In [5]:
# check if "convergence" outperforms relative non-convergence scores
tasks = ["ner", "pos", "copa", "qa"]
test_columns = [
    "reconstructed_featural",
    "baseline_en",
    "baseline_avg_adapter",
    "baseline_task_adapter",
    "baseline_closest_featural",
    "no_train_gain",
    "finetune",
]
significant_more = {}
significant_less = {}
for task in tasks:
    for test_column in test_columns:
        print(f"{task} outperforming {test_column}")
        t_stat, p_val = get_significance(task, test_column, test_column + "_convergence", alternative="less")
        if p_val < 0.05:
            print("The difference is statistically significant")
            if test_column not in significant_more:
                significant_more[test_column] = {}
            significant_more[test_column][task] = (
                averages[task][test_column],
                averages[task][test_column + "_convergence"],
            )
        else:
            print("The difference is not statistically significant")
        print("\n")
        print(f"{test_column} outperforming {task}")
        t_stat, p_val = get_significance(task, test_column, test_column + "_convergence", alternative="greater")
        if p_val < 0.05:
            print("The difference is statistically significant")
            if test_column not in significant_less:
                significant_less[test_column] = {}
            significant_less[test_column][task] = (
                averages[task][test_column],
                averages[task][test_column + "_convergence"],
            )
        else:
            print("The difference is not statistically significant")
        print("-----------------------------------")

ner outperforming reconstructed_featural
average scores
reconstructed_featural: 0.5013399275268126
reconstructed_featural_convergence: 0.5013399275268126
t-statistic: nan
p-value: nan
The difference is not statistically significant


reconstructed_featural outperforming ner
average scores
reconstructed_featural: 0.5013399275268126
reconstructed_featural_convergence: 0.5013399275268126
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
ner outperforming baseline_en
average scores
baseline_en: 0.45986180313931335
baseline_en_convergence: 0.45986180313931335
t-statistic: nan
p-value: nan
The difference is not statistically significant


baseline_en outperforming ner
average scores
baseline_en: 0.45986180313931335
baseline_en_convergence: 0.45986180313931335
t-statistic: nan
p-value: nan
The difference is not statistically significant
-----------------------------------
ner outperforming baseline_avg_adapter
average scores
base

# Results
NER
	• Approximation & Baselines (ADAPTER)
		○ Better in all cases, Statistically significant
			§ all over the line (1 exception out of 16)
	• Finetune
		○ Worse, statistically significant but small difference
POS
	• Approximation
		○ No significantly worse for convergence, very small differences
	• Baselines
		○ Better
			§ Significant
				□ En adapter, MAD-X (closest featural) setting
			§ Not significant: no-train-gain
	• Finetune
		○ Better, statistically significant
COPA
	• Significantly worse all over the line!
	• --> Reason: very small dataset, 
		○ if need for eval (convergence stopping criterium), then performance further decreases
QA
	• Significantly better all over the line!
	• Training cost

## Conclusion: fair comparison
NER: take convergence scores to maintain comparibility in the setting (only small difference for finetune)
POS: convergence for both adapter and finetune, our method underperforms but not significantly
COPA: Take non-convergence all over the line
QA: convergence all over the line

In [6]:
# We make a new directory under eval_scores with the results that we will use for further eval
if not os.path.exists("../eval_scores/selected"):
    os.makedirs("../eval_scores/selected")
task2selection = {"ner": "convergence", "pos": "convergence", "copa": "no_convergence", "qa": "convergence"}
scores_selected = {}
for task_name in task2selection:
    scores_selected[task_name] = {}
    for lang_name in scores[task_name]:
        if lang_name not in scores_selected[task_name]:
            scores_selected[task_name][lang_name] = {}
        if task2selection[task_name] == "convergence":
            for key in scores[task_name][lang_name]:
                if key.endswith("convergence"):
                    new_key = key.split("_convergence")[0]
                    if not task_name == "qa":
                        scores_selected[task_name][lang_name][new_key] = scores[task_name][lang_name][key]
                    else:
                        # we divide the scores of qa by 100
                        scores_selected[task_name][lang_name][new_key] = scores[task_name][lang_name][key] / 100
        else:  # no convergence: copa
            for key in scores[task_name][lang_name]:
                if not key.endswith("convergence"):
                    scores_selected[task_name][lang_name][key] = scores[task_name][lang_name][key]
    with open(f"../eval_scores/selected/{task_name}.json", "w") as f:
        json.dump(scores_selected[task_name], f, indent=4)

In [7]:
# we make a similar file for the sib scores, added at a later stage
sib_scores = {}
for file in os.listdir("../eval_scores/approximation"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/approximation", file), "r") as f:
                data = json.load(f)
                file_name = file.split(".")[0]
                els = file_name.split("_")
                # print(els)

                if len(els) == 3:
                    continue
                elif len(els) == 4:
                    lang_name, task_name, script, option = els
                elif len(els) == 5:
                    lang_name, task_name, script, option, num = els

                if not task_name == "sib":
                    continue
                if lang_name not in sib_scores.keys():
                    sib_scores[lang_name] = {}

                # we add the data to the scores dictionary:
                # if option == extended, we add everything
                # otherwise, we append the "option" to the keys and add those

                for key in data.keys():
                    if not option == "extended":
                        new_key = f"{key}_{script}_{option}"
                    else:
                        new_key = f"{key}_{script}"
                    # print(key, new_key)
                    sib_scores[lang_name][new_key] = data[key][f1[task_name]]

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

In [8]:
# Adding finetune baseline
for file in os.listdir("../eval_scores/ft"):
    if file.endswith(".json"):
        try:
            with open(os.path.join("../eval_scores/ft", file), "r") as f:
                data = json.load(f)
                file_name = file.split(".")[0]
                els = file_name.split("_")
                if len(els) == 3:
                    lang_name, task_name, option = els
                elif len(els) == 4:
                    lang_name, task_name, script, option = els
                if not task_name == "sib":
                    continue
                if option == "eval":
                    option = "finetune" + "_" + script
                else:
                    option = "finetune_" + option + "_" + script
                if lang_name not in sib_scores:
                    sib_scores[lang_name] = {}

                score = data["finetuned_model"][f1[task_name]]
                sib_scores[lang_name][option] = score

        except json.JSONDecodeError:
            print(f"Error decoding JSON for file: {file}")
        except KeyError:
            print("KeyError:", file)

In [16]:
import re

one_per_lang_sib = {}
all_sib_scores = {}
lang2script = {}
# if there are multiple scripts for a language, we simply average over them. if not, we take the value
for lang, keys in sib_scores.items():
    one_per_lang_sib[lang] = {}
    all_sib_scores[lang] = {}
    scripts = [key.split("_")[1] for key in keys if key.startswith("finetune")]
    lang2script[lang] = scripts
    if len(scripts) == 1:
        # we remove all f"_{script}" from the keys and put them in one_per_lang_sib[lang]
        script = scripts[0]
        pattern = f"_{script}"
        lang2script[lang] = scripts

        for key, value in keys.items():
            new_key = re.sub(pattern, "", key)
            # print(f"{key} -> {new_key}")
            one_per_lang_sib[lang][new_key] = value
    else:
        pattern = f"_({'|'.join(scripts)})"
        for key, value in keys.items():
            new_key = re.sub(pattern, "", key)
            if new_key not in one_per_lang_sib[lang]:
                one_per_lang_sib[lang][new_key] = [value, 1]
            else:
                one_per_lang_sib[lang][new_key][0] += value
                one_per_lang_sib[lang][new_key][1] += 1
        # we average the values
        for key in one_per_lang_sib[lang]:
            if type(one_per_lang_sib[lang][key]) is list:
                one_per_lang_sib[lang][key] = one_per_lang_sib[lang][key][0] / one_per_lang_sib[lang][key][1]
            else:
                one_per_lang_sib[lang][key] = one_per_lang_sib[lang][key]
    for script in scripts:
        all_sib_scores[lang][script] = {}
        pattern = f"_{script}"
        for key, value in keys.items():
            # we do not consider those that do not have the pattern
            if pattern not in key:
                continue
            new_key = re.sub(pattern, "", key)
            all_sib_scores[lang][script][new_key] = value

# we save the one_per_lang_sib to a json file
with open("../eval_scores/selected/sib.json", "w") as f:
    json.dump(one_per_lang_sib, f, indent=4)
with open("../eval_scores/all_sib/sib.json", "w") as f:
    json.dump(all_sib_scores, f, indent=4)

In [17]:
all_sib_scores

{'ace': {'Arab': {'reconstructed_featural_base': 0.16176470588235295,
   'reconstructed_syntactic_distVar': 0.1715686274509804,
   'reconstructed_morphological_distVar': 0.1715686274509804,
   'reconstructed_featural_eu': 0.1715686274509804,
   'reconstructed_featural': 0.1715686274509804,
   'baseline_en': 0.11274509803921569,
   'baseline_task_adapter': 0.17647058823529413,
   'baseline_avg_adapter': 0.10294117647058823,
   'baseline_closest_featural': 0.12254901960784313,
   'no_train_gain': 0.14215686274509803,
   'reconstructed_featural_limit': 0.14705882352941177,
   'reconstructed_morphological_limit': 0.14215686274509803,
   'reconstructed_syntactic_limit': 0.16176470588235295,
   'reconstructed_featural_sr': 0.1715686274509804,
   'reconstructed_featural_threshold': 0.16666666666666666,
   'reconstructed_morphological_threshold': 0.15196078431372548,
   'reconstructed_syntactic_threshold': 0.16666666666666666,
   'finetune': 0.18137254901960784},
  'Latn': {'reconstructed_feat

In [11]:
"_".join(["One element"])

'One element'